In [6]:
import re
import gensim
from gensim import corpora, models, similarities

%matplotlib inline
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
import seaborn as sns  
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_pickle("data/hilary_sent.pkl")
df_cut = df[['Id','ExtractedBodyText']].dropna()

# Text Pre-processing 

In [9]:
def clean_email_text(text):
    text = text.replace('\n'," ") # Take out the paragraph space
    text = re.sub(r"-", " ", text) # Split the words with "-" (for example：pre-processing ==> pre processing）
    text = re.sub(r"\d+/\d+/\d+", "", text) # Take out the dates
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text) # Take out the time
    text = re.sub(r"[\w]+@[\.\w]+", "", text) # Take out the emails
    text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text) # Take out the websites
    pure_text = ''
    # Validate to check if there are any non-text content 
    for letter in text:
        # Keep only letters and spaces
        if letter.isalpha() or letter==' ':
            pure_text += letter
    # Join the words are not stand-alone letters
    text = ' '.join(word for word in pure_text.split() if len(word)>1)
    return text

In [13]:
docs = df_cut['ExtractedBodyText']
docs = docs.apply(lambda x: clean_email_text(x))

In [14]:
docs

4       Friday March PM Huma Abedin Fw Latest How Syri...
5       Pis print Wednesday September PM Fw Meet The R...
7       Friday March PM Huma Abedin Fw Latest How Syri...
20      Wednesday September PM Fw Fwd more on libya Li...
21      Pis print Wednesday September PM Fw Fwd more o...
                              ...                        
7824                                            Pls print
7825                                     Could we move to
7826    Did you andor PIR talk to him today Do need to...
7827    Pls prepare condolence letter to Justice RBG f...
7828                Wednesday June PM Fw confidential Fyi
Name: ExtractedBodyText, Length: 1882, dtype: object

In [15]:
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours', 
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their', 
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once', 
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you', 
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will', 
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be', 
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself', 
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both', 
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn', 
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about', 
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn', 
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

In [85]:
stop_words = ENGLISH_STOP_WORDS.union(['docx','fyi','fw','get','see','ok','pm','whose','would','pls','thx','yes','print','okay','pis'])

In [69]:
doclist = docs.values

texts = [[word for word in doc.lower().split() if word not in stop_words] for doc in doclist]

# LDA Model

In [70]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [71]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

In [72]:
lda.print_topics(num_topics=20, num_words=5)

[(0,
  '0.009*"want" + 0.008*"friday" + 0.008*"need" + 0.007*"memo" + 0.007*"sid"'),
 (1,
  '0.012*"pis" + 0.009*"im" + 0.007*"huma" + 0.007*"know" + 0.007*"tonight"'),
 (2,
  '0.010*"state" + 0.010*"like" + 0.008*"time" + 0.007*"want" + 0.007*"add"'),
 (3,
  '0.015*"im" + 0.013*"did" + 0.011*"discuss" + 0.009*"ops" + 0.009*"schedule"'),
 (4,
  '0.014*"press" + 0.013*"doc" + 0.013*"talk" + 0.012*"dialogue" + 0.012*"clips"'),
 (5, '0.013*"know" + 0.009*"week" + 0.008*"lets" + 0.006*"id" + 0.005*"like"'),
 (6,
  '0.025*"pis" + 0.014*"release" + 0.012*"tomorrow" + 0.009*"know" + 0.009*"thought"'),
 (7,
  '0.014*"pis" + 0.012*"state" + 0.011*"list" + 0.009*"tomorrow" + 0.009*"did"'),
 (8,
  '0.012*"tomorrow" + 0.011*"great" + 0.009*"work" + 0.008*"good" + 0.007*"sounds"'),
 (9, '0.011*"know" + 0.009*"good" + 0.009*"im" + 0.008*"lets" + 0.008*"talk"'),
 (10,
  '0.016*"pis" + 0.009*"send" + 0.007*"email" + 0.006*"want" + 0.006*"speech"'),
 (11,
  '0.018*"know" + 0.012*"im" + 0.009*"time" + 0

# Word2Vec

In [73]:
model = gensim.models.Word2Vec(texts, size=100, window=5, min_count=1, workers=2, sg=1)

In [74]:
list(model.wv.vocab.items())

[('friday', <gensim.models.keyedvectors.Vocab at 0x7f93ca283af0>),
 ('march', <gensim.models.keyedvectors.Vocab at 0x7f93cf9010a0>),
 ('huma', <gensim.models.keyedvectors.Vocab at 0x7f93cc345f40>),
 ('abedin', <gensim.models.keyedvectors.Vocab at 0x7f93cc3455e0>),
 ('latest', <gensim.models.keyedvectors.Vocab at 0x7f93cc345fa0>),
 ('syria', <gensim.models.keyedvectors.Vocab at 0x7f93cd9d8f70>),
 ('aiding', <gensim.models.keyedvectors.Vocab at 0x7f93cd9d8fd0>),
 ('qaddafi', <gensim.models.keyedvectors.Vocab at 0x7f93cd9d8e20>),
 ('sid', <gensim.models.keyedvectors.Vocab at 0x7f93cd9d8fa0>),
 ('hrc', <gensim.models.keyedvectors.Vocab at 0x7f93cd9d8d60>),
 ('memo', <gensim.models.keyedvectors.Vocab at 0x7f93cb490550>),
 ('libya', <gensim.models.keyedvectors.Vocab at 0x7f93cb490940>),
 ('pis', <gensim.models.keyedvectors.Vocab at 0x7f93cb490760>),
 ('wednesday', <gensim.models.keyedvectors.Vocab at 0x7f93cb490700>),
 ('september', <gensim.models.keyedvectors.Vocab at 0x7f93cb490580>),
 ('m

In [29]:
print(model.wv['benghazi'])

[-1.3744071e-01  2.1587807e-01  2.0902865e-01  1.4564884e-01
 -1.2782425e-01  1.9792040e-01 -1.0587117e-01  1.7459910e-01
 -5.1607843e-02  1.5084288e-01 -9.9735267e-02  2.9465685e-02
  1.7406812e-01  1.6365130e-01  5.3840049e-02  1.7888747e-01
  7.3124193e-02 -7.7356085e-02  6.5710761e-02  6.8378076e-02
 -1.0586783e-01  1.3181430e-01  1.2418494e-01  2.6966127e-02
  4.0494174e-02 -3.7634242e-02  5.3996742e-02  1.8736485e-01
 -1.8742085e-01  1.6028783e-01 -2.5258368e-01  2.3633880e-03
  2.3891433e-01 -2.0739751e-02  1.3992932e-01 -1.2231517e-01
  1.4331503e-01 -2.0111375e-01 -2.2342625e-01  2.2868878e-01
  3.3411391e-02  1.0723253e-01  3.0729417e-02  1.6988139e-01
  1.8455836e-01  4.3175962e-02 -1.4618760e-01  4.0477823e-04
 -2.7955461e-01  4.6051618e-02 -6.3923299e-03 -2.4955556e-01
  1.6992606e-01 -1.8006587e-02  1.7565595e-01 -1.6663048e-02
 -1.2433256e-01  4.5467220e-02 -2.1130955e-01  4.0590316e-02
 -2.6200250e-01 -6.8894781e-02 -1.0786487e-02  1.2509571e-01
  9.4524845e-02  8.97383

In [65]:
model.wv.most_similar('benghazi' ,topn=10)

[('comm', 0.9994080662727356),
 ('select', 0.9991276264190674),
 ('agreement', 0.9990670680999756),
 ('subject', 0.9989519715309143),
 ('house', 0.9988731145858765),
 ('sensitive', 0.9986786842346191),
 ('information', 0.998456597328186),
 ('foia', 0.9982517957687378),
 ('redactions', 0.998194694519043),
 ('waiver', 0.9980608224868774)]

In [44]:
model.wv.similarity('ive','us')

0.9994151

# Top

In [93]:
doclist[15]

'Pis print Friday September PM Fw CNN Belief Blog Prothero'

# Multiprocessing

In [58]:
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from nltk import bigrams
from collections import defaultdict

In [46]:
cores = multiprocessing.cpu_count()

In [50]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [53]:
sent = [row.split() for row in docs]

In [55]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [62]:
sentences = bigrams[sent]

NameError: name 'nltk' is not defined

In [51]:
w2v_model.build_vocab(sentences, progress_per=10000)

NameError: name 'sentences' is not defined

In [59]:
phrases

<gensim.models.phrases.Phrases at 0x7f93ce21beb0>