In [1]:
import numpy as np
import pandas as pd
import re
df = pd.read_csv('HillaryEmails.csv')
df = df[['Id','ExtractedBodyText']].dropna()

In [2]:
def clean_email_text(text):
    text = text.replace('\n'," ") 
    text = re.sub(r"-", " ", text)
    text = re.sub(r"\d+/\d+/\d+", "", text) 
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text) 
    text = re.sub(r"[\w]+@[\.\w]+", "", text) 
    text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text) 
    pure_text = ''
    
    for letter in text:
     
        if letter.isalpha() or letter==' ':
            pure_text += letter
  
    text = ' '.join(word for word in pure_text.split() if len(word)>1)
    return text

In [3]:
docs = df['ExtractedBodyText']
docs = docs.apply(lambda s: clean_email_text(s))  
docs.head(1).values

array(['Thursday March PM Latest How Syria is aiding Qaddafi and more Sid hrc memo syria aiding libya docx hrc memo syria aiding libya docx March For Hillary'],
      dtype=object)

In [4]:
doclist = docs.values
doclist

array(['Thursday March PM Latest How Syria is aiding Qaddafi and more Sid hrc memo syria aiding libya docx hrc memo syria aiding libya docx March For Hillary',
       'Thx',
       'Friday March PM Huma Abedin Fw Latest How Syria is aiding Qaddafi and more Sid hrc memo syria aiding libya docx Pis print',
       ...,
       'Big change of plans in the Senate Senator Reid just announced that he was no longer going to move forward with the omnibus appropriations bill Instead he filed cloture motions on the repeal of Dont Ask Dont Tell and the DREAM Act Those petitions will ripen on Saturday So it looks like the Senate will be again considering the new START Treaty tomorrow We should know the starting time shortly',
       'PVerveer Friday December AM From Please let me know if can be of any help to your department and will happy to do and please thank Mrs Hillary Clinton on behalf of me and supporting Afghan women Thank you',
       'See below'], dtype=object)

In [5]:
from gensim import corpora, models, similarities
import gensim
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours', 
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their', 
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once', 
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you', 
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will', 
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be', 
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself', 
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both', 
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn', 
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about', 
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn', 
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']
texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in doclist]

texts[0]

['thursday',
 'march',
 'pm',
 'latest',
 'syria',
 'aiding',
 'qaddafi',
 'sid',
 'hrc',
 'memo',
 'syria',
 'aiding',
 'libya',
 'docx',
 'hrc',
 'memo',
 'syria',
 'aiding',
 'libya',
 'docx',
 'march',
 'hillary']

In [6]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [7]:
corpus[13]

[(51, 1), (505, 1), (506, 1), (507, 1), (508, 1)]

In [19]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)
#第5号分类，其中最常出现的单词是：
lda.print_topic(5, topn=5)

'0.012*"call" + 0.009*"senate" + 0.007*"pm" + 0.006*"time" + 0.006*"pis"'

In [20]:
#Print all topics and return to the topic's high-frequency vocabulary
lda.print_topics(num_words=5)

[(0,
  '0.031*"ok" + 0.014*"know" + 0.012*"thx" + 0.007*"let" + 0.006*"germany"'),
 (1,
  '0.020*"call" + 0.013*"see" + 0.011*"release" + 0.011*"pm" + 0.010*"part"'),
 (2,
  '0.009*"us" + 0.007*"one" + 0.006*"state" + 0.006*"american" + 0.006*"would"'),
 (3,
  '0.007*"pls" + 0.006*"get" + 0.005*"also" + 0.005*"would" + 0.005*"work"'),
 (4,
  '0.072*"pm" + 0.035*"office" + 0.030*"secretarys" + 0.023*"fyi" + 0.020*"meeting"'),
 (5,
  '0.012*"call" + 0.009*"senate" + 0.007*"pm" + 0.006*"time" + 0.006*"pis"'),
 (6, '0.008*"us" + 0.007*"new" + 0.005*"said" + 0.005*"un" + 0.005*"people"'),
 (7, '0.007*"mr" + 0.006*"new" + 0.006*"would" + 0.004*"one" + 0.004*"time"'),
 (8,
  '0.012*"pm" + 0.008*"party" + 0.006*"us" + 0.005*"would" + 0.005*"sunday"'),
 (9,
  '0.010*"would" + 0.009*"obama" + 0.008*"percent" + 0.007*"president" + 0.006*"said"')]