In [7]:
import re
import numpy as np
import pandas as pd
import ftfy
import pycountry
import sqlite3

# Modeling: Part 1
from textblob import TextBlob
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
# Modeling: Part 2
import gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud


# Visualization
import matplotlib.pyplot as plt
import seaborn
con = sqlite3.connect('database.sqlite')

emails = pd.read_sql_query("""
SELECT p.Name Sender,
       e.SenderPersonId Id_sender, e.MetadataTo, e.ExtractedBodyText text,
       a.PersonId
FROM Emails e
INNER JOIN Persons p ON e.SenderPersonId=p.Id 
LEFT OUTER JOIN Aliases a ON lower(e.MetadataTo)=a.Alias
where strftime('%Y', e.MetadataDateSent) = '2012'
""", con)

persons = pd.read_sql_query("""
SELECT Id, Name
FROM Persons 
""", con)
personsDict = {}
for i in persons.values:
    personsDict[i[0]] = i[1]
    
    
def computeSender(item):
    # Sender is Hillary Clinton
    if item.Id_sender == 80 and item.MetadataTo != '' and np.isnan(item.PersonId):
        tab = item.MetadataTo.split(',')
        name = tab[1].strip() + ' ' + tab[0].strip() 
        tmp = pd.read_sql_query("SELECT Id, Name FROM Persons WHERE Name='"+ name +"'", con)
        # A person was found
        if not tmp.empty:
            item.PersonId = tmp['Id'][0]
    # Create the new Contact column
    if item.Id_sender == 80:
        item['Id_Contact'] = item.PersonId
    else:
        item['Id_Contact'] = item.Id_sender
    return item
print("Number of emails before cleaning : ",emails.shape[0])

data = emails.apply(computeSender, axis=1);

# Remove the not found persons
data = data[(~np.isnan(data.PersonId)) | (data.Id_sender != 80)]
data = data[data.Id_Contact != 80]
data['Id_Contact'] = data['Id_Contact'].apply(lambda i : personsDict[int(i)])

print("Number of emails after cleaning : ",data.shape[0])
print("Number of unique contacts : ", data['Id_Contact'].unique().shape[0])
corpusTmp = {}
corpus = {}
for i, email in enumerate(data.values):
    corpusTmp[email[5]] = corpusTmp.get(email[5], "") + email[3]
    
occ = []
for key, val in corpusTmp.items():
    if int(len(val)) > 10:
        corpus[key] = val
contacts = list(corpus.keys())


X = corpus.get('Cheryl Mills')
no_digits = []
# Iterate through the string, adding non-numbers to the no_digits list
for i in X:
    if not i.isdigit():
        no_digits.append(i)

# Now join all elements of the list with '', 
# which puts all of the characters together.
result = ''.join(no_digits)

from nltk import word_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
new_words = open("stopwords_en.txt").readlines()
new_words = [x.strip() for x in new_words]
new_stopwords = stop.union(new_words)
sentence = result
print ([i for i in sentence.lower().split() if i not in new_stopwords])
Y = sentence.lower().split()

# Adding to stop words frozenset per Sklearn docs
#my_stopwords = text.ENGLISH_STOP_WORDS.union(new_stopwords)
cv = CountVectorizer(max_features=1000, stop_words = new_stopwords, ngram_range=(1, 2), max_df=0.98)
cv_X = cv.fit_transform(Y)
# print vectorized_X.toarray()
# Saving words
word_col = cv.get_feature_names()
word_col[:10]

lda_params = {'n_topics':[1,10,20,50]}
lda = LatentDirichletAllocation()
lda_grid = GridSearchCV(lda, lda_params)
lda_grid.fit(cv_X)
lda_grid.best_estimator_
lda_model = LatentDirichletAllocation(n_topics=5, n_jobs=-1).fit(cv_X)

doctopic = lda_model.fit_transform(cv_X)
def topic_TopWords(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("\n Topic {}: \n".format(topic_idx+1))
        top_words = [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        print (' '.join(sorted(top_words,key=len,reverse=True)))
        
topic_TopWords(lda_model,word_col,10)









Number of emails before cleaning :  244
Number of emails after cleaning :  237
Number of unique contacts :  23
['thx_', '.....', 'randolph,', 'lawrence', 'sent:', 'wednesday,', 'september', ',', ':', 'to:', 'mills,', 'subject:', 're:', 'dry', 'eye', 'nea', 'including', 'mine.', 'remarks', 'moving.', 'chriswas', 'amazing', 'man.', 'huge', 'loss.', 'know,', 'libya', 'coming', 'ten', 'years,', 'worked', 'calm,', 'cool', 'headed,', 'funny', 'diplomat.', 'easy-', 'hardest', 'places', 'work', 'world.remind', 'discussembassy', 'yemenu.s.', 'no.', 'f--', 'no.', 'date:', '//', 'dept.', '-', 'house', 'benghazi', 'comm.', 'subject', 'sensitive', 'information', '&', 'redactions.', 'waiver.', 'state-cbfyifrom', 'pa', 'clips', '[mailtotaclips@state.govl', 'sent;', 'saturday,', 'october', ',', ':', 'pa-monitoring-group-di;', 'nea-press-dl;', 'ds', 'pa', 'media;', 'shore,', 'rhonda', 'h;', 'atchison,', 'mark', 'c;', 'moe,', 'grace', 't;', 'coordination;', 'mills,', 'd;', 'tow,', 'nora', 'f;', 'pg', 's





 Topic 1: 

administration secretary american subject testify patrick kennedy monday susan pat

 Topic 2: 

intelligence spontaneous washington attack peress house comm date part gov

 Topic 3: 

information redactions sensitive benghazi november diplomat matters waiver mccain white

 Topic 4: 

ambassador facilities thursday release reports daily shows john back work

 Topic 5: 

wednesday december foreign january policy libya rice time sen al
