# Homework 5 - Part 3

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from wordcloud import WordCloud
import gensim
from gensim import corpora, models
import seaborn as sns



For the initial cleaning of the data, we run the same steps as in part 1. First, we load the dataset.

In [2]:
# Read the e-mails CSV into a DataFrame; the path is relative to the notebook's working directory.
df_Emails = pd.read_csv('hillary-clinton-emails/Emails.csv')

Then we apply the same cleaning function as in part 1 to remove unwanted parts of the text.

In [3]:
def cleanMail(mail):
    """Remove boilerplate/header lines from the raw text of one e-mail.

    The text is split on newlines; every line that starts with one of the
    known boilerplate prefixes (classification banner, case/doc numbers,
    From/To/Sent headers, ...) is dropped. The remaining lines are each
    prefixed with a single space and concatenated.

    Parameters
    ----------
    mail : str
        Raw text of the e-mail. Non-string values (e.g. ``None`` or the NaN
        that pandas uses for empty cells) are treated as an empty mail.

    Returns
    -------
    str
        The kept lines joined together, each preceded by one space.
        ``''`` is returned for non-string input.
    """
    # Prefix matching is case-sensitive and only looks at the start of a line.
    unwanted_prefixes = (
        'UNCLASSIFIED', 'U.S. Department of State', 'Case No.',
        'Doc No.', 'Date: ',
        'STATE DEPT. ', 'SUBJECT TO ', 'RELEASE ', 'PART',
        'From:', 'To:', 'Sent:',
    )

    # Missing cells are not strings and have no .split(); nothing to keep.
    if not isinstance(mail, str):
        return ''

    # str.startswith accepts a tuple, so one call tests all prefixes.
    return ''.join(
        ' ' + line
        for line in mail.split('\n')
        if not line.startswith(unwanted_prefixes)
    )

In [4]:
# Run cleanMail over every entry of the RawText column and store the result back in that column.
df_Emails['RawText'] = df_Emails['RawText'].apply(cleanMail)

We load the English stop words and extend the list with additional tokens that are relevant for this exercise.

In [5]:
stop = set(stopwords.words('english')) # NLTK's English stop-word list, stored as a set

In [6]:
# Extend the stop set in place with extra tokens to ignore.
stop |= {
    # punctuation and symbols
    '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',
    '@', '<', '>', '-', '--', '—', '&', '%', '*', '•', '#',
    # header words and abbreviations
    'subject', 'fw', 'cc', 'am', 'pm', 'date', 'http', 'u.s.', 'u.s',
    # quote marks and contraction fragments
    '``', "''", "n't", "'s",
}

Then, from the raw data, we create a new text: each mail is split into a list of lowercase words, and the words that appear in the stop words list are removed.

In [7]:
# One list of lowercased tokens per mail in RawText, keeping only tokens
# whose lowercase form is not in the stop set.
new_text = [
    [token.lower() for token in word_tokenize(mail) if token.lower() not in stop]
    for mail in df_Emails.RawText
]

From this new text, we define the corpus and the dictionary that will be used to create the LDA model.

In [8]:
# Build the gensim dictionary from the token lists, then convert every
# token list to its bag-of-words form with that dictionary.
dictionary = corpora.Dictionary(new_text)
corpus = list(map(dictionary.doc2bow, new_text))

Finally, we create the models for 5, 10, 25 and 50 topics and we print the 20 most relevant words for each topic.

In [9]:
# 5 Topics
# LDA training is stochastic: a fixed random_state makes the topics
# reproducible across "Restart & Run All".
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5, random_state=0)
# Print the 20 top words of every topic (show_topic yields (word, weight) pairs).
for i in range(lda.num_topics):
    topics = [word for word, _weight in lda.show_topic(i, topn=20)]
    print('Topic {} : {}'.format(i, topics))

Topic 0 : ['message', 'original', 'said', 'state', 'would', '1.4', 'unclassified', 'call', 'minister', 'president', 'government', 'secretary', 'deal', 'also', 'meeting', 'department', 'security', 'doc', 'dup', 'f-2014-20439']
Topic 1 : ['state', 'secretary', 'one', 'office', 'department', 'president', 'obama', 'would', 'new', 'us', 'case', 'time', 'doc', 'clinton', 'unclassified', 'foreign', 'afghanistan', 'said', 'house', 'f-2014-20439']
Topic 2 : ['said', 'state', 'would', 'government', 'israel', 'new', 'department', 'obama', 'president', 'haiti', 'american', 'people', 'united', 'states', 'case', 'also', 'us', 'one', 'unclassified', 'peace']
Topic 3 : ['would', 'state', '1', '2', 'department', 'case', 'unclassified', 'f-2014-20439', 'vote', 'new', '2010', '3', 'message', '4', 'obama', 'may', 'government', 'said', 'president', 'senate']
Topic 4 : ['message', 'original', 'call', 'state', 'department', 'case', 'b6', 'unclassified', 'doc', 'f-2014-20439', 'know', 'would', '08/31/2015', '

In [10]:
# 10 Topics
# LDA training is stochastic: a fixed random_state makes the topics
# reproducible across "Restart & Run All".
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, random_state=0)
# Print the 20 top words of every topic (show_topic yields (word, weight) pairs).
for i in range(lda.num_topics):
    topics = [word for word, _weight in lda.show_topic(i, topn=20)]
    print('Topic {} : {}'.format(i, topics))

Topic 0 : ['said', 'reuters', 'president', 'clinton', 'would', 'us', 'case', 'people', 'new', 'message', 'rights', 'american', '...', 'state', 'also', 'ap', 'unclassified', 'one', 'department', 'freedom']
Topic 1 : ['secretary', 'office', 'state', 'meeting', 'department', 'room', 'arrive', 'route', 'depart', 'private', 'time', 'conference', 'residence', 'minister', 'daily', 'staff', 'airport', 'unclassified', 'en', 'house']
Topic 2 : ['message', 'original', 'call', 'b6', 'tomorrow', 'huma', 'state.gov', 'abedin', 'today', 'get', 'w', 'know', 'would', 'h', 'also', 'talk', 'morning', 'time', 'let', 'update']
Topic 3 : ['obama', 'said', 'state', 'clinton', 'would', 'israel', 'president', 'new', 'one', 'department', 'israeli', '08/31/2015', 'american', 'house', 'doc', 'also', 'us', 'f-2014-20439', 'administration', 'case']
Topic 4 : ['1.4', 'b', 'call', 'message', 'state', 'b1', 'original', '1', 'department', 'haiti', 'case', '2', 'unclassified', 'f-2014-20439', 'b6', '3', 'email', 'doc', 

In [11]:
# 25 Topics
# LDA training is stochastic: a fixed random_state makes the topics
# reproducible across "Restart & Run All".
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=25, random_state=0)
# Print the 20 top words of every topic (show_topic yields (word, weight) pairs).
for i in range(lda.num_topics):
    topics = [word for word, _weight in lda.show_topic(i, topn=20)]
    print('Topic {} : {}'.format(i, topics))

Topic 0 : ['boehner', 'new', 'would', 'house', 'party', 'one', 'doc', 'time', 'state', 'case', 'unclassified', 'israel', 'health', 'care', '...', 'f-2014-20439', 'government', '08/31/2015', 'year', 'many']
Topic 1 : ['said', 'haiti', 'reuters', 'children', 'government', 'would', 'new', 'people', 'state', 'could', 'case', 'also', 'american', 'haitian', '08/31/2015', 'trafficking', 'bellerive', 'euro', 'country', 'last']
Topic 2 : ['turkish', 'state', 'unclassified', 'message', 'ecumenical', 'said', 'inspirational', 'irish', 'department', 'turkey', 'received', 'case', 'president', 'government', 'one', 'e-mail', 'time', 'daughter', 'wilson', 'patriarch']
Topic 3 : ['call', 'message', 'original', 'sullivan', 'j', 'jacob', 'sheet', 'b6', 'text', 'email', 'know', 'w', 'need', 'state', 'davutoglu', 'let', 'b5', 'e', 'b', 'unclassified']
Topic 4 : ['afghanistan', 'would', 'said', 'mr', 'obama', 'brown', 'afghan', 'party', 'election', 'government', 'one', 'new', 'us', 'labour', 'cameron', 'tori

In [12]:
# 50 Topics
# LDA training is stochastic: a fixed random_state makes the topics
# reproducible across "Restart & Run All".
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=50, random_state=0)
# Print the 20 top words of every topic (show_topic yields (word, weight) pairs).
for i in range(lda.num_topics):
    topics = [word for word, _weight in lda.show_topic(i, topn=20)]
    print('Topic {} : {}'.format(i, topics))

Topic 0 : ['la', '08/31/2015', 'michael', 'que', 'tan', 'kitty', 'dimartino', 'time', 'reads', 'state', 'b6', 'judith', 'craig', 'mchale', 'department', 'hv', 'mumbai', 'mellott', 'case', 'onward']
Topic 1 : ['said', 'northern', 'ireland', 'police', 'one', 'killed', 'ses-o_shift-iii', 'policing', 'death', 'church', 'two', 'irish', 'richardson', 'city', 'secretary', 'state', 'department', 'officials', 'also', 'statement']
Topic 2 : ['state', 'sudan', 'beck', 'video', 'department', 'new', 'states', 'us', 'government', 'united', 'group', 'case', '2', 'facebook', 'tea', 'would', 'foreign', 'beside', '1', 'also']
Topic 3 : ['government', 'said', 'people', 'bangladesh', 'euro', 'state', 'minister', 'case', 'greece', 'department', 'prime', 'would', '...', 'also', 'doc', 'message', 'unclassified', 'f-2014-20439', 'january', 'adoption']
Topic 4 : ['b6', 'b5', 'h', 'cheryl', 'state', '2010', 'unclassified', 'mills', 'joanne', '»', 'state.gov', 'sent', 'fyi', 'daniel', 'message', 'would', 'fa', '

From these results, the topics appear to be most meaningful when the model is trained with 25 topics.