In [None]:
import pandas as pd
import seaborn as sns
import matplotlib
import string

from os import path
from wordcloud import WordCloud

from nltk.stem import WordNetLemmatizer

%matplotlib inline

# Part 0: Loading and cleaning the data

**First we need to load the emails into a dataframe.**

In [None]:
# Location of the e-mail dump: <emails_dir>/<emails_csv_filename>
emails_dir = "hillary-clinton-emails"
emails_csv_filename = "Emails.csv"
emails_path = path.join(emails_dir, emails_csv_filename)

# Load every e-mail record into a single dataframe
emails_df = pd.read_csv(emails_path)

# Show five randomly chosen rows as a quick sanity check
emails_df.sample(5)

**We want to create our wordcloud from the column(s) that contain(s) the full text of the emails**

In [None]:
# Which columns of the dataframe contain the text we want for our wordcloud?
# Printed explicitly: a notebook cell only renders its last expression, so a bare
# `emails_df.columns` in the middle of the cell would show nothing.
print(emails_df.columns.tolist())

# Candidate text columns:
# - ExtractedSubject
# - ExtractedBodyText
# - RawText

# TODO SV check these columns to see if they're actually filled in and make sense
# Number of missing values per column (displayed as the cell output)
emails_df.isnull().sum()

**TODO I decided to use the RawText for the wordcloud and might use ExtractedSubject and ExtractedBodyText in later steps.**

# Part 1: Generating wordclouds!

In [None]:
import matplotlib.pyplot as plt

# Concatenate all raw text into one document. Missing values are dropped and the
# remaining entries coerced to str, so str.join cannot fail on a NaN entry.
concatenated_raw_text = ' '.join(emails_df['RawText'].dropna().astype(str))

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(len(concatenated_raw_text))

# Generate a word cloud image
wordcloud = WordCloud().generate(concatenated_raw_text)

# Display the generated image the matplotlib way; the trailing `;` hides the
# return value of plt.axis so only the figure is shown.
plt.imshow(wordcloud)
plt.axis("off");

**The wordcloud library helpfully removed common English stopwords, but we still have some boring terms like 'date' and 'subject' in our cloud.**

**We can improve the results by doing some pre-processing:**
- tokenization
- stopword removal
- lemmatization
- stemming
- remove punctuation & single char tokens

In [None]:
import nltk
from nltk.corpus import stopwords
#NOTE to team you should run nltk.download() but maybe outside the notebook

In [None]:
#Takes a few seconds to run:
# Split the concatenated raw e-mail text into a flat list of tokens.
tokenized = nltk.word_tokenize(concatenated_raw_text)

In [None]:
# Peek at the first ten tokens
tokenized[:10]

**Let's remove some more stopwords**

In [None]:
# E-mail specific noise words (header fields, reply/forward markers, titles)
additional_stopwords = ['subject', 'date', 're', 'cc', 'bcc', 'fwd', 'fw', 'sent', 'mr', 'mrs']

# Final stopword list: NLTK's English stopwords followed by the e-mail specific ones
email_stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

In [None]:
# Build the lookup set once: testing membership against the stopword *list* is a
# linear scan per token, which adds up over the whole corpus.
_stopword_set = set(email_stopwords)

# Lower-case each token exactly once, then drop the stopwords.
tokens_without_stopwords = [t for t in map(str.lower, tokenized) if t not in _stopword_set]

**Lemmatization - we use the WordNetLemmatizer to group inflected forms of a word**

In [None]:
# One shared lemmatizer instance; process_text() further down reuses it.
lemmatizer = WordNetLemmatizer()

# Lemmatize every remaining token, preserving order
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens_without_stopwords))

**Stemming - we use the Porter stemmer to normalize our text**

In [None]:
# Reduce every lemmatized token to its Porter stem, preserving order
porter = nltk.PorterStemmer()
processed_tokens = list(map(porter.stem, lemmatized_tokens))

**Remove punctuation & single-character tokens**

In [None]:
def remove_punctuation(token):
    """Return ``token`` with every ``string.punctuation`` character deleted."""
    # A deletion-only translation table strips all punctuation in a single pass.
    return token.translate(str.maketrans("", "", string.punctuation))

In [None]:
# Strip punctuation FIRST, then drop short tokens. Filtering on length before
# stripping lets tokens such as "--" (-> "") or "a." (-> "a") slip through as
# empty / single-character strings. This is the same order process_text() uses.
processed_tokens = [
    stripped
    for stripped in map(remove_punctuation, processed_tokens)
    if len(stripped) > 1
]

In [None]:
# Peek at the first ten fully processed tokens
processed_tokens[:10]

In [None]:
# All processing at once. We don't apply stemming on emails, since it would destroy most country names
def process_text(text):
    """Tokenize ``text`` and return its cleaned, lemmatized tokens.

    Steps, in order: NLTK word tokenization, lower-casing, removal of tokens found
    in ``email_stopwords``, WordNet lemmatization, punctuation stripping, and
    removal of tokens shorter than two characters. No stemming is applied.

    Args:
        text: The string to process.

    Returns:
        list: the surviving tokens, in their original order.
    """
    # Snapshot the stopwords as a set on every call: membership tests no longer scan
    # the whole list per token, and stopwords appended to ``email_stopwords`` after
    # this function was defined are still honoured.
    stopword_set = set(email_stopwords)

    tokens = nltk.word_tokenize(text)
    lowered = (t.lower() for t in tokens)
    processed = (lemmatizer.lemmatize(t) for t in lowered if t not in stopword_set)
    processed = (remove_punctuation(t) for t in processed)
    # Punctuation stripping can leave empty or one-character leftovers; drop them.
    processed = (t for t in processed if len(t) > 1)

    return list(processed)

In [None]:
# Rebuild one whitespace-separated document from the cleaned tokens
_processed_document = " ".join(processed_tokens)
wordcloud2 = WordCloud().generate(_processed_document)

# Render the cloud without axes
plt.imshow(wordcloud2)
plt.axis("off")

**The wordcloud looks quite a bit different - now we can see some interesting common words like "Cheryl" "Abedin" etc. This approach has some pros and cons:**
- (+) we were able to remove some of the obviously common words
- (-) stemming doesn't seem that useful
- (-) removing words is sort of an iterative process as each time we generate the wordcloud we see new frequent words to exclude

# 2. World country mentions

In [None]:
import pycountry

In [None]:
# Mention counter: one entry per pycountry country, all starting at zero
countries = dict.fromkeys(pycountry.countries, 0)

In [None]:
# Tokens already known not to resolve to a country; avoids repeating failed lookups
bad_country_tokens = set()

# All e-mail bodies as one document (non-string entries are stringified)
concatenated_body_text = " ".join(map(str, emails_df["ExtractedBodyText"]))

for token in process_text(concatenated_body_text):
    if token in bad_country_tokens:
        continue

    try:
        country = pycountry.countries.lookup(token)
        countries[country] += 1
    except LookupError:
        # Not a country: remember the token so it is skipped from now on
        bad_country_tokens.add(token)

In [None]:
# Re-key the counters by country name, then build a one-column frame indexed by name
_occurences_by_name = {country.name: occurence for (country, occurence) in countries.items()}
country_df = pd.DataFrame.from_dict(_occurences_by_name, orient="index", columns=["Occurences"])

In [None]:
# Ten most frequently mentioned countries
country_df.sort_values(by="Occurences", ascending=False).head(10)

In [None]:
# Show which countries the two-letter tokens "PM" and "AM" resolve to
for _code in ("PM", "AM"):
    print(pycountry.countries.lookup(_code).name)

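**"PM" resolves to Saint Pierre and Miquelon (it is that territory's two-letter country code), so "pm" tokens in the emails were counted as mentions of it. "AM" (Armenia) does not cause the same problem because "am" was already in the stop-words. We could start again with "pm" as a stopword, but since Saint Pierre and Miquelon is very unlikely to appear in any email, we simply reset its count to zero.**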

In [None]:
# "PM" resolves to Saint Pierre and Miquelon (see the lookup above), so its count
# is discarded before re-ranking. NOTE: this overwrites the row in place.
country_df.loc["Saint Pierre and Miquelon"] = 0
country_df.sort_values(by="Occurences", ascending=False).head(10)

In [None]:
# Import the analyzer explicitly: `import nltk` alone is not guaranteed to load the
# `nltk.sentiment` subpackage, in which case `nltk.sentiment.vader` raises
# AttributeError on a fresh kernel.
# NOTE(review): the analyzer presumably needs the VADER lexicon from nltk.download() - confirm.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()

In [None]:
# Sanity check of the analyzer on an obviously positive sentence
vader.polarity_scores("I love you")

In [None]:
# Sanity check of the analyzer on an obviously negative sentence
vader.polarity_scores("I hate you")

In [None]:
# Treat "pm" as a stopword from here on (it would otherwise resolve to a country).
# Guarded so that re-running this cell does not append duplicates.
if "pm" not in email_stopwords:
    email_stopwords.append("pm")

In [None]:
import os.path
import pickle

# Create cache folder
CACHE_DIR = "cache"
CACHE_PATH = os.path.join(CACHE_DIR, "country_sentiment_df.bak")

# exist_ok=True makes this a no-op when the folder already exists and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(CACHE_DIR, exist_ok=True)

In [None]:
# Rows of [country name, compound sentiment]; stays empty when the cache is used
country_sentiment_list = []

# !! If not in the cache, takes ~10 min to run
if os.path.isfile(CACHE_PATH):
    # SECURITY: pickle.load can execute arbitrary code from the file. This is only
    # acceptable because the cache is written by this notebook; never point
    # CACHE_PATH at a file from an untrusted source.
    with open(CACHE_PATH, 'rb') as handle:
        country_sentiment_df = pickle.load(handle)

else:
    for text in emails_df["ExtractedBodyText"]:
        # Stringify once so tokenization and sentiment scoring see the same text,
        # including for non-string (e.g. missing) bodies.
        text = str(text)

        for word in process_text(text):
            if word in bad_country_tokens:
                continue

            try:
                country = pycountry.countries.lookup(word)
            except LookupError:
                # Per-email tokenization can yield a token never seen while
                # bad_country_tokens was built; record it instead of crashing.
                bad_country_tokens.add(word)
                continue

            sentiment = vader.polarity_scores(text)["compound"]
            country_sentiment_list.append([country.name, sentiment])
            # NOTE(review): only the FIRST country found in an e-mail is recorded;
            # other countries mentioned in the same e-mail are ignored - confirm intended.
            break

    country_sentiment_df = pd.DataFrame(country_sentiment_list, columns=["Country", "Sentiment"])

    with open(CACHE_PATH, 'wb') as handle:
        pickle.dump(country_sentiment_df, handle)

In [None]:
sentiment_by_country = country_sentiment_df.groupby("Country")

# Per country: number of scored e-mails and their mean compound sentiment
sentiment_occurence_df = sentiment_by_country["Sentiment"].agg(["count", "mean"])
sentiment_occurence_df.columns = ["Frequency", "Sentiment"]

**First, a full & difficult to read plot of sentiment towards countries. Then, we only show countries with most positive / most negative sentiment. We can see that the emails globally have a positive sentiment.**

In [None]:
# Large figure: one bar per country, ordered from most negative to most positive
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

# One colour per country, taken from the reversed "coolwarm" palette
palette = sns.color_palette("coolwarm_r", n_colors=len(sentiment_occurence_df))
sentiment_occurence_df["Sentiment"].sort_values().plot.bar(color=palette)

In [None]:
# Smaller figure for the two top-25 plots
matplotlib.rcParams['figure.figsize'] = (8, 4)

# Most negative sentiment: the 25 lowest mean scores
_most_negative = sentiment_occurence_df["Sentiment"].sort_values().head(25)
_most_negative.plot.bar(color=palette)

In [None]:
# Most positive sentiment: the 25 highest mean scores
_most_positive = sentiment_occurence_df["Sentiment"].sort_values().tail(25)
_most_positive.plot.bar()

# 3. Topic modeling

In [None]:
import gensim.models.ldamodel as ldamodel
import gensim.corpora as corpora

In [None]:
# One token list per e-mail body (non-string bodies are stringified first)
texts = [process_text(str(body)) for body in emails_df["ExtractedBodyText"]]

# Token <-> id mapping, then the bag-of-words representation of every e-mail
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [None]:
# LDA training is stochastic; a fixed random_state makes the topics reproducible
# across Restart & Run All.
lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=10, random_state=42)

In [None]:
def pretty_print_topics(lda, num_topics=-1, num_words=8):
    """Print the top words of the topics of ``lda``, one topic per line.

    Args:
        lda: A trained model exposing ``show_topics(num_topics, num_words, formatted)``.
        num_topics: How many topics to print. The default of -1 asks the model for
            all of its topics; the previous hard-coded 20 silently showed only a
            subset of any model with more than 20 topics.
        num_words: How many top words to print per topic.
    """
    shown = lda.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)
    # With formatted=False each entry is (topic_id, [(word, weight), ...]).
    for topic in shown:
        print([word_weight[0] for word_weight in topic[1]])
        

In [None]:
# !! Takes a few minutes

# Train one model per candidate topic count and print its topics for comparison.
# A fixed random_state keeps the printed topics (and the conclusion drawn from
# them) reproducible between runs.
for topics_count in range(5, 50, 10):
    print("{} topics".format(topics_count))
    _model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=topics_count, random_state=42)
    pretty_print_topics(_model)
    print()

**It looks like 45 topics gives the most meaningful results**