In [None]:
from collections import Counter
from os import path

import pandas as pd
from wordcloud import WordCloud

%matplotlib inline

# Part 0: Loading and cleaning the data

**First we need to load the emails into a dataframe.**

In [None]:
# Locate Emails.csv inside the dataset directory and read it into pandas.
emails_dir = "hillary-clinton-emails"
emails_csv_filename = "Emails.csv"

emails_path = path.join(emails_dir, emails_csv_filename)

emails_df = pd.read_csv(emails_path)

# Take a peek at a few random rows. The fixed seed keeps the displayed sample
# identical on every "Restart & Run All" instead of changing from run to run.
emails_df.sample(5, random_state=0)

**We want to create our wordcloud from the column(s) that contain(s) the full text of the emails**

In [None]:
# Which columns of the dataframe contain the text we want for our wordcloud?
# Only the last expression of a cell is rendered, so the column names have to
# be printed explicitly or they are silently discarded.
print(emails_df.columns.tolist())

#I found:
#- ExtractedSubject
#- ExtractedBodyText
#- RawText

# TODO SV check these columns to see if they're actually filled in and make sense
# Number of missing values per column (displayed as the cell's output).
emails_df.isnull().sum()

**TODO I decided to use the RawText for the wordcloud and might use ExtractedSubject and ExtractedBodyText in later steps.**

# Part 1: Generating wordclouds!

In [None]:
import matplotlib.pyplot as plt

# Concatenate all raw text into one big string.
# pandas reads empty CSV fields as float NaN and str.join raises TypeError on
# non-string items, so missing values are dropped before joining.
concatenated_raw_text = ' '.join(emails_df['RawText'].dropna().astype(str))
# A bare expression in the middle of a cell is not displayed; print the size.
print(len(concatenated_raw_text))

# Generate a word cloud image
wordcloud = WordCloud().generate(concatenated_raw_text)

# Display the generated image the matplotlib way:
plt.imshow(wordcloud)
plt.axis("off")

**The wordcloud library helpfully removed common English stopwords, but we still have some boring terms like 'date', 'subject', and 'message' in our cloud.**

**We can improve the results by doing some pre-processing:**
- remove punctuation (TODO: not done; we didn't bother with this)
- tokenization
- stopword removal
- stemming

In [None]:
import nltk
from nltk.corpus import stopwords

# Download only the resources this notebook uses: the tokenizer models for
# nltk.word_tokenize and the stopword corpus. A bare nltk.download() opens the
# interactive downloader, which blocks a "Restart & Run All".
# "punkt_tab" is what newer NLTK releases load for tokenization - TODO confirm
# against the installed NLTK version and drop whichever one is not needed.
#NOTE to team you can also run these downloads once outside the notebook
for nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_resource)

In [None]:
# Split the whole concatenated corpus into word tokens.
# (Expect this to take a few seconds.)
tokenized = nltk.tokenize.word_tokenize(concatenated_raw_text)

In [None]:
# Preview the first 20 tokens to sanity-check the tokenizer output.
tokenized[:20]

**Let's remove some more stopwords**

In [None]:
# Email-header and boilerplate terms to drop on top of NLTK's English stopwords.
additional_stopwords = [
    'subject', 'date', 're', 'cc', 'bcc', 'fwd', 'fw', 'sent',
    'state', 'department', 'unclassified', 'message', 'case',
]
# Combined list: NLTK's English stopwords followed by the extra terms above.
email_stopwords = nltk.corpus.stopwords.words('english') + additional_stopwords

In [None]:
# Keep every token whose lowercase form is not a stopword.
# The stopwords are copied into a set first: testing membership against the
# list scanned it once per token, which is needlessly slow on the full corpus.
email_stopword_set = set(email_stopwords)
tokens_without_stopwords = [t for t in tokenized if t.lower() not in email_stopword_set]

**Stemming - we use the Porter stemmer to normalize our text**

In [None]:
# Reduce every remaining token to its Porter stem, then preview the first 20.
porter = nltk.PorterStemmer()
stemmed = list(map(porter.stem, tokens_without_stopwords))
stemmed[:20]

In [None]:
# Build a second word cloud, this time from the stemmed tokens.
stemmed_text = " ".join(stemmed)
wordcloud2 = WordCloud().generate(stemmed_text)

# Render it without axes.
plt.imshow(wordcloud2)
plt.axis("off")

**The wordcloud looks quite a bit different - now we can see some interesting common words like "Cheryl", "Abedin", etc. This approach has some pros and cons:**
- (+) we were able to remove some of the obviously common words
- (-) stemming doesn't seem that useful
- (-) removing words is sort of an iterative process as each time we generate the wordcloud we see new frequent words to exclude

# Part 2: World country mentions

In [None]:
import pycountry

In [None]:
# Word cloud of the stopword-filtered (unstemmed) tokens, rendered without axes.
filtered_text = " ".join(tokens_without_stopwords)
filtered_cloud = WordCloud().generate(filtered_text)
plt.imshow(filtered_cloud)
plt.axis("off")

In [None]:
import pycountry

In [None]:
# One zero-initialised tally per entry in pycountry's country database.
countries = dict.fromkeys(pycountry.countries, 0)

In [None]:
# Count how many tokens pycountry resolves to each country.
# The tallies are reset first so that re-running this cell does not add a
# second round of counts on top of the previous one.
countries = dict.fromkeys(countries, 0)
# Tokens that pycountry could not resolve to any country.
bad_tokens = set()

# Look up each distinct token once and add its frequency in one step, instead
# of repeating the same lookup for every occurrence of the token.
for token, token_count in Counter(tokens_without_stopwords).items():
    try:
        country = pycountry.countries.lookup(token)
        countries[country] += token_count
    except LookupError:
        bad_tokens.add(token)

In [None]:
# Re-key the tallies by country name and turn them into a one-column frame
# indexed by that name.
occurrences_by_name = {
    country.name: occurrence_count
    for country, occurrence_count in countries.items()
}
country_df = pd.DataFrame.from_dict(occurrences_by_name, orient="index")
country_df.columns = ["Occurences"]

In [None]:
# Ten rows with the highest counts, largest first.
country_df.sort_values("Occurences", ascending=False).iloc[:10]

In [None]:
# Show which country names pycountry resolves the tokens "PM" and "AM" to.
for code in ("PM", "AM"):
    print(pycountry.countries.lookup(code).name)

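**"PM" is the country code of Saint Pierre and Miquelon, so every "PM" token in the emails was counted as a mention of it. "AM" (the code of Armenia) did not cause the same problem because "am" was already in the stop-words. We could start again with "pm" as a stopword, but since Saint Pierre and Miquelon is very unlikely to appear in any email, we simply set its count to zero.**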

In [None]:
# Zero out the row for the territory discussed above, then show the top ten again.
false_positive = "Saint Pierre and Miquelon"
country_df.loc[false_positive] = 0
country_df.sort_values("Occurences", ascending=False).iloc[:10]