In [22]:
# Code used in part 1 of How I used machine learning to classify emails and turn them into insights.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import normalize 

import nltk
from nltk.corpus import stopwords
from IPython.display import display, clear_output

from helpers import * 

%matplotlib inline

In [3]:
from sklearn.metrics.pairwise import linear_kernel
from query import EmailDataset
from sklearn.svm import LinearSVC

In [4]:
emails = pd.read_csv('emails.csv')

In [5]:
# email_df = pd.read_csv('split_emails.csv')
emails = pd.read_csv('emails.csv')

# Lets create a new frame with the data we need.
email_df = pd.DataFrame(parse_into_emails(emails.message))

# Drop emails with empty body, to or from_ columns. 
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

In [6]:
# Flag email as important if the title start with re, fw or fwd
email_df.loc[:, 'isImportant'] = email_df.subject.str.lower().str.split(':').apply(lambda x:x[0] in ['re', 'fw', 'fwd'])

In [7]:
email_df["folder"] = emails.file.str.split('/').apply(lambda x: '/'.join(x[:-1]) if type(x)==list and len(x) else None)
email_df["user"] = email_df.folder.str.split('/').apply(lambda x:x[0] if type(x)==list and len(x) else None)

In [10]:
folder_num = email_df.groupby('user').apply(lambda x:x.folder.unique().shape[0])

In [11]:
email_low = email_df.loc[email_df.user.isin(folder_num.loc[folder_num<15].index)]
email_mid = email_df.loc[email_df.user.isin(folder_num.loc[(folder_num<=50) & (folder_num>=15)].index)]
email_high = email_df.loc[email_df.user.isin(folder_num.loc[(folder_num>50)].index)]

In [None]:
def model_eval(train, test, email, label = 'subject'):
    stop_words = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
    X, model, features, trans = {}, {}, {}, {}
    for i in email.user.unique():
        view = train.loc[train.user == i]
        vect = TfidfVectorizer(analyzer='word', stop_words=stop_words, max_df=0.3, min_df=2)
        X[i] = vect.fit_transform(view.loc[:, label])
        features[i] = vect.get_feature_names()
        trans[i] = vect
        lsvc = LinearSVC(multi_class='ovr', C = 1.0)
        lsvc.fit(X[i], view.folder)
        model[i] = lsvc

    count = 0
    for i in range(test.shape[0]):
#         clear_output(wait=True)
        tmp = model[test.iloc[i].user].predict(trans[test.iloc[i].user].transform([test.iloc[i].loc[label]]))
        if (test.iloc[i].folder == tmp[0]):
            count += 1
#         print("Finished {:.2f}%".format(i/test.shape[0]*100))

    print("Accuracy {:2f}%".format(count/test.shape[0]*100))

In [65]:
def model_eval(train, test, email):
    stop_words = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
    X, model, features, trans = {}, {}, {}, {}
    for i in email.user.unique():
        view = train.loc[train.user == i]
        vect = TfidfVectorizer(analyzer='word', stop_words=stop_words, max_df=0.3, min_df=2)
        X[i] = vect.fit_transform(view.subject)
        features[i] = vect.get_feature_names()
        trans[i] = vect
        lsvc = LinearSVC(multi_class='ovr', C = 1.0)
        lsvc.fit(X[i], view.folder)
        model[i] = lsvc

    count = 0
    for i in range(test.shape[0]):
#         clear_output(wait=True)
        tmp = model[test.iloc[i].user].predict(trans[test.iloc[i].user].transform([test.iloc[i].subject]))
        if (test.iloc[i].folder == tmp[0]):
            count += 1
#         print("Finished {:.2f}%".format(i/test.shape[0]*100))

    print("Accuracy {:2f}%".format(count/test.shape[0]*100))

In [66]:
for email in [email_low, email_mid, email_high]:
    # sparsity  score
    sparsity = email.groupby(['user', 'folder']).subject.count().groupby(['user']).apply(lambda x:1/(x**2).sum()*x.sum()**2)

    names = sparsity.loc[sparsity<sparsity.quantile(1/3)].index
    email_dense = email.loc[email.user.isin(names)]

    names = sparsity.loc[sparsity>sparsity.quantile(2/3)].index
    email_sparse = email.loc[email.user.isin(names)]

    names = sparsity.loc[(sparsity>=sparsity.quantile(1/3)) & (sparsity<=sparsity.quantile(2/3))].index
    email_normal = email.loc[email.user.isin(names)]
    for e in [email_dense, email_sparse, email_normal]:
        check = e.copy()
        reorder = np.arange(check.shape[0])
        check.index = reorder
        np.random.shuffle(reorder)
        check = check.iloc[reorder, :]
        check = check.dropna()
        train, test = check.iloc[:int(e.shape[0]/5*4)], check.iloc[int(e.shape[0]/5*4):]
        model_eval(train, test, e)

Accuracy 64.952991%
Accuracy 23.797183%
Accuracy 28.013724%
Accuracy 40.767618%
Accuracy 19.343368%
Accuracy 21.146953%
Accuracy 33.486811%
Accuracy 14.753670%
Accuracy 22.204330%


In [69]:
for email in [email_low, email_mid, email_high]:
    # sparsity  score
    sparsity = email.groupby(['user', 'folder']).subject.count().groupby(['user']).apply(lambda x:1/(x**2).sum()*x.sum()**2)

    names = sparsity.loc[sparsity<sparsity.quantile(1/3)].index
    email_dense = email.loc[email.user.isin(names)]

    names = sparsity.loc[sparsity>sparsity.quantile(2/3)].index
    email_sparse = email.loc[email.user.isin(names)]

    names = sparsity.loc[(sparsity>=sparsity.quantile(1/3)) & (sparsity<=sparsity.quantile(2/3))].index
    email_normal = email.loc[email.user.isin(names)]
    for e in [email_dense, email_sparse, email_normal]:
        check = e.copy()
        reorder = np.arange(check.shape[0])
        check.index = reorder
        np.random.shuffle(reorder)
        check = check.iloc[reorder, :]
        check = check.dropna()
        train, test = check.iloc[:int(e.shape[0]/5*4)], check.iloc[int(e.shape[0]/5*4):]
        model_eval(train, test, e, 'body')

Accuracy 63.952791%
Accuracy 25.160563%
Accuracy 20.678900%
Accuracy 39.638768%
Accuracy 19.722650%
Accuracy 14.008363%
Accuracy 21.577962%
Accuracy 12.058728%
Accuracy 17.838612%
