In [11]:
# Code used in part 1 of How I used machine learning to classify emails and turn them into insights.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import normalize 
from sklearn.metrics.pairwise import linear_kernel
from query import EmailDataset

import nltk
from nltk.corpus import stopwords

from helpers import * 

%matplotlib inline

In [2]:
# Load the raw CSV; later cells read its `message` and `file` columns.
emails = pd.read_csv(filepath_or_buffer='emails.csv')

In [3]:
# email_df = pd.read_csv('split_emails.csv')
# Read the raw dump (the same file the previous cell loads).
emails = pd.read_csv(filepath_or_buffer='emails.csv')

# Build the working frame from whatever parse_into_emails returns for the
# raw message column.
email_df = pd.DataFrame(parse_into_emails(emails.message))

# Remove, in place, every row whose body, to or from_ field is an empty string.
email_df.drop(
    index=email_df.query("body == '' | to == '' | from_ == ''").index,
    inplace=True,
)

In [4]:
# Flag an email as important when the subject text before the first ':' is
# exactly 're', 'fw' or 'fwd' (case-insensitive).
# `.str[0]` yields NaN for a missing subject and `isin` maps NaN to False, so a
# row without a subject is flagged False instead of raising a TypeError inside
# a Python-level lambda.
email_df.loc[:, 'isImportant'] = (
    email_df.subject.str.lower()
    .str.split(':')
    .str[0]
    .isin(['re', 'fw', 'fwd'])
)

In [5]:
# folder: the '/'-separated file path with its last component removed.
# None whenever the split result is not a non-empty list (e.g. missing path).
email_df["folder"] = emails.file.str.split('/').map(
    lambda parts: '/'.join(parts[:-1]) if isinstance(parts, list) and parts else None
)
# user: the first component of the folder path, None under the same condition.
email_df["user"] = email_df.folder.str.split('/').map(
    lambda parts: parts[0] if isinstance(parts, list) and parts else None
)

In [7]:
# Number of distinct folders per user, indexed by user.
# The built-in nunique replaces groupby.apply with a Python lambda: it is
# vectorised and avoids pandas' deprecation of apply operating on the grouping
# column. dropna=False keeps the counting rule of `.unique().shape[0]`.
folder_num = email_df.groupby('user')['folder'].nunique(dropna=False)

In [8]:
# Bucket the emails by how many distinct folders their user has:
# fewer than 15, 15 to 50 inclusive, and more than 50.
email_low = email_df[email_df.user.isin(folder_num[folder_num < 15].index)]
email_mid = email_df[email_df.user.isin(folder_num[folder_num.between(15, 50)].index)]
email_high = email_df[email_df.user.isin(folder_num[folder_num > 50].index)]

In [55]:
def model_eval(train, test, email):
    """Score nearest-neighbour folder prediction based on e-mail subjects.

    For every user found in ``email`` a TF-IDF vectorizer is fitted on that
    user's training subjects. Each test e-mail is then compared (linear kernel
    on the TF-IDF vectors) with all training e-mails of the same user; the
    folder containing the most similar training e-mail is the prediction.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; the columns ``user``, ``subject`` and ``folder`` are read.
    test : pandas.DataFrame
        Rows to classify; the same three columns are read.
    email : pandas.DataFrame
        Frame whose ``user`` column lists the users to build models for.

    Returns
    -------
    float
        Accuracy in percent (also printed). NaN when ``test`` is empty.
        Test rows whose user has no fitted model count as misses.
    """
    # scikit-learn >= 1.2 validates ``stop_words`` and rejects a frozenset,
    # so hand it a list.
    stop_words = list(ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient']))

    X, trans, folders = {}, {}, {}
    for user in email.user.unique():
        view = train.loc[train.user == user]
        if view.empty:
            # Nothing to fit on for this user.
            continue
        vect = TfidfVectorizer(analyzer='word', stop_words=stop_words, max_df=0.3, min_df=2)
        try:
            matrix = vect.fit_transform(view.subject)
        except ValueError:
            # Raised when no term survives the stop-word / min_df / max_df
            # pruning; skip this user instead of aborting the whole run.
            continue
        X[user] = matrix
        trans[user] = vect
        # Folder label of every training row, in the row order of X[user].
        # Cached here so the test loop does not re-filter ``train`` per row.
        folders[user] = view.folder.to_numpy()

    count = 0
    n_test = test.shape[0]
    for i in range(n_test):
        view = test.iloc[i]
        vect = trans.get(view.user)
        if vect is None:
            # No model for this user: the row counts as a miss.
            continue
        similarity = linear_kernel(vect.transform([view.subject]), X[view.user]).flatten()
        # Best similarity reached inside each training folder.
        best_per_folder = pd.Series(similarity).groupby(folders[view.user]).max()
        # idxmax returns the folder *label*. The previous rank().argmin()
        # returns an integer position on pandas >= 1.0, which never equals a
        # folder name.
        if view.folder == best_per_folder.idxmax():
            count += 1

    accuracy = count / n_test * 100 if n_test else float('nan')
    print("Accuracy {:.2f}%".format(accuracy))
    return accuracy

In [None]:
# Seeded generator so a Restart & Run All reproduces the same train/test splits.
split_rng = np.random.RandomState(42)

for email in [email_low, email_mid, email_high]:
    # Per-user sparsity score: (sum of counts)^2 / (sum of squared counts),
    # taken over that user's per-folder subject counts.
    sparsity = email.groupby(['user', 'folder']).subject.count().groupby(['user']).apply(lambda x:1/(x**2).sum()*x.sum()**2)

    # Tercile boundaries of the score, computed once per bucket.
    lower, upper = sparsity.quantile(1/3), sparsity.quantile(2/3)

    names = sparsity.loc[sparsity < lower].index
    email_dense = email.loc[email.user.isin(names)]

    names = sparsity.loc[sparsity > upper].index
    email_sparse = email.loc[email.user.isin(names)]

    names = sparsity.loc[(sparsity >= lower) & (sparsity <= upper)].index
    email_normal = email.loc[email.user.isin(names)]

    for e in [email_dense, email_sparse, email_normal]:
        # Drop incomplete rows *before* splitting so the 80/20 cut is taken
        # over the rows actually used. Cutting at 80% of the pre-dropna length
        # shrinks the test set by the number of dropped rows.
        check = e.dropna()
        if check.empty:
            print("Skipped: no complete rows in this subset")
            continue
        # Seeded shuffle; reset the index so labels match the new row order.
        check = check.sample(frac=1, random_state=split_rng).reset_index(drop=True)

        cut = len(check) * 4 // 5
        train, test = check.iloc[:cut], check.iloc[cut:]
        if train.empty or test.empty:
            print("Skipped: too few rows for a train/test split")
            continue
        model_eval(train, test, e)