Enron email sender classification

In [106]:
import pandas as pd

from spacy import load
from string import punctuation
from collections import Counter
from operator import itemgetter
from statistics import stdev
from nltk.sentiment.vader import SentimentIntensityAnalyzer


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize


Import the data from the csv file to dataframe

In [107]:
emails_df = pd.read_csv('extracted-2.csv')
emails_df.head(1)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,file_id,msg,Message-ID,Date,From,To,Subject,Mime-Version,...,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,0,0,lokay-m/lokay-m_65/,Message-ID: <11819236.1075844017427.JavaMail.e...,<11819236.1075844017427.JavaMail.evans@thyme>,"Wed, 14 Jun 2000 01:24:00 -0700 (PDT)",frozenset({'steven.harris@enron.com'}),frozenset({'michele.lokay@enron.com'}),"Reminder: WEFA Meeting Tomorrow Morning, June ...",1.0,...,7bit,Steven Harris,Michele Lokay,,,\Michelle_Lokay_Dec2000_June2001_1\Notes Folde...,LOKAY-M,mlokay.nsf,FYI\n---------------------- Forwarded by Steve...,lokay-m


Taking a sample of 2000 emails randomly from each user

In [108]:
df_sample = emails_df[["Message-ID", "content", "user"]].groupby('user').apply(lambda df: df.sample(10))


In [109]:
df_sample["doc"] = df_sample.content.apply(load('en'))

Extracting character based features from each email

In [110]:
df_characters = pd.DataFrame()
    
for index, row in df_sample.iterrows():    
    num_chars = 0
    num_letters = 0
    num_upper_case = 0
    num_lower_case = 0
    num_punctuation = 0
    num_spaces = 0
    num_nums = 0
    
    for word in row['doc']:
        for char in str(word):
            num_chars += 1
            if char.isalpha():
                num_letters += 1
            if char.isupper():
                num_upper_case += 1
            if char.islower():
                num_lower_case += 1
            if char in punctuation:
                num_punctuation += 1
            if char.isspace():
                num_spaces += 1
            if char.isnumeric():
                num_nums += 1
            
    df_row = pd.DataFrame({"chars": num_chars, "letters": num_letters, "upper_case": num_upper_case, "lower_case": num_lower_case, "punctuations": num_punctuation, "spaces": num_spaces, "nums": num_nums}, index=[0])
    df_characters = df_characters.append(df_row, ignore_index=True)

df_sample['temp'] = ""
df_characters['temp'] = 1

for i in range(df_sample['user'].count()):
    df_sample['temp'][i] = i
    df_characters['temp'][i] = i
    
df_sample = pd.merge(df_sample, df_characters, on=['temp'])  


Extracting word based features

In [111]:
df_words = pd.DataFrame()

for index, row in df_sample.iterrows():
    long_word = 5
    num_words = 0
    avg_letters_per_word = 0
    num_longwords = 0
    num_stopwords = 0
    num_error = 0
    TTR = 0
    hapaxes = []
    HTR = 0
    word_list = []
    sorted_word_frequency = []
    max_frequency = 0
    
    for word in row['doc']:
        if word.is_alpha:
            num_words += 1
            if len(word) > long_word:
                num_longwords += 1
            if word.is_stop:
                num_stopwords += 1
            word_list.append(str(word))
    
    if num_words > 0:
        avg_letters_per_word = float(row['letters']/num_words)
    else:
        avg_letters_per_word = 0
    
    if len(word_list) > 0:
        TTR = len(set(word_list))/len(word_list)
        hapaxes = list(filter(lambda x: word_list.count(x) == 1, word_list))
        HTR = len(hapaxes)/len(word_list)
        sorted_word_frequency = sorted(Counter(word_list).items(), key=itemgetter(1), reverse=True)
        max_frequency = sorted_word_frequency[0][1]
        
    df_row = pd.DataFrame({"words": num_words, "avg_letters_per_word": avg_letters_per_word, "longwords": num_longwords, "stopwords": num_stopwords, "TTR": TTR, "HTR": HTR, "max_frequency": max_frequency}, index=[0])
    df_words = df_words.append(df_row, ignore_index=True)

df_words['temp'] = 1

for i in range(df_sample['user'].count()):
    df_words['temp'][i] = i
    
df_sample = pd.merge(df_sample, df_words, on=['temp'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Extracting sentence based features

In [112]:
df_sentences = pd.DataFrame()

for index, row in df_sample.iterrows():
    sentences = list(row['doc'].sents)
    num_sentences = len(sentences)
    
    #nr_words_l = [len(s) for s in doc.sents]
    avg_num_words_per_sentence = row['words']/num_sentences
    
    if len(sentences) > 1:
        std_num_words_per_sentence = stdev([len(sentence) for sentence in sentences])
    else:
        std_nr_word_per_sent = 0

    df_row = pd.DataFrame({"sentences": num_sentences, "avg_num_words_per_sentence": avg_num_words_per_sentence, "std_num_words_per_sentence": std_num_words_per_sentence}, index=[0])
    df_sentences = df_sentences.append(df_row, ignore_index=True)
    
df_sentences['temp'] = 1

for i in range(df_sample['user'].count()):
    df_sentences['temp'][i] = i
    
df_sample = pd.merge(df_sample, df_sentences, on=['temp'])

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Extracting punctuation based features

In [113]:
df_punctuations = pd.DataFrame()

for index, row in df_sample.iterrows():
    num_dots = 0
    num_commas = 0
    num_question_marks = 0
    num_exclamations = 0
    num_hyphens = 0
    num_colons = 0
    num_semicolons = 0
    
    for word in row['doc']:
        if str(word) == ".":
            num_dots += 1
        if str(word) == ",":
            num_commas += 1
        if str(word) == "?":
            num_question_marks += 1
        if str(word) == "!":
            num_exclamations += 1
        if str(word) == "-":
            num_hyphens += 1
        if str(word) == ":":
            num_colons += 1
        if str(word) == ";":
            num_semicolons += 1
        
    df_row = pd.DataFrame({"dots": num_dots, "commas": num_commas, "question_marks": num_question_marks, "exclamations": num_exclamations, "hyphens": num_hyphens, "colons": num_colons, "semicolons": num_semicolons}, index=[0])
    df_punctuations = df_punctuations.append(df_row, ignore_index=True)
    
df_punctuations['temp'] = 1

for i in range(df_sample['user'].count()):
    df_punctuations['temp'][i] = i
    
df_sample = pd.merge(df_sample, df_punctuations, on=['temp'])


Extracting paragraph based features

In [114]:
df_paragraphs = pd.DataFrame()

for index, row in df_sample.iterrows():
    avg_sentences_per_para = 0
    avg_words_per_para = 0

    paragraphs = row['content'].split('\n\n')
    paragraphs = [paragraph for paragraph in paragraphs if not (paragraph.isspace()or paragraph == "")]
    
    avg_sentences_per_para = row['sentences']/len(paragraphs)
    avg_words_per_para = row['words']/len(paragraphs)

    df_row = pd.DataFrame({"paragraphs": len(paragraphs), "avg_sentences_per_paragrapgh": avg_sentences_per_para, "avg_words_per_paragraph": avg_words_per_para}, index=[0])
    df_paragraphs = df_paragraphs.append(df_row, ignore_index=True)

df_paragraphs['temp'] = 1

for i in range(df_sample['user'].count()):
    df_paragraphs['temp'][i] = i
    
df_sample = pd.merge(df_sample, df_paragraphs, on=['temp'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Extracting semantic based features

In [115]:
df_semantics = pd.DataFrame()

for index, row in df_sample.iterrows():
    intensity = SentimentIntensityAnalyzer()
    num_positive_words = 0
    num_negative_words = 0
    num_named_entities = 0

    for word in row['doc']:
        if (intensity.polarity_scores(str(word))['compound']) >= 0.5:
            num_positive_words += 1
        elif (intensity.polarity_scores(str(word))['compound']) <= -0.5:
            num_negative_words += 1
        
        if word.ent_type_ != "":
            num_named_entities += 1       
    
    score = intensity.polarity_scores(row['content'])['compound']
    greeting_words_list = ["Dear", "To Whom It May Concern", "Hello", "Hi"]
    num_greeting_words = 0
    for g_word in row['content'].split():
        if g_word in greeting_words_list:
            num_greeting_words += 1
    
    df_row = pd.DataFrame({"semantic_score": score, "positive_words": num_positive_words, "negative_words": num_negative_words, "named_entities": num_named_entities, "greeting_words": num_greeting_words}, index=[0])
    df_semantics = df_semantics.append(df_row, ignore_index=True)
    
df_semantics['temp'] = 1

for i in range(df_sample['user'].count()):
    df_semantics['temp'][i] = i
    
df_sample = pd.merge(df_sample, df_semantics, on=['temp'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Extracting syntactic based features

In [116]:
df_syntactic = pd.DataFrame()

for index, row in df_sample.iterrows():
    pos_list = []
    function_pos_list = ["PRON", "DET", "ADP", "CONJ", "AUX", "INTJ", "PART", "CCONJ", "PART"]
    nr_function = 0
    sum_length_np = 0
    avg_length_np = 0
    np_list = []

    for token in row['doc']:
        pos_list.append(token.pos_)
    for pos in pos_list:
        if pos in function_pos_list:
            nr_function += 1
    for np in row['doc'].noun_chunks:
        sum_length_np += len(np.text)
        np_list.append(np.text)
    if len(np_list) > 0:
        avg_length_np = sum_length_np/len(np_list)
    else:
        avg_length_np = 0

    df_row = pd.DataFrame({"nr_pos": len(set(pos_list)), "nr_function": nr_function, "avg_length_np": avg_length_np}, index=[0])
    df_syntactic = df_syntactic.append(df_row, ignore_index=True)
    
df_syntactic['temp'] = 1

for i in range(df_sample['user'].count()):
    df_syntactic['temp'][i] = i
    
df_sample = pd.merge(df_sample, df_syntactic, on=['temp'])
df_sample = df_sample.drop('temp', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [117]:
print (df_sample.head(5))

                                      Message-ID  \
0  <11669066.1075861668701.JavaMail.evans@thyme>   
1  <23478641.1075857656601.JavaMail.evans@thyme>   
2  <24148824.1075857583291.JavaMail.evans@thyme>   
3   <5634476.1075852716955.JavaMail.evans@thyme>   
4   <4918911.1075857613017.JavaMail.evans@thyme>   

                                             content      user  \
0  \nYOU HAVE ACCESS TO COME IN THE BUILDING ON S...  arnold-j   
1               can you change #23 and #375 to Nymex  arnold-j   
2  hey podner:\nwhere are you buying me dinner to...  arnold-j   
3  with?\n\n -----Original Message-----\nFrom: \t...  arnold-j   
4  fine\n\n\nFrom: Sarah Wesner/ENRON@enronXgate ...  arnold-j   

                                                 doc  chars  letters  \
0  (\n, YOU, HAVE, ACCESS, TO, COME, IN, THE, BUI...    288      238   
1  (can, you, change, #, 23, and, #, 375, to, Nymex)     29       22   
2  (hey, podner, :, \n, where, are, you, buying, ...     44       41   
3 

In [118]:
print (df_sample.columns)

Index(['Message-ID', 'content', 'user', 'doc', 'chars', 'letters',
       'upper_case', 'lower_case', 'punctuations', 'spaces', 'nums', 'words',
       'avg_letters_per_word', 'longwords', 'stopwords', 'TTR', 'HTR',
       'max_frequency', 'sentences', 'avg_num_words_per_sentence',
       'std_num_words_per_sentence', 'dots', 'commas', 'question_marks',
       'exclamations', 'hyphens', 'colons', 'semicolons', 'paragraphs',
       'avg_sentences_per_paragrapgh', 'avg_words_per_paragraph',
       'semantic_score', 'positive_words', 'negative_words', 'named_entities',
       'greeting_words', 'nr_pos', 'nr_function', 'avg_length_np'],
      dtype='object')


In [119]:
df_sample['user'] = df_sample['user'].map( {'arnold-j':1, 'bass-e':2, 'farmer-d':3, 'germany-c':4, 'jones-t':5, 'lenhart-m':6, 'lokay-m':7, 'love-p':8, 'mann-k':9, 'nemec-g':10, 'perlingiere-d':11, 'rogers-b':12, 'scott-s':13, 'symes-k':14} ).astype(int)



In [120]:
df_sample.drop(['doc', 'content', 'Message-ID'], axis=1, inplace=True)
print (df_sample.head(1))

   user  chars  letters  upper_case  lower_case  punctuations  spaces  nums  \
0     1    288      238         188          50            10      13    27   

   words  avg_letters_per_word  ...  avg_sentences_per_paragrapgh  \
0     56                  4.25  ...                           6.0   

   avg_words_per_paragraph  semantic_score  positive_words  negative_words  \
0                     28.0          0.7695               0               0   

   named_entities  greeting_words  nr_pos  nr_function  avg_length_np  
0              19               0      11           24      10.047619  

[1 rows x 36 columns]


In [121]:
df_sample.index = list(range(len(df_sample.index)))

train_idx, test_idx= train_test_split(df_sample.index, test_size=0.2, random_state=42) 
train_df = df_sample.iloc[train_idx]
test_df = df_sample.iloc[test_idx]

print (train_df.shape)
print (test_df.shape)

(112, 36)
(28, 36)


In [122]:
X = train_df.drop('user', axis=1)
y = train_df.user

print (X.shape[1])

35


In [123]:
sScaler = StandardScaler(copy=True, with_mean=True, with_std=True)

XScaled = minmax_scale((sScaler.fit_transform(X)), feature_range=(0, 1))
y_label = label_binarize(y, classes=list(range(1,13)))

In [124]:
X_test = test_df.drop('user', axis=1)
y_test = test_df.user
X_test_Scaled = minmax_scale((sScaler.fit_transform(X_test)), feature_range=(0, 1))

In [125]:
clf = SVC(kernel='rbf')
scores = cross_val_score(clf, XScaled, y_label, cv=5, scoring='roc_auc')
print("roc_auc: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

roc_auc: nan (+/- nan)


ValueError: bad input shape (89, 12)

ValueError: bad input shape (89, 12)

ValueError: bad input shape (90, 12)

ValueError: bad input shape (90, 12)

ValueError: bad input shape (90, 12)



In [83]:
svc = SVC(kernel='rbf')
svc.fit(X, y)
y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_test, y_test) * 100, 5)
acc_svc

7.14286

In [90]:
clf = KNeighborsClassifier(n_neighbors = 50,algorithm='ball_tree')
scores = cross_val_score(clf, XScaled, y_label, cv=5, scoring='roc_auc')
print("roc_auc: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
knn = KNeighborsClassifier(n_neighbors = 100,weights ='distance')
knn.fit(X, y)
y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_test, y_test) * 100, 5)
acc_knn

In [None]:
clf = RandomForestClassifier(n_estimators=100) 
scores = cross_val_score(clf, X, y_label, cv=5, scoring='roc_auc')
print("roc_auc: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

In [None]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X, y)
Y_pred = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 5)
acc_random_forest