In [None]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Mounted at /gdrive
/gdrive


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import email
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english') 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("/gdrive/My Drive/AI Email Classifier/emails.csv")

Extracting Catagories

In [5]:
catagories = []
for name in data['file']:
  name=name.split('/')
  catagories.append(name[1])

data['catagory'] = catagories
labels = list(data['catagory'].unique())

print(data['catagory'].value_counts())

all_documents          128103
discussion_threads      58609
sent                    57653
deleted_items           51356
inbox                   44859
                        ...  
bush_administration         1
volume_management           1
chinesewallspolicy          1
fundies                     1
sonoco                      1
Name: catagory, Length: 1427, dtype: int64


In [11]:
selected_catagories =  ['discussion_thread', 'personal', 'meetings', 'logistics', 'calendar', 'archiving', 'california', 'power', 'deal_communication', 'resumes']

data = data[data['catagory'].isin(selected_catagories)]

Extracting Message Body and Headers

In [12]:
#mail body
email_body = []

for mail in data['message']:
  mail = email.message_from_string(mail)  

  # getting message body  
  message_body = mail.get_payload()
  
  email_body.append(message_body)

data['message_body'] = email_body


#getting headers

headers = {"Date":[], "Subject":[], "X-Folder":[], "X-From":[], "X-To":[]}
for mail in data['message']:
  mail = email.message_from_string(mail)  

  #get other headers
  for header in headers.keys():
    headers[header].append(mail.get(header))

for key in headers.keys():
  data[key] = headers[key]
data['Date'] = pd.to_datetime(data['Date'])

print(data.head())

                      file  ...                                               X-To
6345  arnold-j/personal/1.  ...  "jennifer.medcalf@enron.com" <jennifer.medcalf...
6346  arnold-j/personal/2.  ...                                   Jennifer Medcalf
6347  arnold-j/personal/3.  ...                         jennifer.medcalf@enron.com
6348  arnold-j/personal/4.  ...                                   Jennifer Medcalf
6349  arnold-j/personal/5.  ...                                   Jennifer Medcalf

[5 rows x 9 columns]


Dropping empty rows

In [13]:
print(data.shape)

(19642, 9)


In [14]:
data.dropna(inplace = True)
print(data.shape)

(19623, 9)


Text cleaning


In [15]:
#Removing non alphanumeric characters
def clean_text(text):
  cleanText = text.lower()
  cleanText = re.sub(r'[\W\d]', " ", cleanText)  

  return cleanText

#Removing stop words
def stop_word_removal(text):
  tokens = [token for token in text if token not in stopwords]
  return tokens

#Tokenizing
def tokenize(text):
  tokens = text.split(" ")
  return tokens

headers = ["Subject", "X-Folder", "X-From", "X-To", "message_body"]

tokens = [] 

for i in range(data.shape[0]):
  tokens_i = []
  for header in headers:
    tokens_h = clean_text(data[header].values[i])
    tokens_h = tokenize(tokens_h)
    tokens_h = stop_word_removal(tokens_h)

    tokens_i.extend(tokens_h)

  tokens.append(tokens_i)

In [16]:
for i in range(len(tokens)):
  tokens[i].remove("")
  tokens[i] = " ".join(tokens[i])

data['final_text'] = tokens

print(data['final_text'])

6345      dell order confirmation john_arnold_nov     no...
6346      brown bag thank john_arnold_nov     notes fold...
6347      dell online order john_arnold_nov     notes fo...
6348      update  attendees brown bags                jo...
6349      confirmation order          john_arnold_nov   ...
                                ...                        
516848    duke westcoast transaction exmerge   zufferli ...
516849    duke westcoast transaction exmerge   zufferli ...
516850    updated edcc ecc pricing discussion  exmerge  ...
516851    aes project  tolling interest  exmerge   zuffe...
516852    transfer enron direct contracts ed  marking in...
Name: final_text, Length: 19623, dtype: object


Applying Train-Test split

In [17]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(data['final_text'], data['catagory'], test_size=0.3)

Encoding data catagories

In [18]:
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

Applying TF-IDF vectorizer on final text

In [19]:
Tfidf_vect = TfidfVectorizer()

Tfidf_vect.fit(data['final_text'])

x_train = Tfidf_vect.transform(x_train)
x_test = Tfidf_vect.transform(x_test)

Applying Naive Bayes model

In [20]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(x_train, y_train)

predictions_NB = Naive.predict(x_test)
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, y_test)*100)

Naive Bayes Accuracy Score ->  58.077119075930014


Applying SVM

In [21]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(x_train, y_train)

predictions_SVM = SVM.predict(x_test)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  87.42993035501954
