Download the enron dataset

In [None]:
print("Downloading the Enron dataset (this may take a while)")
print("To check on progress, you can cd up one level, then execute <ls -lthr>")
print("Enron dataset should be last item on the list, along with its current size")
print("Download will complete at about 1.82 GB")

import requests
url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz"
filename = "../enron_mail_20150507.tar.gz"
with open(filename, "wb") as f:
    r = requests.get(url)
    f.write(r.content)
print("Download Complete!")

print("Unzipping Enron dataset (This may take a while)")
import tarfile
tfile = tarfile.open("../enron_mail_20150507.tar.gz")
tfile.extractall(".")
tfile.close()

print("You're ready to go!")

Downloading the Enron dataset (this may take a while)
To check on progress, you can cd up one level, then execute <ls -lthr>
Enron dataset should be last item on the list, along with its current size
Download will complete at about 1.82 GB


Convert all enron email files to csv

In [1]:
import os
import csv

In [2]:
# Define the root directory of the Enron dataset
MAILDIR_PATH = "./maildir" 
OUTPUT_CSV = "enron_emails.csv"

In [3]:
# Define the CSV columns
fields = ["file", "message"]

In [4]:
# Function to extract email content from a file
def extract_email(file_path):
    with open(file_path, "r", encoding="latin1") as file:
        data = file.read()
    relative_path = os.path.relpath(file_path, MAILDIR_PATH).replace(MAILDIR_PATH + '/', '')
    email_message = email.message_from_string(data)
    return {
        "file": relative_path,
        "message": email_message.as_string()
    }

# List to store the email data
all_emails = []

# Walk through the directories and extract emails
for root, dirs, files in os.walk(MAILDIR_PATH):
    for file in files:
        file_path = os.path.join(root, file)
        try:
            email_content = extract_email(file_path)
            all_emails.append(email_content)
        except Exception as e:
            print(f"Failed to extract {file_path}: {e}")

# Write the emails to a CSV file
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fields)
    csv_writer.writeheader()
    csv_writer.writerows(all_emails)

Emails have been successfully written to enron_emails.csv


In [20]:
# import needed libraries
import numpy as np
import string
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import email
import re
from datetime import datetime
from dateutil import tz
import networkx as nx
import nltk
import wordcloud
import plotly.express as px
from plotly import graph_objects as go

# Machine learning and NLP libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
import scipy as sp
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [21]:
# Sentiment Analysis
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nrclex import NRCLex

Loading data

In [22]:
df = pd.read_csv(r"enron_emails.csv")

In [23]:
df.head()

Unnamed: 0,file,message
0,/Users/liqi/Desktop/IND5003/5003grp/maildir/.D...,
1,/Users/liqi/Desktop/IND5003/5003grp/maildir/ar...,Message-ID: <17334447.1075857585446.JavaMail.e...
2,/Users/liqi/Desktop/IND5003/5003grp/maildir/ar...,Message-ID: <19171686.1075857585034.JavaMail.e...
3,/Users/liqi/Desktop/IND5003/5003grp/maildir/ar...,Message-ID: <29887033.1075857630725.JavaMail.e...
4,/Users/liqi/Desktop/IND5003/5003grp/maildir/ar...,Message-ID: <29084893.1075849630138.JavaMail.e...


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517403 entries, 0 to 517402
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   file        517403 non-null  object
 1   message     517403 non-null  object
 2   Date        517401 non-null  object
 3   From        517401 non-null  object
 4   To          495554 non-null  object
 5   Subject     517401 non-null  object
 6   X-From      517372 non-null  object
 7   X-To        517372 non-null  object
 8   X-Folder    517372 non-null  object
 9   X-Origin    517372 non-null  object
 10  X-Filename  517372 non-null  object
dtypes: object(11)
memory usage: 43.4+ MB


Data cleaning and preparing

In [28]:
# Write functions to extract mail headers and body
def extractmailitems(field, msg):
    elst = []
    
    for i, message in (msg.items()):
        e = email.message_from_string(message)
        elst.append(e.get(field))
       
    return elst

In [30]:
columns = ['Date', 'From', 'To', 'Subject', 'X-From', 'X-To', 'X-Folder', 'X-Origin', 'X-Filename']
df['message'] = df['message'].astype(str)  # Convert messages to strings
for i in columns:
    df[i] = extractmailitems(i, df['message'])

In [32]:
def body(col):
    bodycolumn = []
    for message in col.values:
        e = email.message_from_string(message)
        bodycolumn.append(e.get_payload())
 
    return bodycolumn
df['Body'] = body(df['message'])

In [33]:
# extract emplyees' names from file column
def employees(files):
    employees = []
    for i, employee in files.items():
        employee = employee.split('/')[0]
        employees.append(employee)
    return employees
df['Employee'] = employees(df['file'])

In [34]:
df.sample(5)

Unnamed: 0,file,message,Date,From,To,Subject,X-From,X-To,X-Folder,X-Origin,X-Filename,Body,Employee
482313,/Users/liqi/Desktop/IND5003/5003grp/maildir/ke...,Message-ID: <11621426.1075855427794.JavaMail.e...,"Thu, 20 Dec 2001 08:51:28 -0800 (PST)",chris.behney@enron.com,mark.pickering@enron.com,RE: email problem,"Behney, Chris </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Pickering, Mark </O=ENRON/OU=NA/CN=RECIPIENTS/...","\Steven_Kean_Jan2002\Kean, Steven J.\junk mail",Kean-S,skean (Non-Privileged).pst,Steve is set to unlimited space now. He and hi...,
291686,/Users/liqi/Desktop/IND5003/5003grp/maildir/ma...,Message-ID: <24481601.1075845583729.JavaMail.e...,"Tue, 18 Jul 2000 08:32:00 -0700 (PDT)",aduncan@kilstock.com,jeffrey.m.keenan@enron.com,CPCN and followup,"""Duncan, Allyson"" <aduncan@kilstock.com>","""'Keenan Jeffrey'"" <Jeffrey.M.Keenan@enron.com>",\Kay_Mann_June2001_1\Notes Folders\All documents,MANN-K,kmann.nsf,"Jeffrey, after our discussion yesterday regard...",
35680,/Users/liqi/Desktop/IND5003/5003grp/maildir/ka...,Message-ID: <25824784.1075856373727.JavaMail.e...,"Mon, 27 Nov 2000 03:34:00 -0800 (PST)",vince.kaminski@enron.com,vkaminski@aol.com,Btu Weekly,Vince J Kaminski,vkaminski@aol.com,\Vincent_Kaminski_Jun2001_2\Notes Folders\Disc...,Kaminski-V,vkamins.nsf,---------------------- Forwarded by Vince J Ka...,
488574,/Users/liqi/Desktop/IND5003/5003grp/maildir/ke...,Message-ID: <30286700.1075846190439.JavaMail.e...,"Wed, 8 Nov 2000 01:16:00 -0800 (PST)",steven.kean@enron.com,maureen.mcvicker@enron.com,"Order issued by FERC , EL00-95-000",Steven J Kean,Maureen McVicker,\Steven_Kean_Dec2000_1\Notes Folders\All docum...,KEAN-S,skean.nsf,print\n----- Forwarded by Steven J Kean/NA/Enr...,
121540,/Users/liqi/Desktop/IND5003/5003grp/maildir/ri...,Message-ID: <29790296.1075858647879.JavaMail.e...,"Mon, 22 Oct 2001 21:00:26 -0700 (PDT)",no.address@enron.com,,New Link for All-Employee Meeting,Public Relations@ENRON,All Enron Worldwide@ENRON,\RRING (Non-Privileged)\Deleted Items,Ring-R,RRING (Non-Privileged)1.pst,Attached is a new link for employees unable to...,


In [35]:
# drop unimportant data columns and empty cells
df.drop(columns = ['file', 'message'], inplace=True)
df.dropna(axis = 0, inplace=True)
df.sample(5)

Unnamed: 0,Date,From,To,Subject,X-From,X-To,X-Folder,X-Origin,X-Filename,Body,Employee
357946,"Mon, 26 Nov 2001 13:44:15 -0800 (PST)",djcustomclips@djinteractive.com,1529@wctopics.djnr.com,Enron Corp.: CORRECT:Enron Employees' Lawyer B...,djcustomclips@djinteractive.com,1529@WCTOPICS.djnr.com,"\RSHAPIRO (Non-Privileged)\Shapiro, Richard\De...",Shapiro-R,RSHAPIRO (Non-Privileged).pst,"CORRECT:Enron Employees' Lawyer Based In D.C.,...",
36793,"Tue, 22 Feb 2000 18:37:00 -0800 (PST)",vince.kaminski@enron.com,vkaminski@aol.com,"March 2, NYMEX night at the rodeo",Vince J Kaminski,vkaminski@aol.com,\Vincent_Kaminski_Jun2001_7\Notes Folders\Disc...,Kaminski-V,vkamins.nsf,---------------------- Forwarded by Vince J Ka...,
481673,"Tue, 26 Sep 2000 06:32:00 -0700 (PDT)",maureen.mcvicker@enron.com,katherine.brown@enron.com,"Re: EXECUTIVE COMMITTEE MEETING - MONDAY, OCTO...",Maureen McVicker,Katherine Brown,\Steven_Kean_Dec2000_1\Notes Folders\Archiving...,KEAN-S,skean.nsf,STEVE KEAN\n\n\n\n\n\n\tKatherine Brown\n\t09/...,
5315,"Tue, 20 Nov 2001 13:47:13 -0800 (PST)",scott.tholan@enron.com,john.lavorato@enron.com,Resend - Competitive Analysis,"Tholan, Scott </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","Lavorato, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","\JLAVORA (Non-Privileged)\Lavorato, John\Delet...",Lavorato-J,JLAVORA (Non-Privileged).pst,"John,\nAttached is my prioritized recommendati...",
233697,"Fri, 19 Oct 2001 13:51:20 -0700 (PDT)",mark.whitt@enron.com,"steve.walton@enron.com, paul.kaufman@enron.com...",Buffalo Power Company LLC - Gas Turbine Power ...,"Whitt, Mark </O=ENRON/OU=NA/CN=RECIPIENTS/CN=M...","Walton, Steve </O=ENRON/OU=NA/CN=RECIPIENTS/CN...","\BTYCHOL (Non-Privileged)\Tycholiz, Barry\Dele...",TYCHOLIZ-B,BTYCHOL (Non-Privileged).pst,ENA and Crestone (Northern Border's non-regula...,


In [36]:
df.Employee.value_counts()[:20]

Employee
    495547
Name: count, dtype: int64

In [37]:
# changing the date format and type from string to date object
df['Date'] = pd.to_datetime(df['Date'], utc = True, dayfirst = True)

  df['Date'] = pd.to_datetime(df['Date'], utc = True, dayfirst = True)


In [None]:
# creating a year column from the date column
df['Year'] = pd.DatetimeIndex(df['Date']).year

In [None]:
df.Year.value_counts()

In [None]:
# a function for cleaning text in columns

def extract(col):
    regcol = []
    for sent in col.values:
        
        if str(sent).startswith('<'):  
            reg = re.split(r'@|\(', str(sent))[0]
            reg = re.findall(r'[a-zA-Z]+\'?-?', str(reg))           
            
        elif re.match(r'^\d+', str(sent)):
            reg = re.split(r'@|\(', str(sent))[0]
            reg = re.findall(r'[0-9]+\'?-?', str(reg))
           
        else:
            reg = re.split(r'@|<|\(', str(sent))[0]
            reg = re.findall(r'[a-zA-Z]+\'?-?', str(reg))
            
        reg = re.sub(r'[\'\",]', '', str(reg))
        regcol.append(str(reg.strip('[]')))
           
            
    return regcol

In [None]:
df['X-From'] = extract(df['X-From'])
df['X-To'] = extract(df['X-To'])

In [None]:
df['X-From'].value_counts()[:20]

In [None]:
df['X-To'].value_counts()[:20]

In [None]:
df.sample(5)

In [None]:
df.info()

In [None]:
# make a diagram to show the count of messages sent per year
ax = df.groupby(df.Year)['X-Origin'].count().plot(figsize = (8,6))
ax.set_xlim(1995,2005)
ax.set_xlabel('Year', fontsize = 16)
ax.set_ylabel('Count', fontsize = 16)
ax.set_title('Messages sent across the years', fontsize = 16)
plt.show()

In [None]:
# diagram for showing top senders
plt.figure(figsize = (16,10))
ax = sns.barplot(x = df['X-Origin'].value_counts().values[:30], y = df['X-Origin'].value_counts().keys()[:30] ,palette = 'rocket', orient = 'h')
ax.set_xlabel('Count', fontsize = 18)
ax.set_ylabel('Original Senders', fontsize = 18)
ax.set_title('Top 30 Senders', fontsize = 20)
plt.show()

In [None]:
# diagram showing top recepients
plt.figure(figsize = (16,10))
ax = sns.barplot(x = df['To'].value_counts().values[:30], y = df['To'].value_counts().keys()[:30] ,palette = 'crest', orient = 'h')
ax.set_xlabel('Count', fontsize = 18)
ax.set_ylabel('Recepient E-Mail', fontsize = 18)
ax.set_title('Top 30 Recepient E-Mails', fontsize = 20)
plt.show()

In [None]:
userlist = df['X-Origin'].unique()

In [None]:
useryears = []
for user in userlist:
    year = df.loc[df['X-Origin'] == user, 'Year'].iloc[0]
    useryears.append(year)
    useryears

In [None]:
usercount = []
counter = 0
for name in userlist:
    for user in df['X-Origin'].values:
        if user == name:
            counter += 1
    usercount.append(counter)
    counter = 0

In [None]:
dataf = sorted(list(zip(userlist, usercount, useryears)), key= lambda user: user[1], reverse = True)[:30]

In [None]:
dataf = pd.DataFrame(dataf)
dataf.columns = ['User', 'Count', 'Year']
dataf.head()

In [None]:
# diagram showing top users and count of mails sent per year
plt.figure(figsize = (20,16))
ax = sns.barplot(data = dataf, x ='User', y = 'Count', hue = 'Year', palette = 'viridis' , saturation = 0.7, width = 1.5)
sns.despine()
plt.xlabel('User', fontsize = 18)
plt.ylabel('Count', fontsize = 18)
plt.title('User sent mails/year', fontsize = 22)
plt.xticks(rotation = 45, fontsize = 12)
plt.yticks(fontsize = 12)
plt.legend(fontsize = 14)
plt.show()

In [None]:
# a network of first 2500 rows showing the network of Allen P
plt.figure(figsize = (20, 20))
G = nx.from_pandas_edgelist(df[:2500], 'X-Origin', 'X-To')
pos = nx.draw_random(G, node_size = 50, node_color = 'blue', edge_color = 'salmon', with_labels = True)
plt.title('Network of Emails (First 2500)', fontsize = 24)
plt.show()

**Machine Learning and NLP on the dataset**

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
newstopwords = ['Re', 'FW', 'Fwd', 'EOL', 'E', 'mail', 'PLEASE', 'Ahead']
for i in newstopwords:
    stopwords.append(i)

In [None]:
subjects = ' '.join(df['Subject'].values)
fig, ax = plt.subplots(figsize=(14, 10))
wc = wordcloud.WordCloud(width = 800, height = 600, max_words = 200, stopwords = stopwords).generate(subjects)
ax.imshow(wc)
plt.axis('off')
plt.show()

In [None]:
content = ' '.join(df['Body'].sample(5000).values)
fig, ax = plt.subplots(figsize=(14, 10))
wc = wordcloud.WordCloud(width = 800, height = 600, max_words = 300, stopwords = stopwords).generate(content)
ax.imshow(wc)
plt.axis('off')
plt.show()

In [None]:
nltk.download('wordnet')

In [None]:
nltk.download('punkt')

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
# function for cleaning the body text
def cleaningbody(col):
    msgcol = []
    for msg in col.values:
        msg = re.sub(r'[<>\n+\t+\s+\*]', ' ', msg)
        msg = re.sub(r'[0-9]+[a-zA-Z]+\d+[?!].DOC', ' ', msg)
        msg = re.sub(r'[?\s+\-+\s+?_=~]', ' ', msg)
        msg = re.sub(r' +', ' ', msg)
        msg = msg.lower().strip(' ')
        msgcol.append(msg)
    return msgcol
df['Body'] = cleaningbody(df['Body'])

In [None]:
# preparing the lemmatization function
lemmatizer = WordNetLemmatizer()

In [None]:
# Tokenizing and lemmatizing the text to prepare for classification and sentiment analysis
text = []
for msg in df['Body'].values:
    msg_tokens = word_tokenize(msg)
    msg_tokens = [token.lower() for token in msg_tokens if token.isalpha()]
    msg_tokens = [word for word in msg_tokens if not word in stopwords]
    msg_tokens = [lemmatizer.lemmatize(word) for word in msg_tokens]
    
    text.append(msg_tokens)

In [None]:
text = [' '.join(message) for message in text]

In [None]:
# vectorizing the data using Tfidfvectorizer
vectorizer = TfidfVectorizer(min_df = 5, max_features = 5000, stop_words = stopwords, norm = 'l1')
data = vectorizer.fit_transform(text)

In [None]:
# Normalizing the data
data_norm = normalize(data)

In [None]:
print(data_norm.shape)

In [None]:
# Decomposition of the data and decreasing alot of features
svd = TruncatedSVD(n_components = 2, n_iter = 10, random_state = 42)
datasvd = svd.fit_transform(data_norm)

In [None]:
datasvd.shape

In [None]:
# Use the Elbow method to define the optimal number of clusters for kmeans clustering
max_iter = 1000
sumsquares = []
number_clusters = range(1,11)
for i in number_clusters:
    kmeans = KMeans(n_clusters = i, max_iter = max_iter, n_init = 'auto')
    kmeans.fit(datasvd)
    sumsquares.append(kmeans.inertia_)
plt.figure(figsize = (8,6))
plt.plot(number_clusters, sumsquares)
plt.xlabel('Clusters', fontsize = 14)
plt.ylabel('Sum of Squared Distances', fontsize = 14)
plt.title('Elbow Method', fontsize = 16)
plt.show()

In [None]:
n_clusters = 6
clf = KMeans(n_clusters = n_clusters,init = 'random', max_iter = max_iter, tol = 0.0001, algorithm = 'lloyd', n_init = 'auto', random_state = 42)
fittedkmeans = clf.fit_predict(datasvd)
centroids = clf.cluster_centers_

In [None]:
# A diagram showing the clusters
plt.figure(figsize = (8,6))
plt.scatter(datasvd[:,0], datasvd[:,1], c = fittedkmeans, s = 50, cmap = 'viridis', alpha = 0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], s = 150, c = 'black', alpha = 0.8)
plt.annotate('Cluster 0', xy = (centroids[0][0], centroids[0][1]), xytext = (centroids[0][0] + 0.02, centroids[0][1] + 0.03), color = 'white', fontsize = 12)
plt.annotate('Cluster 1', xy = (centroids[1][0], centroids[1][1]), xytext = (centroids[1][0] + 0.02, centroids[1][1] + 0.02), color = 'white', fontsize = 12)
plt.annotate('Cluster 2', xy = (centroids[2][0], centroids[2][1]), xytext = (centroids[2][0] - 0.05, centroids[2][1] - 0.06), color = 'white', fontsize = 12)
plt.annotate('Cluster 3', xy = (centroids[3][0], centroids[3][1]), xytext = (centroids[3][0] - 0.01, centroids[3][1] - 0.07), color = 'white', fontsize = 12)
plt.annotate('Cluster 4', xy = (centroids[4][0], centroids[4][1]), xytext = (centroids[4][0] - 0.03, centroids[4][1] + 0.04), color = 'white', fontsize = 12)
plt.annotate('Cluster 5', xy = (centroids[5][0], centroids[5][1]), xytext = (centroids[5][0] - 0.03, centroids[5][1] + 0.03), color = 'white', fontsize = 12)

plt.show()

In [None]:
# extract top words in every cluster using the inverse_transform method
original_space_centroids = svd.inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1] #(10,5000)
terms = vectorizer.get_feature_names_out()

for i in range(n_clusters):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :20]:
        print(f"{terms[ind]} ", end="")
        
    print()

***Sentiment Analysis***

In [None]:
# praeparing the sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [None]:
df['sentiment scores'] = [analyzer.polarity_scores(message) for message in text]
df.sample(5)

In [None]:
# Determine the positive and negative scores
sentiment = []
for i in df['sentiment scores'].values:
    if i['compound'] > 0.05:
        sentiment.append('Positive')
    elif i['compound'] < 0.05:
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
df['Sentiment'] = sentiment

In [None]:
sentiment_values = df['Sentiment'].value_counts()
sentiment_values

In [None]:
# a graph for the sentiment as a funnel shaped graph
fig = px.funnel(sentiment_values)
fig.show()

In [None]:
# defining another function for sentiment analysis with different emotions using NRCLex lexicon
def sentiment(message):
    text = NRCLex(message)
    if text.top_emotions[0][1] == 0.0:
        return 'No Emotion'
    else:
        return text.top_emotions[0][0]
df['Emotions'] =  df['Body'].apply(sentiment)
df.sample(5)

In [None]:
emotions = df['Emotions'].value_counts()
emotion_chart = pd.DataFrame(emotions)
emotion_chart = emotion_chart.drop('No Emotion', axis = 0)
emotion_chart

In [None]:
# A pie chart for showing the percentage of every emotion in the text
labels = emotion_chart.index.tolist()
plt.figure(figsize = (11,11))
plt.pie(emotion_chart['Emotions'].values, labels = labels , autopct = '%1.1f%%', labeldistance= 1.1)
plt.title('Emotions', fontsize = 14)
plt.legend(loc = 'upper right')
plt.show()