# Enron email analysis

The Enron email dataset contains approximately 500,000 emails generated by employees of the Enron Corporation. It was obtained by the Federal Energy Regulatory Commission during its investigation of Enron's collapse.

In this example, we use the first 1,000 emails to perform sentiment analysis.

Data source: https://www.kaggle.com/datasets/wcukierski/enron-email-dataset 

In [1]:
import os
import sys
import pandas as pd
import nltk
import spacy
import string
import email
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# download the spaCy English model loaded below (run once)
#!python -m spacy download en_core_web_sm

In [3]:
# column order of the result table built from the per-email records
cols = [
    'subject',
    'sender',
    'receiver',
    'body',
    'tot_len',
    'positive words',
    'positive counts',
    'negative words',
    'negative counts',
]

# file names of the exported results
output_name_xls = 'enron_email_clean.xlsx'
output_name_csv = 'enron_email_clean.csv'

# number of emails to read from the CSV; use None to read the whole file
read_rows = 1000

In [4]:
# read emails data (only the first `read_rows` rows of the CSV)
data = pd.read_csv(
    "data/emails.csv",
    encoding="ISO-8859-1",
    delimiter=',',
    nrows=read_rows,
)

In [5]:
# (rows, columns) of the loaded frame
data.shape

(1000, 2)

In [6]:
# create list of email objects
# email.message_from_string(s) is the stdlib shortcut for Parser().parsestr(s);
# unlike `email.parser.Parser`, it does not rely on the `email.parser`
# submodule having already been imported by some other package
emails = [email.message_from_string(raw) for raw in data['message']]

# extract headings (eg: from, subject) -- taken from the first message only,
# so a header that does not appear there gets no column
headings = emails[0].keys()

# For each email, extract info for each key (None when a message lacks the header)
for x in headings:
    data[x] = [doc[x] for doc in emails]

In [7]:
# check data: first five rows, now with one extra column per parsed header
data.head(5)

Unnamed: 0,file,message,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf


In [8]:
# load the spaCy pipeline 'en_core_web_sm'; it is used further down for
# entity detection, lemmatization and the stop-word / punctuation / space flags
spacy_nlp = spacy.load('en_core_web_sm')

# create the VADER analyser; its polarity_scores(...)['compound'] value is
# what the notebook uses to score individual words and whole email bodies
analyser = SentimentIntensityAnalyzer()

# Functions

### Identify sender and receiver

In [9]:
# identify Sender
def get_sender_name(sender):
    """Turn a sender address into a readable name.

    The part before the first "@" is taken as the name. When that part
    contains a "." (not as its first character), the first two dot-separated
    pieces are joined with a space, e.g. "phillip.allen@enron.com" becomes
    "phillip allen". A value without an "@" after its first character is
    returned unchanged.

    Parameters
    ----------
    sender : str
        Content of the "From" header.

    Returns
    -------
    str
        The derived name, or `sender` itself when no address is recognised.
    """
    # no "@" at all, or "@" as the very first character: nothing to extract
    if sender.find("@") <= 0:
        return sender

    local_part = sender.split("@")[0]

    # no inner dot: the local part is the name
    if local_part.find(".") <= 0:
        return local_part

    pieces = local_part.split(".")
    return pieces[0] + " " + pieces[1]

In [10]:
# identify receiver
def get_receiver_name(receiver):
    """Turn the "To" header into readable recipient name(s).

    Each address is reduced to the part before its first "@"; when that part
    contains a "." the first two dot-separated pieces are joined with a
    space ("tim.belden@enron.com" -> "tim belden"). A comma-separated list is
    converted entry by entry and re-joined with ", ".

    Whitespace around list entries (including the newline/tab inserted by
    header folding) is stripped, and empty entries are dropped, so the result
    never contains doubled spaces or dangling separators.

    Parameters
    ----------
    receiver : str or None
        Content of the "To" header. Anything that is not a string (for
        example None for a message without a "To" header) is returned as is.

    Returns
    -------
    str or the original object
        The readable name(s), or `receiver` unchanged when it is not a
        string or holds a single value without an "@".
    """
    # Explicit replacement for the former bare `except:`: only non-string
    # input could fail below, and it is passed through untouched as before.
    if not isinstance(receiver, str):
        return receiver

    def _first_two(dotted):
        # "john.doe" -> "john doe"; text without an inner "." is kept as is
        if dotted.find(".") > 0:
            pieces = dotted.split(".")
            return pieces[0] + " " + pieces[1]
        return dotted

    # if there are multiple receivers
    if receiver.find(",") > 0:
        names = []
        for entry in receiver.split(","):
            entry = entry.strip()
            if not entry:
                continue
            if entry.find("@") > 0:
                names.append(_first_two(entry.split("@")[0]))
            elif entry.find(".") > 0:
                # list entries without a usable "@" are still split on "."
                names.append(_first_two(entry))
            else:
                names.append(entry.split("@")[0])
        return ", ".join(names)

    # single receiver: get the full name
    if receiver.find("@") > 0:
        return _first_two(receiver.split("@")[0])
    return receiver

### Extract body content

In [11]:
# get body content
def get_body_text(emails):
    """Return the concatenated ``text/plain`` payloads of one parsed message.

    Parameters
    ----------
    emails : email.message.Message
        A single parsed message (despite the plural name); it and all of its
        sub-parts are visited with ``walk()``.

    Returns
    -------
    str
        The payloads of every part whose content type is ``text/plain``,
        joined without a separator. Empty string when there is no such part.
    """
    plain_parts = [
        part.get_payload()
        for part in emails.walk()
        if part.get_content_type() == 'text/plain'
    ]
    return ''.join(plain_parts)

### Remove signature 

In [12]:
def remove_sign(body, sender_name):
    """Cut the body at the first occurrence of the sender's name.

    The search is an exact, case-sensitive substring match. Everything from
    the match onwards is dropped (treated as the signature). The body is
    returned unchanged when the name is not found or when the match is at
    the very start of the body.

    Parameters
    ----------
    body : str
        Email body text.
    sender_name : str
        Name to look for, as produced from the sender address.

    Returns
    -------
    str
        The body up to (not including) the sender's name, or the full body.
    """
    cut_at = body.find(sender_name)
    # a match at position 0 (or no match, -1) leaves the body untouched
    return body[:cut_at] if cut_at > 0 else body

### Remove entities

In [13]:
# remove entities
def remove_entity(nlpdoc):
    """Return the document's text with every named entity cut out.

    The text is taken from the document itself (``nlpdoc.text``) instead of
    a notebook-level variable, so the function works for any parsed document
    and does not depend on which cell ran last.

    Parameters
    ----------
    nlpdoc : spacy.tokens.Doc
        Parsed document; ``nlpdoc.ents`` supplies the entity spans and their
        character offsets into ``nlpdoc.text``.

    Returns
    -------
    str
        The original text minus the characters covered by the entities
        (surrounding whitespace is left in place).
    """
    stripped = nlpdoc.text
    # walk the entities back to front so that removing one span does not
    # shift the character offsets of the spans still to be processed
    for ent in reversed(nlpdoc.ents):
        stripped = stripped[:ent.start_char] + stripped[ent.end_char:]
    return stripped

# Preprocessing
Spacy? NLTK? BeautifulSoup for html? <br>
see: http://ai.intelligentonlinetools.com/ml/sentiment-analysis 

Removal
- remove entities (persons, city names, geographical places)
- remove stopwords (I, is, you, we, for, and, etc.)
- remove punctuation (!,_.?#;)
- remove spaces 
- lemmatization
https://medium.com/@makcedward/nlp-pipeline-stop-words-part-5-d6770df8a936

In [14]:
# Extract body from the email: one plain-text body string per parsed message
data['Body'] = [get_body_text(message) for message in emails]

In [15]:
email_data_list = []

# The loop variable is called `row`, not `email`, so that the imported
# `email` module is not overwritten (re-running the parsing cell would
# otherwise fail after this cell has run once).
for _, row in data.iterrows():
    # email data
    subject = row["Subject"]
    body = row["Body"]

    # identify sender and receiver
    sender_name = get_sender_name(row["From"])
    receiver_name = get_receiver_name(row["To"])

    # body text
    # remove signature
    body_new = remove_sign(body, sender_name)

    # remove entities, then re-parse the entity-free text
    body_remove_ent = remove_entity(spacy_nlp(body_new))
    doc = spacy_nlp(body_remove_ent)

    # get total length of the body (number of tokens after entity removal)
    tot_len = len(doc)

    # lemmatization, remove stop words, spaces and punctuation
    # token.text = original text, token.lemma_ = lemmatization
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    # extract positive and negative words; each word is scored once and
    # classified with the +/-0.05 cut-offs on VADER's compound score
    pos_words = []
    neg_words = []
    for word in tokens:
        compound = analyser.polarity_scores(word)['compound']
        if compound >= 0.05:
            pos_words.append(word)
        elif compound <= -0.05:
            neg_words.append(word)

    # same text format as before: every word is followed by ", "
    pos_word = "".join(word + ", " for word in pos_words)
    neg_word = "".join(word + ", " for word in neg_words)

    # append results (order must match `cols`)
    email_data_list.append([
        subject, sender_name, receiver_name, body_new, tot_len,
        pos_word, len(pos_words), neg_word, len(neg_words),
    ])

#dataframe
email_data_df = pd.DataFrame(email_data_list, columns=cols)

In [22]:
# calculate the sentiment score using VADER: compound score of each body,
# rounded to two decimals
polarity = [
    round(scores['compound'], 2)
    for scores in map(analyser.polarity_scores, email_data_df['body'])
]
email_data_df['sentiment_score'] = polarity

# determine overall sentiment based on the sign of the (rounded) score
email_data_df['overall_sentiment'] = [
    'Positive' if score > 0 else 'Negative' if score < 0 else 'Neutral'
    for score in email_data_df['sentiment_score']
]

In [23]:
# check cleaned data: first five rows of the per-email results, including the sentiment columns
email_data_df.head(5)

Unnamed: 0,subject,sender,receiver,body,tot_len,positive words,positive counts,negative words,negative counts,sentiment_score,overall_sentiment
0,,phillip allen,tim belden,Here is our forecast\n\n,5,,0,,0,0.0,Neutral
1,Re:,phillip allen,john lavorato,Traveling to have a business meeting takes the...,160,"fun, honest, desire, stimulate, well, play,",6,,0,0.93,Positive
2,Re: test,phillip allen,leah arsdall,test successful. way to go!!!,10,"successful,",1,,0,0.69,Positive
3,,phillip allen,randall gay,"Randy,\n\n Can you send me a schedule of the s...",41,,0,,0,0.0,Neutral
4,Re: Hello,phillip allen,greg piper,Let's shoot for Tuesday at 11:45.,8,,0,"shoot,",1,-0.34,Negative


### Save results to Excel and CSV

In [24]:
# make sure the target folder exists: to_excel / to_csv do not create it
os.makedirs('output', exist_ok=True)

# write the result table twice, once as Excel and once as CSV, without the index
email_data_df.to_excel('output/' + output_name_xls, index=False)
email_data_df.to_csv('output/' + output_name_csv, index=False)