################################################################################################################################
#                     Mid Term: Author-Vaishali Lambe, NUID-001286444                         #
################################################################################################################################

# Question 1:

**Analysis 3**
#### Who emailed whom the most?

################################################################################################################################

In [1]:
# Import the modules we need.
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import datetime
from glob import glob
import json
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
import os
import re
import requests
import string
import sys
from timeit import default_timer as timer

# Path to the raw data: the root directory of the Enron mail dump.
# expanduser() replaces the leading "~" with the current user's home directory.
raw_data_path = os.path.expanduser("~/midterm/data/enron/maildir")
print(raw_data_path)

# Path to the preprocessed data file (may or may not exist).
# It holds the JSON-encoded result of a previous run, so the raw dump
# does not have to be scanned again.
preprocessed_data_path = os.path.expanduser("~/midterm/data/enron/preprocessed_analysis3.json")
print(preprocessed_data_path)

# Check to see if the file is already there.
def have_preprocessed_data():
    """Report whether the preprocessed data file is present on disk.

    Returns:
        bool: True when ``preprocessed_data_path`` names an existing regular
        file; False when it is missing or is something else (e.g. a directory).
    """
    cache_is_present = os.path.isfile(preprocessed_data_path)
    return cache_is_present

# Report up front whether this run will restore cached results or rescan the raw dump.
print("Preprocessed data? " + str(have_preprocessed_data()))
    
# Make sure the stopwords corpus is available.
# NOTE(review): no visible cell in this section reads the stopwords corpus;
# confirm a later cell needs it before keeping this download.
nltk.download('stopwords')

C:\Users\Admin/midterm/data/enron/maildir
C:\Users\Admin/midterm/data/enron/preprocessed_analysis3.json
Preprocessed data? True
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Get the home directories in the Enron mail dump.
def get_enron_home_dirs():
    """List the entries directly under the Enron mail dump root.

    The names found in ``raw_data_path`` are used as the usernames of the
    mailbox owners. The working directory is changed temporarily so that the
    returned names are relative to the dump root, and is always restored
    before returning.

    Returns:
        list[str]: The names matched by ``glob('*')`` inside
        ``raw_data_path`` (per the glob documentation, names starting with a
        dot are not matched).

    Raises:
        OSError: If ``raw_data_path`` cannot be entered with ``os.chdir``.
    """
    # Save the working directory (for later restoration).
    saved_path = os.getcwd()
    print(saved_path)

    # glob('*') is evaluated relative to the working directory, so step into
    # the dump root to get bare entry names back.
    os.chdir(raw_data_path)
    try:
        result = glob('*')
    finally:
        # Restore the working directory even if the listing is interrupted,
        # so the rest of the notebook never runs from inside the mail dump.
        os.chdir(saved_path)
    print(os.getcwd())

    print("found " + str(len(result)) + " home directories:")
    print(result[0:50])

    return result

# Find the recipients for a given piece of email.
def find_recipients(filepath):
    """Return the recipients listed on the ``X-To:`` header of one email file.

    Only the header section is read: scanning stops at the first blank line,
    which separates the headers from the body of the email.

    Args:
        filepath: Path to the email file. It is joined onto ``raw_data_path``,
            so a relative path is resolved against the dump root while an
            absolute path is used as given.

    Returns:
        list[str]: The lower-cased, whitespace-stripped entries of the first
        ``X-To:`` header, split on commas, with empty entries dropped.
        An empty list when the path is not a regular file, when the headers
        contain no ``X-To:`` line, or when that line names nobody.
    """
    # Resolve the path once so the existence check and open() look at the
    # same file (previously the check used the path relative to the cwd).
    full_path = os.path.join(raw_data_path, filepath)
    if not os.path.isfile(full_path):
        return []

    with open(full_path, mode="rt", encoding="ISO-8859-1") as f:
        # There's a blank line between the headers and the body of the email.
        # Start in the headers.
        for line in f:
            trimmed = line.strip().lower()
            if not trimmed:
                # Now we are in the body of the email.
                break

            # We've got a header line: check to see if it's the X-To: line.
            if trimmed.startswith('x-to:'):
                # NOTE: every comma is treated as a separator, so a display
                # name written as "last, first" is counted as two entries.
                entries = trimmed.replace('x-to:', '', 1).split(',')
                # Drop blanks so an empty header (or a stray comma) does not
                # register as a recipient named ''.
                return [entry.strip() for entry in entries if entry.strip()]

    return []

In [3]:
# Find the people that this person emailed the most.
def find_email_buddies(u, verbose=False):
    """Summarise how much mail one user sent and to how many recipients.

    Looks at every entry in the ``sent`` and ``sent_items`` folders of the
    user's home directory and collects the recipients that
    ``find_recipients`` reports for each of them.

    Args:
        u: Name of the user's home directory inside ``raw_data_path``.
        verbose: When True, print intermediate progress details.

    Returns:
        dict: A summary with the keys
        ``'name'`` (the value of ``u``),
        ``'sent-mail'`` (number of entries found in the two sent folders),
        ``'recipients'`` (total recipient entries, duplicates included) and
        ``'unique-recipients'`` (number of distinct recipient entries).

    Raises:
        OSError: If the user's home directory cannot be entered with
            ``os.chdir``.
    """
    if verbose:
        print("finding email buddies for: " + u)

    home_path = os.path.join(raw_data_path, u)
    if verbose:
        print(home_path)

    # Save the current working directory.
    saved_wd = os.getcwd()

    # The glob patterns are relative, so they are evaluated from inside the
    # user's home directory.
    os.chdir(home_path)
    try:
        sent_items_list = glob("sent/*")
        sent_items_list.extend(glob("sent_items/*"))
    finally:
        # Restore the working directory even if the listing is interrupted,
        # so later users are not resolved from the wrong directory.
        os.chdir(saved_wd)

    if verbose:
        print(sent_items_list[0:10])

    count = len(sent_items_list)
    if verbose:
        print("{name} sent {count} pieces of email".format(name=u, count=count))

    # Process each document.
    result = []
    for f in sent_items_list:
        result.extend(find_recipients(os.path.join(home_path, f)))

    recipient_count = len(result)
    if verbose:
        print("{name} sent email to {count} recipients".format(name=u, count=recipient_count))

    unique_recipient_count = len(set(result))
    if verbose:
        print("{name} sent email to {count} unique recipients".format(name=u, count=unique_recipient_count))

    summary = {'name':u, 'sent-mail':count, 'recipients':recipient_count, 'unique-recipients':unique_recipient_count}

    # It's useful to always print this one as a progress check.
    print(summary)

    return summary

In [4]:
# Create the email recipient list.
def create_email_recipient_list():
    """Build the per-user summaries by scanning the raw mail dump.

    Returns:
        list: One ``find_email_buddies`` summary for every entry returned by
        ``get_enron_home_dirs``, in the same order.
    """
    return [find_email_buddies(user) for user in get_enron_home_dirs()]

# Restore from a preprocessed file.
def restore_email_recipient_list():
    """Load the recipient summaries from the preprocessed data file.

    Returns:
        The decoded JSON content of ``preprocessed_data_path`` (the list
        written by ``save_email_recipient_list``), or an empty list when the
        file does not contain valid JSON.

    Raises:
        OSError: If the file cannot be opened.
    """
    print("Restoring from: " + preprocessed_data_path)

    with open(preprocessed_data_path, 'rt') as f:
        try:
            return json.load(f)
        except ValueError as err:
            # json.JSONDecodeError is a ValueError. Keep the best-effort
            # fallback, but say so: otherwise a damaged cache is
            # indistinguishable from a dump that contains no users.
            print("Could not parse " + preprocessed_data_path + ": " + str(err))
            return []

# Save data as a preprocessed file.
def save_email_recipient_list(list_to_save):
    """Write the recipient summaries to the preprocessed data file as JSON.

    Any existing file at ``preprocessed_data_path`` is overwritten.

    Args:
        list_to_save: JSON-serialisable data to store.
    """
    destination = preprocessed_data_path
    print("Saving email recipient list to: " + destination)
    with open(destination, 'wt') as out_file:
        json.dump(list_to_save, out_file)

In [5]:
# Reuse the preprocessed summaries when the file exists; otherwise build
# them from the raw mail dump and save them for the next run.
if have_preprocessed_data():
    recipient_info = restore_email_recipient_list()
else:
    recipient_info = create_email_recipient_list()
    save_email_recipient_list(recipient_info)

# Order the summaries in place, from the most to the fewest sent emails.
recipient_info.sort(key=lambda entry: entry['sent-mail'], reverse=True)

# Show the top 20 most prolific emailers.
print(recipient_info[:20])

Restoring from: C:\Users\Admin/midterm/data/enron/preprocessed_analysis3.json
[{'sent-mail': 10732, 'recipients': 68764, 'unique-recipients': 1897, 'name': 'dasovich-j'}, {'sent-mail': 10318, 'recipients': 12898, 'unique-recipients': 1581, 'name': 'kaminski-v'}, {'sent-mail': 9412, 'recipients': 13758, 'unique-recipients': 747, 'name': 'mann-k'}, {'sent-mail': 8814, 'recipients': 13452, 'unique-recipients': 1056, 'name': 'shackleton-s'}, {'sent-mail': 8246, 'recipients': 28354, 'unique-recipients': 1167, 'name': 'jones-t'}, {'sent-mail': 6938, 'recipients': 16940, 'unique-recipients': 899, 'name': 'germany-c'}, {'sent-mail': 5518, 'recipients': 7506, 'unique-recipients': 458, 'name': 'lenhart-m'}, {'sent-mail': 4818, 'recipients': 7980, 'unique-recipients': 924, 'name': 'taylor-m'}, {'sent-mail': 4704, 'recipients': 6648, 'unique-recipients': 806, 'name': 'perlingiere-d'}, {'sent-mail': 4350, 'recipients': 6490, 'unique-recipients': 844, 'name': 'nemec-g'}, {'sent-mail': 4070, 'recipie