################################################################################################################################
#                     Mid Term: Author-Vaishali Lambe, NUID-001286444                         #
################################################################################################################################

# Question 1:

**Analysis 2**
#### Who were the people with the most emails?
################################################################################################################################

In [1]:
# Import the modules we need.
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import datetime
from glob import glob
import json
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
import os
import re
import requests
import string
import sys
from timeit import default_timer as timer

# Locations used throughout the analysis:
#   raw_data_path          - root of the raw Enron mail dump.
#   preprocessed_data_path - cached analysis result (may or may not exist).
raw_data_path = os.path.expanduser("~/midterm/data/enron/maildir")
preprocessed_data_path = os.path.expanduser("~/midterm/data/enron/preprocessed_analysis2.json")

# Echo both resolved locations so the run shows which files are in play.
print(raw_data_path)
print(preprocessed_data_path)

# Check to see if the file is already there.
def have_preprocessed_data():
    """Return True when ``preprocessed_data_path`` names an existing file."""
    cache_exists = os.path.isfile(preprocessed_data_path)
    return cache_exists

# Report whether the cached result file is present at the time this cell runs.
print("Preprocessed data? " + str(have_preprocessed_data()))
    
# Make sure the stopwords corpus is available.
# This call is the last expression of the cell, so the notebook also echoes
# its return value (the `True` that follows the [nltk_data] log lines).
# NOTE(review): the stopwords corpus is not used in the visible part of this
# analysis - confirm whether the download is still needed.
nltk.download('stopwords')

C:\Users\Admin/midterm/data/enron/maildir
C:\Users\Admin/midterm/data/enron/preprocessed_analysis2.json
Preprocessed data? True
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Get the home directories in the Enron mail dump.
def get_enron_home_dirs():
    """Return the names of the entries directly inside ``raw_data_path``.

    Every entry of the mail dump's top-level directory is treated as one
    user's home directory, so the entry names double as usernames.  The
    listing is obtained by temporarily switching the process working
    directory to ``raw_data_path`` and globbing ``*`` there; entries whose
    name starts with a dot are therefore not returned.

    Progress information (working directory before/after, number of
    entries found and the first 50 names) is printed as a side effect.

    Returns:
        list[str]: Entry names relative to ``raw_data_path``, in the order
        produced by ``glob``.

    Raises:
        OSError: If ``raw_data_path`` cannot be entered.
    """
    # Save the working directory (for later restoration).
    saved_path = os.getcwd()
    print(saved_path)

    # Use the contents of the directory as a way to get the usernames.
    os.chdir(raw_data_path)
    try:
        result = glob('*')
    finally:
        # Always restore the working directory, even when the listing is
        # interrupted or fails; otherwise every later relative path in the
        # session would silently resolve inside the mail dump.
        os.chdir(saved_path)
    print(os.getcwd())

    print("found " + str(len(result)) + " home directories:")
    print(result[0:50])

    return result

In [3]:
# How many items of email does each user have in their mail directory?

# Create the mapping from a user to the number of email items they have.
def create_user_email_list():
    """Build the per-user item counts by scanning the raw mail directories.

    For every home directory reported by ``get_enron_home_dirs()`` the
    function switches into that directory, globs ``**/**`` and records the
    number of matches.

    Returns:
        list[dict]: One ``{'user': <directory name>, 'emails': <count>}``
        entry per home directory, in the order the directories were listed.

    Raises:
        OSError: If a home directory cannot be entered.
    """
    result = []

    enron_home_dirs = get_enron_home_dirs()

    # The directory to return to is the same for every user, so look it up
    # once instead of on each iteration.
    saved_wd = os.getcwd()

    for u in enron_home_dirs:
        home_path = os.path.join(raw_data_path, u)

        os.chdir(home_path)
        try:
            # NOTE(review): without recursive=True, "**" behaves like "*",
            # so this matches entries exactly one folder below the home
            # directory (sub-folders at that level are counted, anything
            # nested deeper is not).  Kept as-is so the counts stay
            # comparable with already-saved results - confirm this is the
            # intended definition of "emails".
            user_emails = glob("**/**")
        finally:
            # Restore the working directory even if the scan is interrupted
            # (it can run for a long time) or fails, so the session is never
            # left inside some user's mail folder.
            os.chdir(saved_wd)

        result.append({'user': u, 'emails': len(user_emails)})

    return result
    
# Restore from a preprocessed file.
def restore_user_email_list():
    """Load the saved user/email list from ``preprocessed_data_path``.

    Returns:
        The decoded JSON content of the file, or an empty list when
        decoding raises ``ValueError`` (for example an empty or malformed
        file).
    """
    print("Restoring from: " + preprocessed_data_path)

    with open(preprocessed_data_path, mode='rt') as cache_file:
        try:
            return json.load(cache_file)
        except ValueError:
            # Unreadable cache content is treated as "nothing cached".
            return []

# Save data as a preprocessed file.
def save_user_email_list(list_to_save):
    """Write *list_to_save* as JSON to ``preprocessed_data_path``.

    The target file is opened for writing, so existing content is replaced.

    Args:
        list_to_save: JSON-serialisable data to store.
    """
    print("Saving user email list to: " + preprocessed_data_path)

    with open(preprocessed_data_path, mode='wt') as cache_file:
        json.dump(list_to_save, fp=cache_file)

In [4]:
# Does the preprocessed file exist?
# Reuse the saved counts when they are available; otherwise scan the raw
# mail dump and save the result for the next run.
user_email_list = []
if not have_preprocessed_data():
    user_email_list = create_user_email_list()
    save_user_email_list(user_email_list)
else:
    user_email_list = restore_user_email_list()

# Largest counts first, then show the leading 20 entries.
user_email_list.sort(key=lambda entry: entry['emails'], reverse=True)
print(user_email_list[:20])

Restoring from: C:\Users\Admin/midterm/data/enron/preprocessed_analysis2.json
[{'user': 'dasovich-j', 'emails': 56457}, {'user': 'kaminski-v', 'emails': 53355}, {'user': 'mann-k', 'emails': 46762}, {'user': 'jones-t', 'emails': 39900}, {'user': 'shackleton-s', 'emails': 37374}, {'user': 'kean-s', 'emails': 32764}, {'user': 'farmer-d', 'emails': 25850}, {'user': 'taylor-m', 'emails': 24230}, {'user': 'germany-c', 'emails': 23941}, {'user': 'beck-s', 'emails': 23261}, {'user': 'nemec-g', 'emails': 21310}, {'user': 'symes-k', 'emails': 19485}, {'user': 'scott-s', 'emails': 16044}, {'user': 'rogers-b', 'emails': 16018}, {'user': 'bass-e', 'emails': 15633}, {'user': 'sanders-r', 'emails': 14531}, {'user': 'campbell-l', 'emails': 12979}, {'user': 'guzman-m', 'emails': 12108}, {'user': 'shapiro-r', 'emails': 12100}, {'user': 'lay-k', 'emails': 11874}]


In [5]:
# Pretty-print the 50 users with the most emails.
# Heading followed by one line per user, taken from the front of the
# (already sorted) list.
print("\nThe 50 users with the most emails:")
print("===================================================================================")
for entry in user_email_list[:50]:
    line = "{name} had {count} emails".format(name=entry['user'], count=entry['emails'])
    print(line)


The 50 users with the most emails:
dasovich-j had 56457 emails
kaminski-v had 53355 emails
mann-k had 46762 emails
jones-t had 39900 emails
shackleton-s had 37374 emails
kean-s had 32764 emails
farmer-d had 25850 emails
taylor-m had 24230 emails
germany-c had 23941 emails
beck-s had 23261 emails
nemec-g had 21310 emails
symes-k had 19485 emails
scott-s had 16044 emails
rogers-b had 16018 emails
bass-e had 15633 emails
sanders-r had 14531 emails
campbell-l had 12979 emails
guzman-m had 12108 emails
shapiro-r had 12100 emails
lay-k had 11874 emails
lenhart-m had 11840 emails
lokay-m had 11134 emails
haedicke-m had 10481 emails
sager-e had 10385 emails
love-p had 10004 emails
arnold-j had 9796 emails
fossum-d had 9592 emails
perlingiere-d had 9556 emails
lavorato-j had 9370 emails
mcconnell-m had 8895 emails
giron-d had 8440 emails
skilling-j had 8259 emails
shankman-j had 7679 emails
hain-m had 7640 emails
delainey-d had 7132 emails
williams-w3 had 6880 emails
whalley-l had 6670 emails
