# Analysis 2

#### Who were the people with the most emails?

In [None]:
# Import the modules we need.
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import datetime
from glob import glob
import json
import nltk
from nltk import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
import os
import re
import requests
import string
import sys
from timeit import default_timer as timer

# Locations of the Enron data, with "~" expanded to the user's home directory.
#   raw_data_path          - root of the raw mail dump.
#   preprocessed_data_path - preprocessed data file for this analysis
#                            (may or may not exist).
raw_data_path = os.path.expanduser("~/midterm/data/enron/maildir")
preprocessed_data_path = os.path.expanduser("~/midterm/data/enron/preprocessed_analysis2.json")

# Echo both locations so the output records which paths are in use.
print(raw_data_path)
print(preprocessed_data_path)

# Report whether the preprocessed data file is already on disk.
def have_preprocessed_data():
    """Return True when preprocessed_data_path refers to an existing file."""
    already_there = os.path.isfile(preprocessed_data_path)
    return already_there

# Show whether a preprocessed data file was found.
print("Preprocessed data? {}".format(have_preprocessed_data()))

# Make sure the stopwords corpus is available.
nltk.download('stopwords')

In [None]:
# Get the home directories in the Enron mail dump.
def get_enron_home_dirs():
    """Return the names of the per-user directories under raw_data_path.

    The directory names double as usernames. Entries whose names start
    with "." are not matched by the glob pattern, and entries that are not
    directories are filtered out so callers only receive real home
    directories. Names are returned sorted so the result does not depend on
    filesystem listing order.

    The working directory is changed while listing and is always restored,
    even if the listing itself fails.

    Returns:
        A sorted list of directory names (not full paths).

    Raises:
        OSError: if raw_data_path cannot be entered (for example, it does
            not exist).
    """
    # Save the working directory (for later restoration).
    saved_path = os.getcwd()
    print(saved_path)

    # Use the contents of the directory as a way to get the usernames.
    os.chdir(raw_data_path)
    try:
        result = sorted(name for name in glob('*') if os.path.isdir(name))
    finally:
        # Restore the working directory no matter what happened above.
        os.chdir(saved_path)
    print(os.getcwd())

    print("found " + str(len(result)) + " home directories:")
    print(result[0:50])

    return result

In [None]:
# How many items of email does each user have in their mail directory?

# Create the mapping from a user to the number of email items they have.
def create_user_email_list():
    """Count the email files in every user's mail directory.

    Every regular file found anywhere below a user's home directory is
    counted as one email item. Sub-folders themselves are not counted, and
    mail stored in nested folders is included. The working directory is
    never changed.

    Returns:
        A list of {'user': <home directory name>, 'emails': <file count>}
        dicts, one per entry returned by get_enron_home_dirs().
    """
    result = []
    for u in get_enron_home_dirs():
        home_path = os.path.join(raw_data_path, u)
        result.append({'user': u, 'emails': _count_email_files(home_path)})
    return result


def _count_email_files(home_path):
    """Return the number of non-hidden files anywhere under home_path.

    Names starting with "." (files and directories alike) are skipped.
    A home_path that is not a directory yields 0, because os.walk ignores
    listing errors by default.
    """
    count = 0
    for _dirpath, dirnames, filenames in os.walk(home_path):
        # Prune hidden directories in place so os.walk does not descend.
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        count += sum(1 for name in filenames if not name.startswith('.'))
    return count
    
# Restore from a preprocessed file.
def restore_user_email_list():
    """Load the user email list from the preprocessed JSON file.

    Returns:
        Whatever JSON value the file holds, or an empty list when the
        contents cannot be parsed (a ValueError while loading).
    """
    print("Restoring from: " + preprocessed_data_path)

    with open(preprocessed_data_path, 'rt') as cache_file:
        try:
            return json.load(cache_file)
        except ValueError:
            # Unparseable file: fall back to an empty list.
            return []

# Save data as a preprocessed file.
def save_user_email_list(list_to_save):
    """Write list_to_save as JSON to preprocessed_data_path.

    Any existing file at that path is overwritten.
    """
    destination = preprocessed_data_path
    print("Saving user email list to: " + destination)
    with open(destination, 'wt') as cache_file:
        json.dump(list_to_save, fp=cache_file)

In [None]:
# Does the preprocessed file exist? If so, start from it.
user_email_list = restore_user_email_list() if have_preprocessed_data() else []

# Nothing restored: either there was no preprocessed file, or it was empty or
# unparseable (restore_user_email_list returns [] in that case). Rebuild from
# the raw data and save it, rather than carrying on with an empty list.
if not user_email_list:
    user_email_list = create_user_email_list()
    save_user_email_list(user_email_list)

# Sort in descending order so the most prolific emailers are at the top.
user_email_list.sort(key=lambda x: x['emails'], reverse=True)
print(user_email_list[0:20])

In [None]:
# Pretty-print the 50 users with the most emails.
# (user_email_list is already sorted in descending order of email count.)
top_fifty = user_email_list[:50]
for entry in top_fifty:
    line = "{name} had {count} emails".format(name=entry['user'], count=entry['emails'])
    print(line)