In [2]:
#!/usr/bin/python

""" 
    Starter code for exploring the Enron dataset (emails + finances);
    loads up the dataset (pickled dict of dicts).

    The dataset has the form:
    enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict }

    {features_dict} is a dictionary of features associated with that person.
    You should explore features_dict as part of the mini-project,
    but here's an example to get you started:

    enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000
    
"""

import pickle

# Pickle streams are binary data: open the file in "rb" mode and let the
# context manager close the handle instead of leaking it.
# NOTE(review): pickle.load can execute arbitrary code -- only load dataset
# files that come from a trusted source.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = pickle.load(dataset_file)




In [3]:
# Number of entries in the dataset (one top-level key per record,
# nominally one per person).
len(enron_data)

146

In [4]:
# Number of features stored for one entry, measured on a sample person.
# len() of a dict is already its key count, so no .keys() call is needed.
len(enron_data['SKILLING JEFFREY K'])

21

In [5]:
# List the feature names available for a sample person.
list(enron_data['SKILLING JEFFREY K'])

['salary',
 'to_messages',
 'deferral_payments',
 'total_payments',
 'exercised_stock_options',
 'bonus',
 'restricted_stock',
 'shared_receipt_with_poi',
 'restricted_stock_deferred',
 'total_stock_value',
 'expenses',
 'loan_advances',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'poi',
 'director_fees',
 'deferred_income',
 'long_term_incentive',
 'email_address',
 'from_poi_to_this_person']

In [6]:
# The 'poi' feature is a boolean flag marking a person of interest.
enron_data['SKILLING JEFFREY K']['poi']

True

In [7]:
# Collect the names of all persons of interest (POIs), then count them.
poi_name = [name for name in enron_data if enron_data[name]['poi'] == 1]
count_poi = len(poi_name)
print(count_poi)
print(poi_name)

18
['HANNON KEVIN P', 'COLWELL WESLEY', 'RIEKER PAULA H', 'KOPPER MICHAEL J', 'SHELBY REX', 'DELAINEY DAVID W', 'LAY KENNETH L', 'BOWEN JR RAYMOND M', 'BELDEN TIMOTHY N', 'FASTOW ANDREW S', 'CALGER CHRISTOPHER F', 'RICE KENNETH D', 'SKILLING JEFFREY K', 'YEAGER F SCOTT', 'HIRKO JOSEPH', 'KOENIG MARK E', 'CAUSEY RICHARD A', 'GLISAN JR BEN F']


In [8]:
# What is the total value of the stock belonging to James Prentice?
enron_data['PRENTICE JAMES']['total_stock_value']

1095040

In [9]:
# Number of email messages sent by Wesley Colwell to persons of interest.
enron_data['COLWELL WESLEY']['from_this_person_to_poi']
    
                          

11

In [10]:
# What is the value of the stock options exercised by Jeffrey Skilling?
enron_data['SKILLING JEFFREY K']['exercised_stock_options']

19250000

In [11]:
# Total payments recorded for Jeffrey Skilling.
enron_data['SKILLING JEFFREY K']['total_payments']

8682716

In [12]:
# Total payments recorded for Kenneth Lay.
enron_data['LAY KENNETH L']['total_payments']

103559793

In [13]:
# Total payments recorded for Andrew Fastow.
enron_data['FASTOW ANDREW S']['total_payments']

2424083

In [14]:
# Inspect one full record to see how unfilled features are encoded:
# missing values appear as the string 'NaN', not as a float NaN or None.
enron_data['COLWELL WESLEY']

{'bonus': 1200000,
 'deferral_payments': 27610,
 'deferred_income': -144062,
 'director_fees': 'NaN',
 'email_address': 'wes.colwell@enron.com',
 'exercised_stock_options': 'NaN',
 'expenses': 16514,
 'from_messages': 40,
 'from_poi_to_this_person': 240,
 'from_this_person_to_poi': 11,
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 101740,
 'poi': True,
 'restricted_stock': 698242,
 'restricted_stock_deferred': 'NaN',
 'salary': 288542,
 'shared_receipt_with_poi': 1132,
 'to_messages': 1758,
 'total_payments': 1490344,
 'total_stock_value': 698242}

In [15]:
# How many people in this dataset have a quantified (integer) salary?
count_salary = sum(
    1 for features in enron_data.values() if type(features['salary']) == int
)
print(count_salary)
        

95


In [16]:
# How many people in this dataset have a known email address?
# An unknown address is stored as the string 'NaN'.
count_email_address = sum(
    1 for features in enron_data.values() if features['email_address'] != 'NaN'
)
print(count_email_address)


111


In [17]:
# How many people in the E+F dataset (as it currently exists) have "NaN"
# for their total payments?
count_total_payments_NaN = sum(
    1 for features in enron_data.values() if features['total_payments'] == 'NaN'
)
print(count_total_payments_NaN)

# What percentage of people in the dataset as a whole is this?
# Derived from the computed count and the dataset size rather than from
# hard-coded literals, so it stays correct if the data changes. `//` keeps the
# whole-number percentage regardless of how the interpreter treats `/`.
percent_total_payments_NaN = count_total_payments_NaN * 100 // len(enron_data)
print("% of people have NaN for the total payments: " + str(percent_total_payments_NaN) + "%")

21
% of people have NaN for the total payments: 14%


In [18]:
# How many POIs in the E+F dataset have "NaN" for their total payments?
count_total_payments_NaN_poi = sum(
    1
    for features in enron_data.values()
    if features['poi'] == 1 and features['total_payments'] == 'NaN'
)
print(count_total_payments_NaN_poi)
# What percentage of POIs as a whole is this?

0


In [19]:
# If you added, say, 10 more data points which were all POIs, and put "NaN" for
# the total payments of those folks, the numbers just calculated would change.
# What is the new number of people in the dataset? What is the new number of
# folks with "NaN" for total payments?

# New "NaN" count: the 10 hypothetical additions plus the count measured from
# the data (replaces the hard-coded 21).
10 + count_total_payments_NaN

31

In [20]:
# What is the new number of POIs in the dataset? What percentage of them have
# "NaN" for their total payments?
# 10 hypothetical new POIs on top of the POIs counted from the data
# (replaces the hard-coded 18).
new_POI = 10 + count_poi


In [21]:
# Whole-number share of the enlarged POI group made up by the 10 added
# "NaN" entries; `//` states the integer division explicitly.
print("% of POI people have NaN for the total payments: " + str(10 * 100 // new_POI) + "%")

% of POI people have NaN for the total payments: 35%


In [22]:
#Once the new data points are added, do you think a supervised classification algorithm might interpret “NaN” for total stock value as a clue that someone is a POI?

In [23]:
# How many entries have "NaN" for their 'long_term_incentive' feature?
count_total_incentives_NaN = sum(
    1
    for features in enron_data.values()
    if features['long_term_incentive'] == 'NaN'
)
print(count_total_incentives_NaN)
# What percentage of POIs as a whole is this?

80


In [24]:
# Gather every integer salary (skipping 'NaN' placeholders) and print them in
# ascending order so the extremes are easy to spot.
salary_list = [
    enron_data[name]['salary']
    for name in enron_data
    if type(enron_data[name]['salary']) == int
]
print(sorted(salary_list))

[477, 6615, 63744, 76399, 80818, 85274, 94941, 96840, 130724, 158403, 162779, 170941, 174246, 182245, 184899, 187922, 192008, 197091, 199157, 201955, 206121, 210500, 210692, 211788, 211844, 213625, 213999, 216582, 221003, 222093, 224305, 229284, 231330, 231946, 236457, 239502, 239671, 240189, 243293, 247338, 248017, 248146, 248546, 249201, 250100, 251654, 257486, 259996, 261516, 261809, 261879, 262663, 262788, 263413, 265214, 267093, 267102, 269076, 271442, 272880, 273746, 274975, 275101, 278601, 278601, 288542, 288558, 288589, 304110, 304588, 309946, 314288, 317543, 329078, 330546, 339288, 349487, 357091, 365038, 365163, 365788, 370448, 374125, 404338, 415189, 420636, 428780, 440698, 492375, 510364, 655037, 1060932, 1072321, 1111258, 26704229]


In [25]:
# Gather every integer bonus (skipping 'NaN' placeholders) and print them in
# ascending order so the extremes are easy to spot.
bonus_list = [
    enron_data[name]['bonus']
    for name in enron_data
    if type(enron_data[name]['bonus']) == int
]
print(sorted(bonus_list))

[70000, 100000, 100000, 200000, 200000, 200000, 250000, 250000, 300000, 300000, 300000, 325000, 325000, 325000, 350000, 350000, 400000, 400000, 400000, 400000, 425000, 450000, 500000, 500000, 509870, 600000, 600000, 600000, 600000, 600000, 600000, 650000, 700000, 700000, 700000, 700000, 700000, 750000, 750000, 750000, 750000, 788750, 800000, 800000, 800000, 800000, 850000, 850000, 850000, 900000, 900000, 1000000, 1000000, 1000000, 1000000, 1000000, 1000000, 1100000, 1100000, 1150000, 1200000, 1200000, 1250000, 1300000, 1350000, 1500000, 1500000, 1700000, 1750000, 2000000, 2000000, 2500000, 2600000, 3000000, 3000000, 3100000, 4175000, 5249999, 5600000, 7000000, 8000000, 97343619]


In [26]:
# Find which entry carries the largest value in the sorted salary list.
for name, features in enron_data.items():
    if features['salary'] == 26704229:
        print(name)

TOTAL


The outlier is the 'TOTAL' entry, which looks like an aggregate row rather than an individual person.

In [31]:
# Find which entry has the second-smallest salary from the sorted list.
for name, features in enron_data.items():
    if features['salary'] == 6615:
        print(name)

GRAY RODNEY


In [34]:
# Dump every top-level key to look for entries that are not real people.
print(list(enron_data))

['METTS MARK', 'BAXTER JOHN C', 'ELLIOTT STEVEN', 'CORDES WILLIAM R', 'HANNON KEVIN P', 'MORDAUNT KRISTINA M', 'MEYER ROCKFORD G', 'MCMAHON JEFFREY', 'HORTON STANLEY C', 'PIPER GREGORY F', 'HUMPHREY GENE E', 'UMANOFF ADAM S', 'BLACHMAN JEREMY M', 'SUNDE MARTIN', 'GIBBS DANA R', 'LOWRY CHARLES P', 'COLWELL WESLEY', 'MULLER MARK S', 'JACKSON CHARLENE R', 'WESTFAHL RICHARD K', 'WALTERS GARETH W', 'WALLS JR ROBERT H', 'KITCHEN LOUISE', 'CHAN RONNIE', 'BELFER ROBERT', 'SHANKMAN JEFFREY A', 'WODRASKA JOHN', 'BERGSIEKER RICHARD P', 'URQUHART JOHN A', 'BIBI PHILIPPE A', 'RIEKER PAULA H', 'WHALEY DAVID A', 'BECK SALLY W', 'HAUG DAVID L', 'ECHOLS JOHN B', 'MENDELSOHN JOHN', 'HICKERSON GARY J', 'CLINE KENNETH W', 'LEWIS RICHARD', 'HAYES ROBERT E', 'MCCARTY DANNY J', 'KOPPER MICHAEL J', 'LEFF DANIEL P', 'LAVORATO JOHN J', 'BERBERIAN DAVID', 'DETMERING TIMOTHY J', 'WAKEHAM JOHN', 'POWERS WILLIAM', 'GOLD JOSEPH', 'BANNANTINE JAMES M', 'DUNCAN JOHN H', 'SHAPIRO RICHARD S', 'SHERRIFF JOHN R', 'SHELBY 

In [35]:
# Inspect the record keyed 'THE TRAVEL AGENCY IN THE PARK', a key that does
# not follow the "LASTNAME FIRSTNAME MIDDLEINITIAL" pattern.
enron_data['THE TRAVEL AGENCY IN THE PARK']

{'bonus': 'NaN',
 'deferral_payments': 'NaN',
 'deferred_income': 'NaN',
 'director_fees': 'NaN',
 'email_address': 'NaN',
 'exercised_stock_options': 'NaN',
 'expenses': 'NaN',
 'from_messages': 'NaN',
 'from_poi_to_this_person': 'NaN',
 'from_this_person_to_poi': 'NaN',
 'loan_advances': 'NaN',
 'long_term_incentive': 'NaN',
 'other': 362096,
 'poi': False,
 'restricted_stock': 'NaN',
 'restricted_stock_deferred': 'NaN',
 'salary': 'NaN',
 'shared_receipt_with_poi': 'NaN',
 'to_messages': 'NaN',
 'total_payments': 362096,
 'total_stock_value': 'NaN'}

In [27]:
# Find who has the highest salary once the 'TOTAL' entry is set aside
# (the second-largest value in the sorted salary list).
for name, features in enron_data.items():
    if features['salary'] == 1111258:
        print(name)

SKILLING JEFFREY K


In [28]:
# Find who has the second-largest value in the sorted bonus list
# (the largest bonus below the top outlier value).
for name, features in enron_data.items():
    if features['bonus'] == 8000000:
        print(name)

LAVORATO JOHN J


In [29]:
#!/usr/bin/python

###
### in poiFlagEmail() below, write code that returns a boolean
### indicating if a given email is from a POI
###

import sys
import reader
import poi_emails

def getToFromStrings(f):
    '''
    Return the (to, from, cc) e-mail address collections for the e-mail file f.

    The file is rewound to its start first. Per the original notes,
    reader.getAddresses() reads the opening lines of an e-mail to find the
    To:, From: and CC: strings, and reader.parseAddresses() extracts the
    e-mail addresses from each of those strings as a list.
    '''
    f.seek(0)
    to_header, from_header, cc_header = reader.getAddresses(f)

    return (
        reader.parseAddresses(to_header),
        reader.parseAddresses(from_header),
        reader.parseAddresses(cc_header),
    )


### POI flag an email

def poiFlagEmail(f):
    """ given an email file f,
        return a trio of booleans (to_poi, from_poi, cc_poi) for whether
        that email is to, from, or cc'ing a poi

        Each flag is computed independently of the others. The previous
        version guarded the cc loop with `not to_poi`, so cc_poi could never
        become True for an email that was also addressed to a POI.
    """

    to_emails, from_emails, cc_emails = getToFromStrings(f)

    ### poi_emails.poiEmails() returns a list of all POIs' email addresses
    ### (per the original starter-code notes).
    poi_email_list = poi_emails.poiEmails()

    def _contains_poi(addresses):
        # An empty or missing address collection simply yields False;
        # any() stops at the first POI address found.
        return any(address in poi_email_list for address in (addresses or []))

    to_poi = _contains_poi(to_emails)
    from_poi = _contains_poi(from_emails)
    cc_poi = _contains_poi(cc_emails)

    return to_poi, from_poi, cc_poi

ImportError: No module named reader

In [None]:
import pickle
from get_data import getData

def computeFraction( poi_messages, all_messages ):
    """ given a number of messages to/from POI (numerator)
        and number of all messages to/from a person (denominator),
        return the fraction of messages to/from that person
        that are from/to a POI

        The same code serves both directions:
            the fraction of all messages to this person that come from POIs
            or
            the fraction of all messages from this person that are sent to POIs

        Returns 0. when either argument is the string "NaN" (no filled email
        features) or when all_messages is 0, so an empty mailbox yields a
        zero fraction instead of raising ZeroDivisionError.
    """
    if poi_messages == "NaN" or all_messages == "NaN":
        return 0.
    if all_messages == 0:
        return 0.

    # float() on the numerator avoids integer (floor) division.
    return float(poi_messages) / all_messages


data_dict = getData()

# For every entry, derive the two POI-interaction fractions, print them
# (preceded by a blank separator line), store them back on the entry, and
# record them in submit_dict under the original feature names.
submit_dict = {}
for name in data_dict:
    data_point = data_dict[name]

    fraction_from_poi = computeFraction(data_point["from_poi_to_this_person"],
                                        data_point["to_messages"])
    fraction_to_poi = computeFraction(data_point["from_this_person_to_poi"],
                                      data_point["from_messages"])

    print("")
    print(fraction_from_poi)
    print(fraction_to_poi)

    data_point["fraction_from_poi"] = fraction_from_poi
    data_point["fraction_to_poi"] = fraction_to_poi
    submit_dict[name] = {
        "from_poi_to_this_person": fraction_from_poi,
        "from_this_person_to_poi": fraction_to_poi,
    }
    
    
#####################

def submitDict():
    """ return the module-level submit_dict, which maps each name to its
        two computed POI message fractions """
    return submit_dict
