In [1]:
# %load "baseline.py"
import random
import operator
import pandas as pd
from collections import Counter

path_to_data = "./"

##########################
# load some of the files #                           
##########################

training = pd.read_csv(path_to_data + 'training_set.csv', sep=',', header=0)

training_info = pd.read_csv(path_to_data + 'training_info.csv', sep=',', header=0)

test = pd.read_csv(path_to_data + 'test_set.csv', sep=',', header=0)

In [2]:
training.head()

Unnamed: 0,sender,mids
0,karen.buckley@enron.com,158713 158697 200301 158679 278595 298162 2002...
1,amr.ibrahim@enron.com,215241 3437 215640 3506 191790 3517 3520 3562 ...
2,andrea.ring@enron.com,270705 270706 270707 270708 270709 270710 2707...
3,sylvia.hu@enron.com,111444 111422 183084 111412 111347 110883 1105...
4,phillip.platter@enron.com,327074 327384 327385 264443 274124 274125 2741...


In [3]:
training_info.head()

Unnamed: 0,mid,date,body,recipients
0,60,2000-07-25 08:14:00,Legal has been assessing the risks of doing bl...,robert.badeer@enron.com murray.o neil@enron.co...
1,66,2000-08-03 02:56:00,Attached is a spreadsheet to estimate export f...,kim.ward@enron.com robert.badeer@enron.com mur...
2,74,2000-08-15 05:37:00,Kevin/Bob: Here is a quick rundown on the cons...,robert.badeer@enron.com john.massey@enron.com ...
3,80,2000-08-20 14:12:00,check this out and let everyone know what s up...,robert.badeer@enron.com jeff.richter@enron.com
4,83,2000-08-22 08:17:00,Further to your letter to us (addressed to Mr....,pgillman@schiffhardin.com kamarlantes@calpx.co...


In [4]:
test.head()

Unnamed: 0,sender,mids
0,karen.buckley@enron.com,298389 332383 298390 284071 366982 81773 81791...
1,amr.ibrahim@enron.com,48260 48465 50344 48268 50330 48237 189979 189...
2,andrea.ring@enron.com,366364 271168 271172 271167 271189
3,sylvia.hu@enron.com,134931 134856 233549 233517 134895 233584 3736...
4,phillip.platter@enron.com,274220 274225 274215 274223 274214 274207 2742...


In [5]:
training.size

250

In [6]:
################################
# create some handy structures #                    
################################
                            
# convert training set to dictionary
emails_ids_per_sender = {}
for index, series in training.iterrows():
    row = series.tolist()
    sender = row[0]
    ids = row[1:][0].split(' ')
    emails_ids_per_sender[sender] = ids

# save all unique sender names
all_senders = emails_ids_per_sender.keys()

# create address book with frequency information for each user
address_books = {}
i = 0

for sender, ids in emails_ids_per_sender.items():
    recs_temp = []
    for my_id in ids:
        recipients = training_info[training_info['mid']==int(my_id)]['recipients'].tolist()
        recipients = recipients[0].split(' ')
        # keep only legitimate email addresses
        recipients = [rec for rec in recipients if '@' in rec]
        recs_temp.append(recipients)
    # flatten    
    recs_temp = [elt for sublist in recs_temp for elt in sublist]
    # compute recipient counts
    rec_occ = dict(Counter(recs_temp))
    # order by frequency
    sorted_rec_occ = sorted(rec_occ.items(), key=operator.itemgetter(1), reverse = True)
    # save
    address_books[sender] = sorted_rec_occ
    
    if i % 10 == 0:
        print(i)
    i += 1
  
# save all unique recipient names    
all_recs = list(set([elt[0] for sublist in address_books.values() for elt in sublist]))

# save all unique user names 
all_users = []
all_users.extend(all_senders)
all_users.extend(all_recs)
all_users = list(set(all_users))

0
10
20
30
40
50
60
70
80
90
100
110
120


In [7]:
#############
# baselines #                           
#############

# will contain email ids, predictions for random baseline, and predictions for frequency baseline
predictions_per_sender = {}

# number of recipients to predict
k = 10

for index, row in test.iterrows():
    name_ids = row.tolist()
    sender = name_ids[0]
    # get IDs of the emails for which recipient prediction is needed
    ids_predict = name_ids[1].split(' ')
    ids_predict = [int(my_id) for my_id in ids_predict]
    random_preds = []
    freq_preds = []
    # select k most frequent recipients for the user
    k_most = [elt[0] for elt in address_books[sender][:k]]
    for id_predict in ids_predict:
        # select k users at random
        random_preds.append(random.sample(all_users, k))
        # for the frequency baseline, the predictions are always the same
        freq_preds.append(k_most)
    predictions_per_sender[sender] = [ids_predict,random_preds,freq_preds]	

#################################################
# write predictions in proper format for Kaggle #                           
#################################################

path_to_results = "./"

with open(path_to_results + 'predictions_random.txt', 'w') as my_file:
    my_file.write('mid,recipients' + '\n')
    for sender, preds in predictions_per_sender.items():
        ids = preds[0]
        random_preds = preds[1]
        for index, my_preds in enumerate(random_preds):
            my_file.write(str(ids[index]) + ',' + ' '.join(my_preds) + '\n')

with open(path_to_results + 'predictions_frequency.txt', 'w') as my_file:
    my_file.write('mid,recipients' + '\n')
    for sender, preds in predictions_per_sender.items():
        ids = preds[0]
        freq_preds = preds[2]
        for index, my_preds in enumerate(freq_preds):
            my_file.write(str(ids[index]) + ',' + ' '.join(my_preds) + '\n')

In [8]:
training_info.shape

(43613, 4)

In [15]:
training_info["body"]

0        Legal has been assessing the risks of doing bl...
1        Attached is a spreadsheet to estimate export f...
2        Kevin/Bob: Here is a quick rundown on the cons...
3        check this out and let everyone know what s up...
4        Further to your letter to us (addressed to Mr....
5        The new version of the EnronOnline website is ...
6        Check this out.  I think that we need to be si...
7        We have had some confusion recently with respe...
8        I will be traveling to Calgary on Tuesday and ...
9        Please note that the EnronOnline Phase 2 train...
10       Recently, there have been several questions re...
11       Please see the attached.  Christian, could you...
12       i would categorize things in the following man...
13       put this into the congestion redesign file if ...
14       Alaska Producers Say Gas Pipeline Would Be Too...
15       Canadian Arctic Exploration Moves Full Steam A...
16       Producers Indicate North Slope s Beaufort Rout.

In [16]:
print(training_info["body"][43584])

X-FileName: fyi-----Original Message-----From: Ingersoll, Richard Sent: Tuesday, September 25, 2001 8:16 AMTo: Belden, TimSubject: FW: Standards Announcement-----Original Message-----From: elder@wscc.com [mailto:elder@wscc.com]Sent: Monday, September 24, 2001 4:06 PMTo: WSCC Standards AnnouncementSubject: Standards AnnouncementWSCC STANDARDS CORRESPONDENTSWESTERN MARKET INTERFACE COMMITTEECOMPLIANCE PROCESS TASK FORCEOPERATIONS COMMITTEE AND   OC SUBCOMMITTEESPLANNING COORDINATION COMMITTEE AND  PCC SUBCOMMITTEESIn accordance with the WSCC "Process for Developing and Approving WSCC Standards." the "Procedures for Regional Planning Project Review and Rating Transmission Facilities" has been posted for approval on the WSCC Web site (http://www.wscc.com/files/newrppr6.pdf). Approval of the document will be requested at the October 25-26, 2001 PCC meeting.LINDA ELDERADMINISTRATIVE COORDINATORPH 801-582-0353FX 801-582-3918EM elder@wscc.com


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
import re

Help on package sklearn:

NAME
    sklearn

DESCRIPTION
    Machine learning module for Python
    
    sklearn is a Python module integrating classical machine
    learning algorithms in the tightly-knit world of scientific Python
    packages (numpy, scipy, matplotlib).
    
    It aims to provide simple and efficient solutions to learning problems
    that are accessible to everybody and reusable in various contexts:
    machine-learning as a versatile tool for science and engineering.
    
    See http://scikit-learn.org for complete documentation.

PACKAGE CONTENTS
    __check_build (package)
    _build_utils (package)
    _isotonic
    base
    calibration
    cluster (package)
    covariance (package)
    cross_decomposition (package)
    cross_validation
    datasets (package)
    decomposition (package)
    discriminant_analysis
    dummy
    ensemble (package)
    exceptions
    externals (package)
    feature_extraction (package)
    feature_selection (package)
    gaussian_

In [21]:
example = training_info.body[42]
print(example)

Note:  Good update on U.S. and Canadian supply trends. -----Original Message-----From: CERA Webmaster [mailto:webmaster@CERA.com]Sent: Wednesday, July 18, 2001 11:19 AMTo: InsightsSubject: North American Gas Productive Capacity Is Expanding - CERA Decision Brief Title: Natural Gas Productive Capacity Outlook in North America--How Fast Can It Grow? URL(s): <http://www20.cera.com/eprofile?u=35&m=2564> *********************************************************************** NORTH AMERICAN GAS PRODUCTIVE CAPACITY IS EXPANDING The effect of record gas-related drilling is becoming increasingly evident in both the United States and Canada. Key themes explored in this Decision Brief include *  the extent of the rebound in the US lower-48 and Canadian gas supply *  the outlook for US lower-48 and Canadian supply to 2005 *  underlying trends likely to constrain production growth *  the sources of new supply   *  uncertainties in the outlook through 2005: upside potential and downside threats *  p

In [23]:
from bs4 import BeautifulSoup             

print(BeautifulSoup(example).get_text())

Note:  Good update on U.S. and Canadian supply trends. -----Original Message-----From: CERA Webmaster [mailto:webmaster@CERA.com]Sent: Wednesday, July 18, 2001 11:19 AMTo: InsightsSubject: North American Gas Productive Capacity Is Expanding - CERA Decision Brief Title: Natural Gas Productive Capacity Outlook in North America--How Fast Can It Grow? URL(s):  *********************************************************************** NORTH AMERICAN GAS PRODUCTIVE CAPACITY IS EXPANDING The effect of record gas-related drilling is becoming increasingly evident in both the United States and Canada. Key themes explored in this Decision Brief include *  the extent of the rebound in the US lower-48 and Canadian gas supply *  the outlook for US lower-48 and Canadian supply to 2005 *  underlying trends likely to constrain production growth *  the sources of new supply   *  uncertainties in the outlook through 2005: upside potential and downside threats *  possible effect of the decline in gas prices 



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [34]:
import string
import re 
import itertools
import igraph
import nltk
import operator
from nltk.corpus import stopwords
# requires nltk 3.2.1
from nltk import pos_tag


def clean_text_simple(text, remove_stopwords=True, pos_filtering=True, stemming=True):
    
    punct = string.punctuation.replace('-', '')
    
    # convert to lower case
    text = text.lower()
    # remove punctuation (preserving intra-word dashes)
    text = ''.join(l for l in text if l not in punct)
    # strip extra white space
    text = re.sub(' +',' ',text)
    # strip leading and trailing white space
    text = text.strip()
    # tokenize (split based on whitespace)
    ### fill the gap ###
    tokens = text.split(' ')
    if pos_filtering == True:
        # apply POS-tagging
        tagged_tokens = pos_tag(tokens)
        # retain only nouns and adjectives
        tokens_keep = []
        for i in range(len(tagged_tokens)):
            item = tagged_tokens[i]
            if (
            item[1] == 'NN' or
            item[1] == 'NNS' or
            item[1] == 'NNP' or
            item[1] == 'NNPS' or
            item[1] == 'JJ' or
            item[1] == 'JJS' or
            item[1] == 'JJR'
            ):
                tokens_keep.append(item[0])
        tokens = tokens_keep
    if remove_stopwords:
        stpwds = stopwords.words('english')
        # remove stopwords
        tokens = [t for t in tokens if t not in stpwds]
    if stemming:
        stemmer = nltk.stem.PorterStemmer()
        # apply Porter's stemmer
        tokens_stemmed = list()
        for token in tokens:
            tokens_stemmed.append(stemmer.stem(token))
        tokens = tokens_stemmed

    return(tokens)

In [37]:
print(example)

Note:  Good update on U.S. and Canadian supply trends. -----Original Message-----From: CERA Webmaster [mailto:webmaster@CERA.com]Sent: Wednesday, July 18, 2001 11:19 AMTo: InsightsSubject: North American Gas Productive Capacity Is Expanding - CERA Decision Brief Title: Natural Gas Productive Capacity Outlook in North America--How Fast Can It Grow? URL(s): <http://www20.cera.com/eprofile?u=35&m=2564> *********************************************************************** NORTH AMERICAN GAS PRODUCTIVE CAPACITY IS EXPANDING The effect of record gas-related drilling is becoming increasingly evident in both the United States and Canada. Key themes explored in this Decision Brief include *  the extent of the rebound in the US lower-48 and Canadian gas supply *  the outlook for US lower-48 and Canadian supply to 2005 *  underlying trends likely to constrain production growth *  the sources of new supply   *  uncertainties in the outlook through 2005: upside potential and downside threats *  p

In [39]:
clean_text_simple(example, stemming=True)

['good',
 'updat',
 'canadian',
 'suppli',
 '-----origin',
 'message-----from',
 'cera',
 'webmast',
 'mailtowebmasterceracoms',
 'wednesday',
 'amto',
 'insightssubject',
 'north',
 'american',
 'ga',
 'product',
 'capac',
 'cera',
 'decis',
 'brief',
 'titl',
 'natur',
 'ga',
 'product',
 'capac',
 'outlook',
 'north',
 'america--how',
 'fast',
 'url',
 'httpwww20ceracomeprofileu35m2564',
 'north',
 'american',
 'ga',
 'product',
 'capac',
 'effect',
 'record',
 'gas-rel',
 'drill',
 'evid',
 'unit',
 'state',
 'canada',
 'key',
 'theme',
 'decis',
 'brief',
 'extent',
 'rebound',
 'lower-48',
 'canadian',
 'ga',
 'suppli',
 'outlook',
 'lower-48',
 'canadian',
 'suppli',
 'trend',
 'like',
 'product',
 'growth',
 'sourc',
 'new',
 'suppli',
 'uncertainti',
 'outlook',
 'upsid',
 'potenti',
 'downsid',
 'threat',
 'possibl',
 'effect',
 'declin',
 'ga',
 'price',
 'url',
 'complet',
 'cera',
 'decis',
 'brief',
 'page',
 'e-mail',
 'categori',
 'decis',
 'brief',
 'cera',
 'knowledg'

In [30]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [40]:
from nltk.corpus import stopwords

In [45]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '