In [None]:
## Model processing notebook
%pwd
#u'/media/alex/ssd/programs/Galvanize/00_project/Email'

In [None]:
import pandas as pd
import numpy as np
import json
from os import listdir
import os
import pickle
from multiprocessing import Pool

# Welcome to the Model Processing Notebook for EmailRank

## This notebook takes the processed data and trains the models used for classification. The models are then pickled for use in the Flask app.

### Steps here:

1. Import data from pickle file
1. Run text processing function to get feature matrix
1. Run cross-validation on a classifier




------------------------------
## Step 1

In [100]:
# Step 1

## Load the pickled email DataFrame produced by the Email processing workbook and
## print a summary of it (see the info() output below for its columns and row count).
## NOTE(review): unpickling can execute arbitrary code - only load pickle files you created yourself.

filepath = '../flask_app/data/data pickles/full_data.pkl'  ## relative path: resolved against the notebook's working directory
data = pd.read_pickle(filepath)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13373 entries, 0 to 13372
Data columns (total 3 columns):
FROM    13373 non-null object
TO      13373 non-null object
TEXT    13373 non-null object
dtypes: object(3)

In [95]:
## Keep only the first 10000 rows for the rest of the notebook.
## NOTE(review): presumably done to limit memory use and run time - confirm.
## This overwrites `data`; re-run the load cell to get the full frame back.
data = data[:10000]

In [68]:
## Number of rows for each distinct value of the TO column.
data[['TO','TEXT']].groupby('TO').count()

Unnamed: 0_level_0,TO,TEXT
TO,Unnamed: 1_level_1,Unnamed: 2_level_1
CEO,159,159
Director,177,177
Employee,1814,1814
In House Lawyer,18,18
Manager,502,502
Managing Director,209,209
President,467,467
Trader,296,296
Vice President,1317,1317
\N,41,41


In [10]:
## Number of rows for each distinct value of the FROM column.
data[['FROM','TEXT']].groupby('FROM').count()

Unnamed: 0_level_0,FROM,TEXT
FROM,Unnamed: 1_level_1,Unnamed: 2_level_1
CEO,113,113
Director,77,77
Employee,2305,2305
In House Lawyer,4,4
Manager,650,650
Managing Director,126,126
President,161,161
Trader,177,177
Vice President,1366,1366
\N,21,21


------------------------------
## Step 2

In [96]:
## Step 2 builds the text-processing pipeline: a custom class that bundles the tokenizer, the part-of-speech
## tagger, and a TF-IDF vectorizer fitted on the input data. The class is then pickled for use in the Flask app.


from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize, pos_tag


## UPENN tagset part-of-speech labels. The order of this list fixes the order of
## the part-of-speech feature columns (and therefore of their feature importances).
PARTSOFSPEECH = [
    'WRB', 'VBZ', 'DT', 'NN', '.', 'NNP', ',', 'PRP', 'VBD', 'TO', 'NNS',
    'IN', 'VBG', 'CC', 'RP', 'RB', 'MD', 'VB', 'VBP', 'JJ', ':', 'CD',
    'JJR', 'WDT', 'PRP$', 'VBN', 'POS', 'WP', 'JJS', '$', '-NONE-', 'NNPS',
    'EX',
]

def getPOS(x):
    """Return normalised part-of-speech counts for one tokenized email.

    Defined at module level so that it can be passed to
    multiprocessing.Pool.map by Processor.fit_transform.

    Parameters
    ----------
    x : list of str
        Tokens of a single email.

    Returns
    -------
    numpy.ndarray
        One float per entry of PARTSOFSPEECH, in the same order: the number
        of tokens tagged with that part of speech, divided by
        (L2 norm of the whole count vector + 1). Tags that are not listed in
        PARTSOFSPEECH are not counted.
    """
    ## Tag the email once and keep only the tag of each (token, tag) pair.
    tags = [tag for _, tag in pos_tag(x)]

    ## list.count avoids rebuilding a numpy array of the tags for every entry
    ## of PARTSOFSPEECH and is well defined when the email has no tokens.
    counts = np.array([tags.count(part) for part in PARTSOFSPEECH], dtype=float)

    ## The +1 keeps the denominator non-zero when no listed tag was found.
    return counts / (np.linalg.norm(counts, ord=2) + 1)



class Processor(object):
    ## Custom processor with fit, transform, and fit_transform methods
    ## use of the verbose flag will provide add'l details on train
    ## 

        
    def __init__(self):
        self.myVect = None
    
    def fit_transform(self, X, flag=True, verbose=True):
        ## Training command for the text processor. Used to fit the tokenizer and pos taggers
        ## Flag is used to identify when to fit the tokenizer and should only be called on a "fit" version 
        ## (e.g. not transform)
        
        if verbose:
            print 'Welcome to the fit_transform for the data'
        
        X = X.reset_index(drop=True)
        
        if verbose:
            print 'Base tfidf start:'
        ## TFIDF from sklearn
        if flag:
            self.myVect = TfidfVectorizer(ngram_range=(2,2))
            self.myVect.fit(X)
            if verbose:
                print 'training successful'
            
        output = pd.DataFrame(self.myVect.transform(X).toarray())
        if verbose:
            print 'transform successful, output = ', output.shape

        output.columns = self.myVect.vocabulary_
        ## Testing purposes
        output = pd.DataFrame([1]*X.shape[0])
        
        ## tokenize words
        if verbose:
            print 'Tokenizing text'
            
        tokens = X.map(word_tokenize)
        
        if verbose:
            print 'Tokenizing successful:', tokens.shape
        ## Count of words as feature
        output['word_count'] = tokens.map(len)
        
        
        ## Part of speech as feature (see getPOS for more detail)
        if verbose:
            print 'part of speech tagging'
        p = Pool(7)
        pos = pd.DataFrame(p.map(getPOS, tokens), columns=PARTSOFSPEECH)
        if verbose:
            print 'Tagging successful:', pos.shape
        output = pd.concat([output, pos], axis=1)
        
        
        return output
    
    def fit(self, *args):
        self.fit_transform(*args)
    
    def transform(self, X):
        output = self.fit_transform(X, flag=False)
        return output
    
    


------------------------------
## Step 3

In [98]:
## Step 3 is the model building and testing code. Below we perform cross validation on a model using the data
## The code is often commented to perform cross validation OR build the full model for app use based on system memory concerns

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score

myFold = KFold(len(myData.FROM), n_folds=5, shuffle=True)

accuracyScores = []

for trainIndex, testIndex in myFold:
    myModel = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    myProcessor = Processor()
    
    trainX = myProcessor.fit_transform(data.TEXT[trainIndex], verbose=True)
    trainY = myData.FROM[trainIndex]
    
    testX = myProcessor.transform(myData.TEXT[testIndex])
    testY = myData.FROM[testIndex]
    
    myModel.fit(trainX, trainY)
    predicts = myModel.predict(testX)
    
    accuracy = accuracy_score(testY, predicts)
    accuracyScores.append(accuracy)
    print 'Accuracy score: ', accuracy
    
print 'Overall Model accuracy: ', np.mean(accuracyScores)

# Full Rollout code. Commented out due to RAM concerns.

#Train Full Model
# modelTO = RandomForestClassifier(n_estimators=100, n_jobs=-1)
# modelFROM = RandomForestClassifier(n_estimators=100, n_jobs=-1)
# myProcessor = Processor()

# trainX = myProcessor.fit_transform(data.TEXT)
# trainFROM = myData.FROM
# trainTO = myData.TO

# testX = myProcessor.transform(myData.TEXT)
# testFROM = myData.FROM
# testTO = myData.TO

# modelFROM.fit(trainX, trainFROM)
# modelTO.fit(trainX, trainTO)
# predictFROM = modelFROM.predict(testX)
# predictTO = modelTO.predict(testX)

# accuracyFROM = accuracy_score(testFROM, predictFROM)
# accuracyTO = accuracy_score(testTO, predictTO)

print 'FROM Accuracy score: ', accuracyFROM
print 'TO Accuracy score: ', accuracyTO

Welcome to the fit_transform for the data
Base tfidf start:
training successful
transform successful, output =  (1600, 87653)
Welcome to the fit_transform for the data
Base tfidf start:
transform successful, output =  (400, 87653)
Accuracy score:  0.845
Welcome to the fit_transform for the data
Base tfidf start:
training successful
transform successful, output =  (1600, 88883)
Welcome to the fit_transform for the data
Base tfidf start:
transform successful, output =  (400, 88883)
Accuracy score:  0.8525
Welcome to the fit_transform for the data
Base tfidf start:
training successful
transform successful, output =  (1600, 85458)
Welcome to the fit_transform for the data
Base tfidf start:
transform successful, output =  (400, 85458)
Accuracy score:  0.8375
Welcome to the fit_transform for the data
Base tfidf start:
training successful
transform successful, output =  (1600, 90763)
Welcome to the fit_transform for the data
Base tfidf start:
transform successful, output =  (400, 90763)
Accur

In [9]:
## Save the model as a pickle for use in Flask

with open('FlaskApp/modelPickles/processor.pkl', 'w+') as f:
    pickle.dump(myProcessor, f)
    print 'Processor pickled!'
    
with open('FlaskApp/modelPickles/modelTO.pkl', 'w+') as f:
    pickle.dump(modelTO, f)
    print 'modelTO pickled!'
    
with open('FlaskApp/modelPickles/modelFROM.pkl', 'w+') as f:
    pickle.dump(modelFROM, f)
    print 'modelFROM pickled!'

Processor pickled!
modelTO pickled!
modelFROM pickled!


In [91]:
## Feature importances code

#print modelFROM.feature_importances_
myProcessor.myVect.vocabulary_

FEATURES = ['-NONE-', 'word_count', 'WRB', 'VBZ', 'DT', 'NN', '.', 'NNP', ',', 'PRP', 'VBD', 'TO',
       'NNS', 'IN', 'VBG', 'CC', 'RP', 'RB', 'MD', 'VB', 'VBP', 'JJ', ':',
       'CD', 'JJR', 'WDT', 'PRP$', 'VBN', 'POS', 'WP', 'JJS', '$',
       '-NONE-', 'NNPS', 'EX']

top_words = np.array(FEATURES)[np.argsort(modelFROM.feature_importances_)][::-1]
for feat, imp in zip(top_words, sorted(modelFROM.feature_importances_)[::-1]):
    print feat, imp, trainX.ix[51][feat]
# print '#### EOF ################' 
# top_words = np.array(FEATURES)[np.argsort(modelTO.feature_importances_)][::-1]
# for feat, imp in zip(top_words, sorted(modelTO.feature_importances_)[::-1]):
#     print feat, imp
print trainX.ix[51]

NNP 0.0542446494463 0.647077093554
word_count 0.0496102060904 223.0
IN 0.0467859161138 0.284082626438
. 0.0461399429888 0.0946942088128
: 0.0455195419086 0.17360604949
NN 0.0447952955396 0.17360604949
JJ 0.0436148027561 0.0946942088128
, 0.0427035397306 0.205170785761
CD 0.0402882115454 0.189388417626
PRP 0.0377129817318 0.315647362709
DT 0.0365736597669 0.0789118406773
NNS 0.035260368393 0.0631294725418
VBP 0.0352590417021 0.126258945084
TO 0.034795733633 0.126258945084
VB 0.0338656716652 0.299864994574
RB 0.0328418316732 0.220953153896
VBN 0.0323097905149 0.0473471044064
VBZ 0.0318777878576 0.0157823681355
CC 0.0317019166773 0.0315647362709
VBD 0.0295482066587 0.0315647362709
VBG 0.0292178693327 0.0157823681355
PRP$ 0.0282796805014 0.0789118406773
MD 0.0266524208602 0.142041313219
POS 0.017779259351 0.0157823681355
WDT 0.0154848495593 0.0
RP 0.0141433322613 0.0157823681355
WP 0.0132793542986 0.0315647362709
WRB 0.0122963168377 0.0
NNPS 0.0119460448397 0.0
JJR 0.0115949532953 0.0
-NON