In [135]:
from __future__ import division
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
from sklearn import cross_validation
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline

import re
from HTMLParser import HTMLParser
import datetime
import cPickle as pickle

In [2]:
# File with gender included
sample10_file = '/Users/lekha/galvanize/capstone/projectRiley/data/sample10/out.txt'
sample1000_file = '/Users/lekha/galvanize/capstone/projectRiley/data/cleandatagender1000.txt'
all_file = '/Users/lekha/galvanize/capstone/projectRiley/data/cleandatagenderall.txt'

In [189]:
df_all = pd.read_csv(all_file, sep="|")

In [191]:
df_all.head()

Unnamed: 0.1,Unnamed: 0,full_name,html,summary,counter,first_name,gender
0,1,shawn douglas,./00006.html\n,i am interested in inventing new methods to co...,1,shawn,male
1,2,regina nunn,./05111108.html\n,missing,1,regina,female
2,3,michael mayes,./120394.html\n,a detail and results oriented professional wit...,1,michael,male
3,5,jason obrien,./17obrien.html\n,accomplished energetic sales professional with...,1,jason,male
4,6,kevin kim,./1800sushi.html\n,1800sushicom is the new online platform for or...,1,kevin,male


In [192]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 995 entries, 0 to 994
Data columns (total 9 columns):
full_name       995 non-null object
html            995 non-null object
summary         995 non-null object
first_name      995 non-null object
gender          995 non-null object
counter         995 non-null float64
summ_missing    995 non-null int64
summ_words      995 non-null int64
y               995 non-null int64
dtypes: float64(1), int64(3), object(5)
memory usage: 77.7+ KB


In [194]:
df_all = df_all[['full_name', 'html','summary', 'first_name', 'gender', 'counter']]

In [195]:
df_all['class'] = np.ones(len(df_all))

In [196]:
df_all['class'] = df_all['gender'].apply(lambda x: 0 if x == 'female' else 1)

In [197]:
df_all.head(20)

Unnamed: 0,full_name,html,summary,first_name,gender,counter,class
0,shawn douglas,./00006.html\n,i am interested in inventing new methods to co...,shawn,male,1,1
1,regina nunn,./05111108.html\n,missing,regina,female,1,0
2,michael mayes,./120394.html\n,a detail and results oriented professional wit...,michael,male,1,1
3,jason obrien,./17obrien.html\n,accomplished energetic sales professional with...,jason,male,1,1
4,kevin kim,./1800sushi.html\n,1800sushicom is the new online platform for or...,kevin,male,1,1
5,amy chu,./1amychu.html\n,missing,amy,female,1,0
6,betty evans,./1bettyevans.html\n,rj evans associates inc a retained executive ...,betty,female,1,0
7,jason cheng,./1jasoncheng.html\n,studied economics business administration and ...,jason,male,1,1
8,jonathan nelson,./1jonnelson.html\n,i make tools mostly software lowfriction highv...,jonathan,male,1,1
9,kelly sullivan,./1kellysullivan.html\n,missing,kelly,female,1,0


In [199]:
def f(x):
    if x == 'missing':
        return 1
    else:
        return 0

# Add feature for missing summary
df_all['summ_missing'] = df_all['summary'].apply(lambda x: f(x))

In [200]:
df_all.head()

Unnamed: 0,full_name,html,summary,first_name,gender,counter,class,summ_missing
0,shawn douglas,./00006.html\n,i am interested in inventing new methods to co...,shawn,male,1,1,0
1,regina nunn,./05111108.html\n,missing,regina,female,1,0,1
2,michael mayes,./120394.html\n,a detail and results oriented professional wit...,michael,male,1,1,0
3,jason obrien,./17obrien.html\n,accomplished energetic sales professional with...,jason,male,1,1,0
4,kevin kim,./1800sushi.html\n,1800sushicom is the new online platform for or...,kevin,male,1,1,0


In [205]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18047 entries, 0 to 18046
Data columns (total 8 columns):
full_name       18047 non-null object
html            18047 non-null object
summary         18040 non-null object
first_name      18043 non-null object
gender          18047 non-null object
counter         18047 non-null float64
class           18047 non-null int64
summ_missing    18047 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 1.2+ MB


In [207]:
df_all['summary']

0        i am interested in inventing new methods to co...
1                                                  missing
2        a detail and results oriented professional wit...
3        accomplished energetic sales professional with...
4        1800sushicom is the new online platform for or...
5                                                  missing
6        rj evans  associates inc a retained executive ...
7        studied economics business administration and ...
8        i make tools mostly software lowfriction highv...
9                                                  missing
10                                                 missing
11                                                 missing
12       public relations and content marketing profess...
13       landscape design professional with 13 years of...
14       i specialize in senior level design management...
15       worked in sales and business management since ...
16       overview nearly 30 years experience in enginee.

In [203]:
# add feature for num of words in the summary
def lenx(mystr):
    return len(mystr.split())

In [209]:
df_all['summ_words'] = df_all['summary'].apply(lambda x: lenx(str(x)))

In [210]:
df_all.head()

Unnamed: 0,full_name,html,summary,first_name,gender,counter,class,summ_missing,summ_words
0,shawn douglas,./00006.html\n,i am interested in inventing new methods to co...,shawn,male,1,1,0,25
1,regina nunn,./05111108.html\n,missing,regina,female,1,0,1,1
2,michael mayes,./120394.html\n,a detail and results oriented professional wit...,michael,male,1,1,0,76
3,jason obrien,./17obrien.html\n,accomplished energetic sales professional with...,jason,male,1,1,0,55
4,kevin kim,./1800sushi.html\n,1800sushicom is the new online platform for or...,kevin,male,1,1,0,173


In [212]:
df_all.gender.value_counts()

male      11071
female     6976
Name: gender, dtype: int64

In [213]:
df_all.summ_missing.value_counts()

0    12185
1     5862
Name: summ_missing, dtype: int64

In [214]:
# create dfs for females and males
# subset the data by taking the most common names that can be more or less guaranteed to be the gender they claim to be
# high level analysis on the number of words used by males and females
females = df_all[df_all['gender'] == 'female']
males = df_all[df_all['gender'] == 'male']

In [216]:
df_all.groupby('gender').agg({'summ_words':np.mean})

Unnamed: 0_level_0,summ_words
gender,Unnamed: 1_level_1
female,66.402236
male,76.896125


In [217]:
df_all.groupby('gender').agg({'summ_missing':sum})

Unnamed: 0_level_0,summ_missing
gender,Unnamed: 1_level_1
female,2511
male,3351


In [218]:
# percentage summary missing
female_summaries = 2511/6976.
female_summaries

0.3599483944954128

In [219]:
male_summaries = 3351/11071
male_summaries

0.30268268449101254

In [220]:
# unique names in females and males
print females['first_name'].nunique()
print males['first_name'].nunique()


1851
2393


In [221]:
summary_df2 = df_all[df_all['summ_missing'] == 0]

In [222]:
type(summary_df2)

pandas.core.frame.DataFrame

In [223]:
summary_df2.head()

Unnamed: 0,full_name,html,summary,first_name,gender,counter,class,summ_missing,summ_words
0,shawn douglas,./00006.html\n,i am interested in inventing new methods to co...,shawn,male,1,1,0,25
2,michael mayes,./120394.html\n,a detail and results oriented professional wit...,michael,male,1,1,0,76
3,jason obrien,./17obrien.html\n,accomplished energetic sales professional with...,jason,male,1,1,0,55
4,kevin kim,./1800sushi.html\n,1800sushicom is the new online platform for or...,kevin,male,1,1,0,173
6,betty evans,./1bettyevans.html\n,rj evans associates inc a retained executive ...,betty,female,1,0,0,101


In [224]:
summary_df2.gender.value_counts()

male      7720
female    4465
Name: gender, dtype: int64

In [225]:
def get_vocab(my_str):
    words = my_str.split()
    words = [w for w in words if not w in stopwords]
    return words


In [228]:
vocab = summary_df2['summary'].apply(lambda x:get_vocab(str(x)))

In [229]:
all_vocab = []
for row in vocab:
    for word in row:
        all_vocab.append(word)



In [230]:
vocab_set = set(all_vocab)

In [231]:
len(vocab_set)

76964

In [232]:
summary_df2.gender.value_counts()

male      7720
female    4465
Name: gender, dtype: int64

In [None]:
# a basic vocabulary look at males and females

In [235]:
f_vocab = females['summary'].apply(lambda x:get_vocab(str(x)))

In [236]:
f_all_vocab = []
for row in f_vocab:
    for word in row:
        f_all_vocab.append(word)
        
f_vocab_set = set(f_all_vocab)
len(f_vocab_set)

35370

In [238]:
m_vocab = males['summary'].apply(lambda x:get_vocab(str(x)))

In [239]:
m_all_vocab = []
for row in m_vocab:
    for word in row:
        m_all_vocab.append(word)
        
m_vocab_set = set(m_all_vocab)
len(m_vocab_set)

57951

In [240]:
def intersect(a, b):
    return list(set(a) & set(b))

In [241]:
common_words = intersect(f_vocab_set, m_vocab_set)

In [242]:
len(common_words)

16357

In [253]:
summary_df2['summary'] = summary_df2['summary'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [243]:
# A basic prediction algorithm to predict gender using summary using RF

In [247]:

X_train, X_test, y_train, y_test = train_test_split(summary_df2['summary'], summary_df2['class'], test_size=0.3, random_state=0)

In [248]:
type(X_train)

pandas.core.series.Series

In [250]:
temp = np.array(y_test)

len(temp[(temp == 0)])

1341

In [251]:
print "Creating the bag of words...\n"
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(X_train)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



ValueError: np.nan is an invalid document, expected byte or unicode string.

In [179]:
train_data_features.shape

(540, 5000)

In [180]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print vocab

[u'10', u'100', u'1000', u'10000', u'10th', u'11', u'12', u'1200', u'125', u'13', u'14', u'15', u'150', u'16', u'17', u'1760', u'18', u'1877', u'19', u'1983', u'1985', u'1987', u'1989', u'1990', u'1991', u'1994', u'1995', u'1996', u'1998', u'1999', u'20', u'200', u'2000', u'20000', u'2001', u'2002', u'2003', u'2004', u'2005', u'2006', u'2007', u'2008', u'2009', u'2010', u'2011', u'2012', u'2013', u'2013the', u'2014', u'2014completed', u'2015', u'22', u'23', u'25', u'26', u'2d', u'30', u'300', u'35', u'360', u'3d', u'3rd', u'40', u'45', u'46', u'50', u'500', u'501c3', u'5yrs', u'60', u'70', u'700', u'80', u'ab', u'aba', u'abilities', u'ability', u'able', u'about', u'above', u'abroad', u'abuse', u'academia', u'academic', u'academy', u'accelerate', u'accelerator', u'accenture', u'accepted', u'access', u'accessibility', u'accomplished', u'accomplishment', u'according', u'account', u'accountability', u'accountable', u'accountant', u'accountants', u'accounting', u'accounts', u'accuracy', u'a

In [181]:
print "Training the random forest..."

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit(train_data_features, y_train)

Training the random forest...


In [182]:
# Testing

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(X_test)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
yhat = forest.predict(test_data_features)

probX = forest.predict_proba(test_data_features)


In [183]:
zip(yhat, y_test)
print probX

[[ 0.37  0.63]
 [ 0.42  0.58]
 [ 0.42  0.58]
 [ 0.28  0.72]
 [ 0.2   0.8 ]
 [ 0.35  0.65]
 [ 0.47  0.53]
 [ 0.52  0.48]
 [ 0.24  0.76]
 [ 0.25  0.75]
 [ 0.52  0.48]
 [ 0.35  0.65]
 [ 0.25  0.75]
 [ 0.42  0.58]
 [ 0.2   0.8 ]
 [ 0.23  0.77]
 [ 0.49  0.51]
 [ 0.39  0.61]
 [ 0.42  0.58]
 [ 0.24  0.76]
 [ 0.44  0.56]
 [ 0.48  0.52]
 [ 0.51  0.49]
 [ 0.17  0.83]
 [ 0.21  0.79]
 [ 0.29  0.71]
 [ 0.46  0.54]
 [ 0.12  0.88]
 [ 0.26  0.74]
 [ 0.51  0.49]
 [ 0.26  0.74]
 [ 0.5   0.5 ]
 [ 0.19  0.81]
 [ 0.26  0.74]
 [ 0.31  0.69]
 [ 0.27  0.73]
 [ 0.2   0.8 ]
 [ 0.2   0.8 ]
 [ 0.21  0.79]
 [ 0.24  0.76]
 [ 0.35  0.65]
 [ 0.22  0.78]
 [ 0.47  0.53]
 [ 0.18  0.82]
 [ 0.32  0.68]
 [ 0.34  0.66]
 [ 0.43  0.57]
 [ 0.59  0.41]
 [ 0.35  0.65]
 [ 0.42  0.58]
 [ 0.48  0.52]
 [ 0.27  0.73]
 [ 0.34  0.66]
 [ 0.41  0.59]
 [ 0.29  0.71]
 [ 0.13  0.87]
 [ 0.25  0.75]
 [ 0.16  0.84]
 [ 0.25  0.75]
 [ 0.21  0.79]
 [ 0.36  0.64]
 [ 0.15  0.85]
 [ 0.32  0.68]
 [ 0.42  0.58]
 [ 0.14  0.86]
 [ 0.22  0.78]
 [ 0.3   0

In [184]:
acc = forest.score(test_data_features, y_test)

In [185]:
acc

0.69117647058823528

In [133]:
zip(yhat, y_test)

[('male', 'male'),
 ('male', 'female'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('female', 'male'),
 ('male', 'male'),
 ('male', 'female'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'female'),
 ('male', 'male'),
 ('male', 'female'),
 ('female', 'female'),
 ('female', 'female'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'female'),
 ('male', 'male'),
 ('male', 'female'),
 ('male', 'female'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'male'),
 ('male', 'female'),
 ('female', 'female'),
 ('female', 'female'),
 ('male', 'female'),
 ('male', 'female'),
 ('female', 

In [188]:
print "Precision Score: {0}".format(precision_score(y_test, yhat))
print "Recall Score: {0}".format(recall_score(y_test, yhat))
print "AUC Score: {0}".format(roc_auc_score(y_test, yhat))
print "Model Score:{0}".format(forest.score(test_data_features, y_test))

Precision Score: 0.696
Recall Score: 0.956043956044
AUC Score: 0.5557997558
Model Score:0.691176470588


In [138]:
fpr, tpr, thresholds = roc_curve(y_test, yhat, pos_label=2)
auc(fpr, tpr)

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [137]:
yhat

array(['male', 'male', 'male', 'male', 'male', 'male', 'female', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'female', 'female', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'female', 'female',
       'male', 'male', 'female', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'female',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'female', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male'