In [1]:
# Load Python Library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pylab as pl
import numpy as np
import seaborn as sns
%matplotlib inline

In [2]:
# Read in data from source 
df = pd.read_csv('Speed Dating Data clean.csv')

In [3]:
# Sanity check
df.head()

Unnamed: 0,gender,match,samerace,dec_o,dec,estimated_int_corr,estimated_age_o,estimated_race_o,estimated_attr_o,estimated_sinc_o,...,estimated_like,estimated_prob,diff_age,diff_attr,diff_sinc,diff_intel,diff_fun,diff_amb,diff_shar,diff_like
0,0,0,0,0,1,0.14,27.0,2.0,6.0,8.0,...,7.0,6.0,6.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0
1,0,0,0,0,1,0.54,22.0,2.0,7.0,8.0,...,7.0,5.0,1.0,0.0,0.0,3.0,1.0,2.0,1.0,1.0
2,0,1,1,1,1,0.16,22.0,4.0,10.0,10.0,...,7.0,5.207523,1.0,5.0,2.0,1.0,2.0,5.0,3.0,3.0
3,0,1,0,1,1,0.61,23.0,2.0,7.0,8.0,...,7.0,6.0,2.0,0.0,2.0,1.0,1.0,3.0,0.0,0.0
4,0,1,0,1,1,0.21,24.0,3.0,8.0,7.0,...,6.0,6.0,3.0,3.0,1.0,2.0,1.0,3.0,1.0,2.0


### Step 3: Define train/test split

In [6]:
X = df.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y = df['match']  

In [7]:
from sklearn import tree, cross_validation, linear_model, metrics

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=1)

### Step 4: Choose model and fit the model

In [9]:
def format_model_results(sklearn_model, X_train, X_test, y_train, y_test):
    model_class_name = sklearn_model.__class__.__name__  
    
    y_test_pred   = sklearn_model.predict(X_test)
    y_train_pred  = sklearn_model.predict(X_train)

    precision_score = metrics.precision_score(y_test_pred, y_test)
    recall_score    = metrics.recall_score(y_test_pred, y_test)
    f1_score        = metrics.f1_score(y_test_pred, y_test)
    f1_score_train  = metrics.f1_score(y_train_pred, y_train)
    auc_score       = metrics.roc_auc_score(y_test_pred, y_test)
    
    return pd.DataFrame({
            "Precision": precision_score, 
            "Recall": recall_score, 
            "F1 score": f1_score, 
            "F1 score (train)": f1_score_train, 
            "AUC": auc_score
        }, index=[model_class_name])

#### Model 1: Logistic Regression

In [10]:
logreg_model = linear_model.LogisticRegression()
logreg_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
format_model_results(logreg_model, X_train, X_test, y_train, y_test)

Unnamed: 0,AUC,F1 score,F1 score (train),Precision,Recall
LogisticRegression,0.755803,0.409318,0.444597,0.302211,0.634021


#### Model 2: KNN

In [12]:
from sklearn import neighbors

knn_model = neighbors.KNeighborsClassifier(n_neighbors=3)

knn_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [13]:
format_model_results(knn_model, X_train, X_test, y_train, y_test)

Unnamed: 0,AUC,F1 score,F1 score (train),Precision,Recall
KNeighborsClassifier,0.692284,0.400593,0.667074,0.331695,0.505618


#### Model 3: Decision Trees

In [14]:
dectree_model = tree.DecisionTreeClassifier()

dectree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [15]:
format_model_results(dectree_model, X_train, X_test, y_train, y_test)

Unnamed: 0,AUC,F1 score,F1 score (train),Precision,Recall
DecisionTreeClassifier,0.647017,0.415366,1.0,0.425061,0.406103


#### Compare the models:

In [16]:
scores_df = pd.concat([
    format_model_results(logreg_model, X_train, X_test, y_train, y_test),
    format_model_results(knn_model, X_train, X_test, y_train, y_test),
    format_model_results(dectree_model, X_train, X_test, y_train, y_test),
])

scores_df

Unnamed: 0,AUC,F1 score,F1 score (train),Precision,Recall
LogisticRegression,0.755803,0.409318,0.444597,0.302211,0.634021
KNeighborsClassifier,0.692284,0.400593,0.667074,0.331695,0.505618
DecisionTreeClassifier,0.647017,0.415366,1.0,0.425061,0.406103


### Step 5: Use logistic regression

In [27]:
from sklearn import linear_model, cross_validation, feature_selection

In [18]:
model = linear_model.LogisticRegression()
model.fit(X_train, y_train)

print 'intercept    =', model.intercept_
print 'coefficients =', model.coef_
print 'training misclassification =', model.score(X_train, y_train)
print 'testing  misclassification =', model.score(X_test, y_test)

intercept    = [-6.21242538]
coefficients = [[-0.07878418 -0.18507721  0.19344039 -0.0381096   0.02388622  0.14495816
  -0.04786939  0.01267481  0.16457815 -0.12629903  0.06294675  0.34772169
  -0.04172025 -0.00763784  0.24129345 -0.08736551 -0.00425259  0.15338811
  -0.18179223  0.10196892  0.1951396   0.17590461 -0.01103805 -0.11089588
   0.05102287 -0.05274061 -0.011684    0.01384177 -0.00295594 -0.19819655]]
training misclassification = 0.862380627558
testing  misclassification = 0.858790771679


In [19]:
zip(X, np.exp(model.coef_[0]) - 1)

[('gender', -0.075760628567643762),
 ('samerace', -0.16895988725078626),
 ('estimated_int_corr', 0.21341705774730046),
 ('estimated_age_o', -0.037392562660181317),
 ('estimated_race_o', 0.024173777120886752),
 ('estimated_attr_o', 0.15599120010199274),
 ('estimated_sinc_o', -0.046741720702443978),
 ('estimated_intel_o', 0.012755478870343673),
 ('estimated_fun_o', 0.17889569311650644),
 ('estimated_amb_o', -0.11864874421492511),
 ('estimated_shar_o', 0.064970130793692293),
 ('estimated_like_o', 0.4158381562251785),
 ('estimated_age', -0.040861939103366329),
 ('estimated_race', -0.0076087495574795883),
 ('estimated_attr', 0.27289451272354914),
 ('estimated_sinc', -0.083657895287339801),
 ('estimated_intel', -0.0042435598263382834),
 ('estimated_fun', 0.16577734263488586),
 ('estimated_amb', -0.16622544086940005),
 ('estimated_shar', 0.10734905685433782),
 ('estimated_like', 0.21548066170057512),
 ('estimated_prob', 0.19232431299146113),
 ('diff_age', -0.010977358391474401),
 ('diff_attr'

#### Interpretation of the result:
1. **gender**: being a male (gender = 1), the odd of matching for a male is 16% less than the odd of matching for a female. 
2. **same race**: in the same race (samerace yes: 1), the odd of matching for same race is 17% less than the odd of matching for different race. 
3. **interest correlation**: one unit of increase in interest shared correlation, we expect to see the odd of matching 21% higher.
4. **attractiveness rated by the partner**: one unit of increase in attractiveness rated by the other person, we expect to see matching odds of 16% higher. 
5. **fun rated by partner**: one unit of increase in fun rated by the other person, we expect to see matching odds of 18% higher. 
6. **ambitiousness rated by partner**: one unit of increase in ambitiousness rated by the other person, we expect to see matching odds of 12% higher. 
7. **like by partner**: one unit of increase in like rated by the other person, we expect to see matching odds of 41% higher. 
8. **attractiveness**: one unit of increase in attractivenss giving to the other person, we expect to see matching odds of 27% higher. 
9. **fun**: one unit of increase in fun givint to the other person, we expect to see matching odds of 16% higher. 
10. **ambitiousness**: one unit of increase in ambitiousness giving to the other person, we expect to see matching odds of 16% higher. 
11. **like**: one unit of increase in like giving to the other person, we expect to see matching odds of 21% higher. 
12. **diff in attractiveness**: one unit increase in the difference of attractivenss giving to the other person, and rating given by the other person, we expect to see matching odds of 10% lower. 
13. **diff in like**: one unit increase in the difference of like giving to the other person, and rating given by the other person, we expect to see matching odds of 18% lower. 

### Step 6: Explore subsets

In [51]:
# Create gender subsets: female vs. male

df_female = df[df['gender'] == 0]
df_male = df[df['gender'] == 1]

print 'female pop:', len(df_female)
print 'male pop:', len(df_male)

# Create race subsets:

df_race_1 = df[df['estimated_race'] == 1]
df_race_2 = df[df['estimated_race'] == 2]
df_race_3 = df[df['estimated_race'] == 3]
df_race_4 = df[df['estimated_race'] == 4]
df_race_5 = df[df['estimated_race'] == 5]
df_race_6 = df[df['estimated_race'] == 6]

print 'Black/African American pop:', len(df_race_1)
print 'European/Caucasian American pop:', len(df_race_2)
print 'Latino/Hispanic American pop:', len(df_race_3)
print 'Asian/Pacific Islander/Asian American pop:', len(df_race_4)
print 'Native American pop:', len(df_race_5)
print 'Other pop:', len(df_race_6)

# Create fields subsets: 
df_law = df[df['estimated_field_cd'] == 1]
df_med = df[df['estimated_field_cd'].isin([3,4])]
df_sciences = df[df['estimated_field_cd'].isin([2,5,9,10,17])]
df_arts = df[df['estimated_field_cd'].isin([6,7,11,13,14,15,16])]
df_business = df[df['estimated_field_cd'] == 8]
df_others = df[df['estimated_field_cd'].isin([12,18])]

female pop: 4184
male pop: 4194
Black/African American pop: 420
European/Caucasian American pop: 4790
Latino/Hispanic American pop: 664
Asian/Pacific Islander/Asian American pop: 1982
Native American pop: 0
Other pop: 522


### Step 6 - part 1: gender difference

In [31]:
X_female = df_female.drop(['gender','match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_female = df_female['match']  

from sklearn import tree, cross_validation, linear_model, metrics

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_female, y_female, test_size=0.3, random_state=1)

model_female = linear_model.LogisticRegression()
model_female.fit(X_train, y_train)

print 'intercept    =', model_female.intercept_
print 'coefficients =', model_female.coef_
print 'training misclassification =', model_female.score(X_train, y_train)
print 'testing  misclassification =', model_female.score(X_test, y_test)

intercept    = [-5.0785349]
coefficients = [[-0.10899952  0.16217835 -0.05116395 -0.13409047  0.20065993 -0.07511268
  -0.05047925  0.10827402 -0.09207402  0.0781762   0.31883536 -0.04957296
   0.10372857  0.15158748 -0.08246267  0.03905947  0.17720989 -0.16567228
   0.07365897  0.23011871  0.1664218  -0.01077348 -0.11246083 -0.00123261
  -0.0320549  -0.04783438  0.02881955 -0.00500518 -0.12700898]]
training misclassification = 0.863729508197
testing  misclassification = 0.874203821656


In [30]:
X_male = df_male.drop(['gender','match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_male = df_male['match']  

from sklearn import tree, cross_validation, linear_model, metrics

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_male, y_male, test_size=0.3, random_state=1)

model_male = linear_model.LogisticRegression()
model_male.fit(X_train, y_train)

print 'intercept    =', model_male.intercept_
print 'coefficients =', model_male.coef_
print 'training misclassification =', model_male.score(X_train, y_train)
print 'testing  misclassification =', model_male.score(X_test, y_test)

intercept    = [-4.78296842]
coefficients = [[-0.18029904  0.0328874  -0.07002437  0.09416878  0.10294522 -0.08584413
   0.02932537  0.23769061 -0.22419806  0.13037189  0.38354753 -0.06133089
  -0.08564429  0.23572476 -0.06043633 -0.02237994  0.10953219 -0.12993037
  -0.02090938  0.28481903  0.19107609 -0.01571744 -0.15767801  0.0554861
  -0.05116344 -0.01513735  0.05731804 -0.07845726 -0.18574216]]
training misclassification = 0.868824531516
testing  misclassification = 0.85941223193


In [44]:
# To print out both gender side by side and compare the difference. 
zip(X_female, np.exp(model_female.coef_[0]) - 1,np.exp(model_male.coef_[0]) - 1)

[('samerace', -0.10326915205105891, -0.16497952733686028),
 ('estimated_int_corr', 0.17606997550662706, 0.03343417021397066),
 ('estimated_age_o', -0.049877119122539559, -0.06762890342648753),
 ('estimated_race_o', -0.12548905640570163, 0.098745178972957204),
 ('estimated_attr_o', 0.22220906630061332, 0.1084306869020244),
 ('estimated_sinc_o', -0.072361045562049142, -0.082262732519943427),
 ('estimated_intel_o', -0.049226346154602418, 0.029759597823639128),
 ('estimated_fun_o', 0.11435306270753687, 0.26831672229133341),
 ('estimated_amb_o', -0.087962357937401148, -0.20084316427402704),
 ('estimated_shar_o', 0.081313165073613636, 0.13925197933887135),
 ('estimated_like_o', 0.37552483922462554, 0.46748129858827614),
 ('estimated_age', -0.048364273613945241, -0.059488017480888389),
 ('estimated_race', 0.10929931842287677, -0.082079312262565485),
 ('estimated_attr', 0.16368009504338876, 0.26582585068208742),
 ('estimated_sinc', -0.079154183152365887, -0.058646294330349957),
 ('estimated_in

### Step 6 - part 2: race difference

In [45]:
X_race_1 = df_race_1.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_race_1 = df_race_1['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_race_1, y_race_1, test_size=0.3, random_state=1)

model_race_1 = linear_model.LogisticRegression()
model_race_1.fit(X_train, y_train)

print 'training misclassification =', model_race_1.score(X_train, y_train)
print 'testing  misclassification =', model_race_1.score(X_test, y_test)

training misclassification = 0.853741496599
testing  misclassification = 0.769841269841


In [46]:
X_race_2 = df_race_2.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_race_2 = df_race_2['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_race_2, y_race_2, test_size=0.3, random_state=1)

model_race_2 = linear_model.LogisticRegression()
model_race_2.fit(X_train, y_train)

print 'training misclassification =', model_race_2.score(X_train, y_train)
print 'testing  misclassification =', model_race_2.score(X_test, y_test)

training misclassification = 0.861318222487
testing  misclassification = 0.865692414753


In [47]:
X_race_3 = df_race_3.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_race_3 = df_race_3['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_race_3, y_race_3, test_size=0.3, random_state=1)

model_race_3 = linear_model.LogisticRegression()
model_race_3.fit(X_train, y_train)

print 'training misclassification =', model_race_3.score(X_train, y_train)
print 'testing  misclassification =', model_race_3.score(X_test, y_test)

training misclassification = 0.859913793103
testing  misclassification = 0.85


In [48]:
X_race_4 = df_race_4.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_race_4 = df_race_4['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_race_4, y_race_4, test_size=0.3, random_state=1)

model_race_4 = linear_model.LogisticRegression()
model_race_4.fit(X_train, y_train)

print 'training misclassification =', model_race_4.score(X_train, y_train)
print 'testing  misclassification =', model_race_4.score(X_test, y_test)

training misclassification = 0.876712328767
testing  misclassification = 0.887394957983


In [49]:
pd.DataFrame(data = zip(X_race_1, np.exp(model_race_1.coef_[0]) - 1, np.exp(model_race_2.coef_[0]) - 1,\
                 np.exp(model_race_3.coef_[0]) - 1, np.exp(model_race_4.coef_[0]) - 1),\
             columns = ['features','race 1', 'race 2','race 3', 'race 4'])

Unnamed: 0,features,race 1,race 2,race 3,race 4
0,gender,-0.551277,0.403963,-0.414884,-0.304255
1,samerace,0.821243,-0.06401,0.056114,-0.020669
2,estimated_int_corr,0.047282,-0.015543,1.05421,0.503466
3,estimated_age_o,-0.051826,-0.031428,-0.013957,-0.026597
4,estimated_race_o,0.239823,-0.043482,0.087289,0.076606
5,estimated_attr_o,0.364997,0.15596,0.129879,0.221362
6,estimated_sinc_o,-0.118682,-0.023572,0.024284,-0.055312
7,estimated_intel_o,0.086906,-0.030501,0.018609,0.036123
8,estimated_fun_o,0.158967,0.249038,0.095039,0.116614
9,estimated_amb_o,-0.326355,-0.157391,-0.104324,-0.073125


### Step 6 - part 3: field difference

In [50]:
X_law = df_law.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_law = df_law['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_law, y_law, test_size=0.3, random_state=1)

model_law = linear_model.LogisticRegression()
model_law.fit(X_train, y_train)

print 'training misclassification =', model_law.score(X_train, y_train)
print 'testing  misclassification =', model_law.score(X_test, y_test)

training misclassification = 0.840860215054
testing  misclassification = 0.84


In [52]:
X_med = df_med.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_med = df_med['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_med, y_med, test_size=0.3, random_state=1)

model_med = linear_model.LogisticRegression()
model_med.fit(X_train, y_train)

print 'training misclassification =', model_med.score(X_train, y_train)
print 'testing  misclassification =', model_med.score(X_test, y_test)

training misclassification = 0.848381601363
testing  misclassification = 0.801587301587


In [53]:
X_sciences = df_sciences.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_sciences = df_sciences['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_sciences, y_sciences, test_size=0.3, random_state=1)

model_sciences = linear_model.LogisticRegression()
model_sciences.fit(X_train, y_train)

print 'training misclassification =', model_sciences.score(X_train, y_train)
print 'testing  misclassification =', model_sciences.score(X_test, y_test)

training misclassification = 0.880423280423
testing  misclassification = 0.859259259259


In [54]:
X_arts = df_arts.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_arts = df_arts['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_arts, y_arts, test_size=0.3, random_state=1)

model_arts = linear_model.LogisticRegression()
model_arts.fit(X_train, y_train)

print 'training misclassification =', model_arts.score(X_train, y_train)
print 'testing  misclassification =', model_arts.score(X_test, y_test)

training misclassification = 0.87593728698
testing  misclassification = 0.850556438792


In [55]:
X_business = df_business.drop(['match', 'dec_o', 'dec','estimated_field_cd'], axis=1)
y_business = df_business['match']  

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_business, y_business, test_size=0.3, random_state=1)

model_business = linear_model.LogisticRegression()
model_business.fit(X_train, y_train)

print 'training misclassification =', model_business.score(X_train, y_train)
print 'testing  misclassification =', model_business.score(X_test, y_test)

training misclassification = 0.859686609687
testing  misclassification = 0.845771144279


In [56]:
pd.DataFrame(data = zip(X_law, np.exp(model_law.coef_[0]) - 1, np.exp(model_med.coef_[0]) - 1,\
                 np.exp(model_sciences.coef_[0]) - 1, np.exp(model_arts.coef_[0]) - 1, np.exp(model_business.coef_[0]) - 1),\
              columns = ['Features','Law', 'Med','Sciences', 'Arts', 'Business'])

Unnamed: 0,Features,Law,Med,Sciences,Arts,Business
0,gender,-0.289848,0.587341,0.052309,0.184258,-0.220742
1,samerace,0.153788,-0.281947,-0.233995,-0.084558,-0.218372
2,estimated_int_corr,-0.43715,0.689571,0.06701,0.088649,0.2529
3,estimated_age_o,0.008292,-0.114563,-0.102517,-0.075655,-0.043187
4,estimated_race_o,0.072676,-0.191698,0.013519,-0.077336,-0.093131
5,estimated_attr_o,0.046358,0.461128,0.198726,0.125426,0.192002
6,estimated_sinc_o,-0.033576,-0.014044,0.001024,-0.174272,-0.06838
7,estimated_intel_o,0.091331,-0.022743,-0.010637,0.136776,-0.080983
8,estimated_fun_o,0.068303,0.189198,0.131107,0.229256,0.141177
9,estimated_amb_o,-0.15588,-0.228999,-0.057051,-0.351797,-0.139578
