# DS-SF-30 | Assignment 12: Pregnancy Prediction

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

from sklearn import linear_model, cross_validation, preprocessing, neighbors, grid_search


import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')



In [2]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-12-walget-train.csv'))

In [3]:
df

Unnamed: 0,AccountHolderImpliedGender,AccountHolderAddress,RecentlyPurchasedPregnancyTest,RecentlyPurchasedBirthControl,RecentlyPurchasedFeminineHygieneProducts,...,RecentlyPurchasedSmokingCessationProducts,PurchasedWineRegularlyUntilRecentlyThenStopped,RecentlyPurchasedWine,RecentlyPurchasedMaternityClothing,IsPregnant
0,Female,Apartment,False,True,False,...,False,False,False,False,False
1,Male,Apartment,False,False,True,...,False,False,True,False,False
2,Male,Apartment,False,False,False,...,False,False,False,False,False
3,Male,Apartment,False,False,False,...,False,False,True,False,False
4,Male,Apartment,False,False,True,...,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
595,Female,PO Box,False,False,False,...,False,,False,False,True
596,Female,Home,False,False,False,...,False,False,False,False,True
597,Female,Apartment,False,False,False,...,False,False,False,True,True
598,Female,Home,False,False,True,...,False,False,False,False,True


In [4]:
df.describe()

Unnamed: 0,AccountHolderImpliedGender,AccountHolderAddress,RecentlyPurchasedPregnancyTest,RecentlyPurchasedBirthControl,RecentlyPurchasedFeminineHygieneProducts,...,RecentlyPurchasedSmokingCessationProducts,PurchasedWineRegularlyUntilRecentlyThenStopped,RecentlyPurchasedWine,RecentlyPurchasedMaternityClothing,IsPregnant
count,542,594,594,593,590,...,588,588,592,597,600
unique,2,3,2,2,2,...,2,2,2,2,2
top,Female,Home,False,False,False,...,False,False,False,False,True
freq,301,295,549,516,502,...,550,510,528,519,300


In [5]:
# we chose to drop all data that has NaN
df.dropna(inplace = True)

# convert all columns to strict binary
df = df * 1
df

Unnamed: 0,AccountHolderImpliedGender,AccountHolderAddress,RecentlyPurchasedPregnancyTest,RecentlyPurchasedBirthControl,RecentlyPurchasedFeminineHygieneProducts,...,RecentlyPurchasedSmokingCessationProducts,PurchasedWineRegularlyUntilRecentlyThenStopped,RecentlyPurchasedWine,RecentlyPurchasedMaternityClothing,IsPregnant
0,Female,Apartment,0,1,0,...,0,0,0,0,0
1,Male,Apartment,0,0,1,...,0,0,1,0,0
2,Male,Apartment,0,0,0,...,0,0,0,0,0
3,Male,Apartment,0,0,0,...,0,0,1,0,0
4,Male,Apartment,0,0,1,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
594,Female,Apartment,0,0,0,...,0,0,0,0,1
596,Female,Home,0,0,0,...,0,0,0,0,1
597,Female,Apartment,0,0,0,...,0,0,0,1,1
598,Female,Home,0,0,1,...,0,0,0,0,1


In [6]:
# we converted the gender column into a binary column
df.AccountHolderImpliedGender = df.AccountHolderImpliedGender.apply(lambda value: 1 if value == 'Female' else 0)

In [7]:
# applied one-hot encoding on the address column
df_address = pd.get_dummies(df.AccountHolderAddress, prefix="AccountHolderAddressType")

# drop the original address column
df.drop('AccountHolderAddress', inplace = True, axis = 1)

# join the one-hot encoded address columns
df = df.join([df_address])

# drop the third column since the two columns Home & PO Box convey the information
df.drop('AccountHolderAddressType_Apartment', inplace = True, axis = 1)
df

Unnamed: 0,AccountHolderImpliedGender,RecentlyPurchasedPregnancyTest,RecentlyPurchasedBirthControl,RecentlyPurchasedFeminineHygieneProducts,RecentlyPurchasedFolicAcidSupplements,...,RecentlyPurchasedWine,RecentlyPurchasedMaternityClothing,IsPregnant,AccountHolderAddressType_Home,AccountHolderAddressType_PO Box
0,1,0,1,0,0,...,0,0,0,0,0
1,0,0,0,1,0,...,1,0,0,0,0
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,1,0,0,0,0
4,0,0,0,1,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
594,1,0,0,0,0,...,0,0,1,0,0
596,1,0,0,0,0,...,0,0,1,1,0
597,1,0,0,0,0,...,0,1,1,0,0
598,1,0,0,1,0,...,0,0,1,1,0


In [8]:
# Trying to check the correlation between the new columns
df.corr().IsPregnant.sort_values(ascending = False)

IsPregnant                         1.000000
AccountHolderImpliedGender         0.131748
AccountHolderAddressType_Home      0.053855
AccountHolderAddressType_PO Box    0.009061
Name: IsPregnant, dtype: float64

In [9]:
# split the dataset into train and test
train_df = df.sample(frac = .6, random_state = 0)
test_df = df.drop(train_df.index)

In [10]:
train_df.columns

Index([u'AccountHolderImpliedGender', u'RecentlyPurchasedPregnancyTest',
       u'RecentlyPurchasedBirthControl',
       u'RecentlyPurchasedFeminineHygieneProducts',
       u'RecentlyPurchasedFolicAcidSupplements',
       u'RecentlyPurchasedPrenatalVitamins',
       u'RecentlyPurchasedPrenatalYogaDVD', u'RecentlyPurchasedBodyPillow',
       u'RecentlyPurchasedGingerAle', u'RecentlyPurchasedSeaBands',
       u'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedCigarettes',
       u'RecentlyPurchasedSmokingCessationProducts',
       u'PurchasedWineRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedWine', u'RecentlyPurchasedMaternityClothing',
       u'IsPregnant', u'AccountHolderAddressType_Home',
       u'AccountHolderAddressType_PO Box'],
      dtype='object')

In [11]:
# chosse which columns to use as features. On experimenting a bit we found that removing the
# columns with household information (gender & address) gave the best scores
names_X = [
       #u'AccountHolderImpliedGender', 
       u'RecentlyPurchasedPregnancyTest',
       u'RecentlyPurchasedBirthControl',
       u'RecentlyPurchasedFeminineHygieneProducts',
       u'RecentlyPurchasedFolicAcidSupplements',
       u'RecentlyPurchasedPrenatalVitamins',
       u'RecentlyPurchasedPrenatalYogaDVD', 
       u'RecentlyPurchasedBodyPillow',
       u'RecentlyPurchasedGingerAle', 
       #u'RecentlyPurchasedSeaBands',
       u'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedCigarettes',
       u'RecentlyPurchasedSmokingCessationProducts',
       u'PurchasedWineRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedWine', 
       u'RecentlyPurchasedMaternityClothing',
       #u'AccountHolderAddressType_Home', 
       #u'AccountHolderAddressType_PO Box'
    ]

def X_c(df):
    X = df[ names_X ]
    c = df.IsPregnant
    return X, c

train_X, train_c = X_c(train_df)
test_X, test_c = X_c(test_df)

In [12]:
# build the logistic regression model and check the coefficients
model = linear_model.LogisticRegression().fit(train_X, train_c)

print model.intercept_
print model.coef_

[-0.57444498]
[[ 1.58296651 -1.59992102 -1.60669318  2.19601495  1.89881057  0.79804497
   0.61730857  0.92824349  1.37753039 -0.84254834  1.2193017   1.03131206
  -0.98552805  1.81712857]]


In [13]:
# check the scores
print 'training misclassification =', 1 - model.score(train_X, train_c)
print 'testing  misclassification =', 1 - model.score(test_X, test_c)

training misclassification = 0.115671641791
testing  misclassification = 0.14606741573


In [14]:
# try to gather some insights
zip(names_X, np.exp(model.coef_[0]) - 1)

[(u'RecentlyPurchasedPregnancyTest', 3.869379488970309),
 (u'RecentlyPurchasedBirthControl', -0.79808753653692044),
 (u'RecentlyPurchasedFeminineHygieneProducts', -0.79945030043764664),
 (u'RecentlyPurchasedFolicAcidSupplements', 7.9891199750098245),
 (u'RecentlyPurchasedPrenatalVitamins', 5.677946778798348),
 (u'RecentlyPurchasedPrenatalYogaDVD', 1.2211941862779034),
 (u'RecentlyPurchasedBodyPillow', 0.8539315857608063),
 (u'RecentlyPurchasedGingerAle', 1.530061206526161),
 (u'PurchasedCigarettesRegularlyUntilRecentlyThenStopped', 2.9650972961545596),
 (u'RecentlyPurchasedCigarettes', -0.5693882197490745),
 (u'RecentlyPurchasedSmokingCessationProducts', 2.3848232766482744),
 (u'PurchasedWineRegularlyUntilRecentlyThenStopped', 1.8047434218712128),
 (u'RecentlyPurchasedWine', -0.62675791518585466),
 (u'RecentlyPurchasedMaternityClothing', 5.1541617873374159)]

In [15]:
# Do grid search to fit the same data for KNN
k_cv = 5 # 10-fold CV
k_nn = range(1, train_df.shape[0] * (k_cv - 1) / k_cv) # k-NN

gs = grid_search.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid = {'n_neighbors': k_nn},
    cv = cross_validation.KFold(train_df.shape[0], n_folds = k_cv)
)

gs.fit(train_X, train_c)

GridSearchCV(cv=sklearn.cross_validation.KFold(n=268, n_folds=5, shuffle=False, random_state=None),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 7...194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [16]:
# find the best scores and estimator
print gs.best_score_
print gs.best_params_
gs.best_estimator_

0.809701492537
{'n_neighbors': 5}


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [17]:
# build the knn model and check the score
knn_model = neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'uniform').fit(train_X, train_c)

print 'training misclassification =', 1 - knn_model.score(train_X, train_c)
print 'testing  misclassification =', 1 - knn_model.score(test_X, test_c)

training misclassification = 0.126865671642
testing  misclassification = 0.185393258427


### Looks like we can either do KNN or Logistic Regression in this case with the selected  features and get a similar accuracy