Use the Titanic dataset and evaluate if KNN produces a good classifier for this dataset

In [46]:
import pandas as pd
import numpy as np
from sklearn import neighbors
from sklearn import cross_validation
from sklearn import metrics

In [2]:
# Load data
df = pd.read_csv('../../assets/data/titanic.csv')

In [3]:
# Keep only the features that matter (i.e. those that make sense to predict on).
# Start by previewing the first rows of the raw frame.

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# For every column, count its distinct non-null values: value_counts() yields one
# row per distinct value, and count() tallies those rows.
[(name, df[name].value_counts().count()) for name in df.columns]

[('PassengerId', 891),
 ('Survived', 2),
 ('Pclass', 3),
 ('Name', 891),
 ('Sex', 2),
 ('Age', 88),
 ('SibSp', 7),
 ('Parch', 7),
 ('Ticket', 681),
 ('Fare', 248),
 ('Cabin', 147),
 ('Embarked', 3)]

In [6]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Make sure Fare is stored as a floating-point column.
df['Fare'] = df['Fare'].astype(float)

In [11]:
df[df['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [35]:
df['Embarked'].mode()[0]

'S'

In [34]:
# Look up Embarked for the row labelled 0.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
df.loc[0, 'Embarked']

'S'

In [27]:
# Column-wise mode over the rows whose Embarked value is present.
df[df['Embarked'].notnull()].mode()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,,0.0,3.0,,male,29.699118,0.0,0.0,1601,8.05,B96 B98,S
1,,,,,,,,,347082,,C23 C25 C27,
2,,,,,,,,,CA. 2343,,G6,


In [36]:
# Fill the missing Embarked entries with the column's most frequent value (its mode).
# .loc with a boolean mask replaces the deprecated .ix indexer (removed in pandas 1.0).
df.loc[df['Embarked'].isnull(), 'Embarked'] = df['Embarked'].mode()[0]

In [32]:
df[df['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [33]:
df['Age'].mean()

29.699117647058763

In [16]:
# Fill the missing Age entries with the mean of the known ages
# (Series.mean() skips NaN by default).
# .loc with a boolean mask replaces the deprecated .ix indexer (removed in pandas 1.0).
df.loc[df['Age'].isnull(), 'Age'] = df['Age'].mean()

In [15]:
# Mean Age per Sex.
# NOTE(review): the saved output below is indexed by (Sex, Embarked), which this
# single-key groupby cannot produce -- the output is stale; re-run the cell.
# NOTE(review): this cell sits after the Age imputation, so on a top-to-bottom run
# these means include the imputed values.
df.groupby(by=['Sex'])[['Age']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
Sex,Embarked,Unnamed: 2_level_1
female,C,28.344262
female,Q,24.291667
female,S,27.771505
male,C,32.998841
male,Q,30.9375
male,S,30.29144


In [4]:
# Get it ready for classification (dummies + nan)
df.columns


Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')

In [None]:
# Preview the candidate predictor columns (PassengerId, Name and Survived are left out).
# reindex returns a new frame and the result is not assigned, so df itself is unchanged;
# .head() keeps the display to the first rows instead of dumping the whole frame.
df.reindex(columns=[u'Pclass', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked']).head()

In [41]:
# Target vector: the Survived column.
y = df['Survived']

In [39]:
# Start the feature matrix with the two continuous columns; the one-hot encoded
# categorical columns are appended to X in the following cell.
X = df[['Age','Fare']]

In [40]:
# One-hot encode each categorical predictor and append the indicator columns to X.
# Columns are added in the order listed here, each prefixed with its source column name.
categories = [u'Pclass', u'Sex', u'SibSp', u'Parch', u'Embarked']

indicator_frames = [pd.get_dummies(df[name], prefix=name) for name in categories]
X = pd.concat([X] + indicator_frames, axis=1)
    

In [42]:
# Stratified 5-fold splitter built from the labels y.
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; on newer versions use sklearn.model_selection.StratifiedKFold(n_splits=5).
# NOTE(review): the prompt at the end of this notebook asks for 10-fold cross
# validation, while this uses 5 folds.
cv = cross_validation.StratifiedKFold(y, n_folds=5)

In [43]:
# k-nearest-neighbours classifier that uses the 3 closest training points.
# NOTE(review): X mixes raw Age/Fare values with 0/1 indicator columns and is not
# scaled, so distances are likely dominated by Fare and Age -- consider standardising.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)

In [44]:
# Score the classifier on each fold of cv; the result is one score per fold.
cross_validation.cross_val_score(estimator=knn, X=X, y=y, cv=cv)

array([ 0.67039106,  0.69832402,  0.73595506,  0.73595506,  0.68361582])

In [47]:
# Out-of-fold prediction for every row, produced with the same classifier and folds.
y_pred = cross_validation.cross_val_predict(knn, X=X, y=y, cv=cv)

# Overall accuracy, then the confusion matrix of true vs. predicted labels.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
print(metrics.accuracy_score(y_true=y, y_pred=y_pred))
print(metrics.confusion_matrix(y_true=y, y_pred=y_pred))

0.704826038159
[[437 112]
 [151 191]]


In [19]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the saved output reports Embarked with 889 non-null values, so it
# predates the Embarked imputation earlier in the notebook; re-run top to bottom
# to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [None]:
# get X and y



In [None]:
# Cross validate knn with 10 fold cross validation

