In [1]:
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

In [2]:
import numpy as np
import pandas as pd

train_data = pd.read_csv('../PyLearning/titanic/train.csv')
test_data = pd.read_csv('../PyLearning/titanic/test.csv')

In [3]:
#observing the dataset

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
#examine missing values
num_vars = train_data.columns[train_data.dtypes != 'object']
cat_vars = train_data.columns[train_data.dtypes == 'object']
train_data[num_vars].isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Age            177
SibSp            0
Parch            0
Fare             0
dtype: int64

In [6]:
train_data[cat_vars].isnull().sum()

#so after examining the training set, the 'Age' and 'Embarked' columns need to be dealt with on missing values ('Cabin' will not be used)

Name          0
Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

In [7]:
#examine missing values (for test set)
num_vars_test = test_data.columns[test_data.dtypes != 'object']
cat_vars_test = test_data.columns[test_data.dtypes == 'object']
test_data[num_vars_test].isnull().sum()

PassengerId     0
Pclass          0
Age            86
SibSp           0
Parch           0
Fare            1
dtype: int64

In [8]:
test_data[cat_vars_test].isnull().sum()

#so after examining the testing set, the 'Age' and 'Fare' columns need to be dealt with on missing values

Name          0
Sex           0
Ticket        0
Cabin       327
Embarked      0
dtype: int64

In [9]:
train_data = train_data.dropna(subset = ['Embarked']) #drop two rows
#impute missing ages for both training and testing set, using mean
train_data['Age'].fillna(round(train_data['Age'].dropna().mean()), inplace = True)
test_data['Age'].fillna(round(train_data['Age'].dropna().mean()), inplace = True)

In [10]:
# to complete __ prediction, two columns of data need to be added to the dataframe 

train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1

train_data['personAlone'] = train_data['FamilySize'].map({1:1})
train_data.fillna(0, inplace = True)
test_data['personAlone'] = test_data['FamilySize'].map({1:1})
test_data.fillna(0, inplace = True)

In [11]:
#split age and fare into bands to prepare for normalization

#age

np.quantile(train_data['Age'], [.25, .5, .75], axis = 0)


array([22., 30., 35.])

In [12]:
train_data['Age'].agg(['min', 'max'])

min     0.42
max    80.00
Name: Age, dtype: float64

In [13]:
bins_age = [0, 22, 30, 35, 80]
labels = [0, 1, 2, 3]
train_data['AgeBand'] = pd.cut(train_data['Age'], bins = bins_age, labels = labels, right = False)
test_data['AgeBand'] = pd.cut(test_data['Age'], bins = bins_age, labels = labels, right = False)

In [14]:
#fare 

np.quantile(train_data['Fare'], [.25, .5, .75], axis = 0)

array([ 7.8958, 14.4542, 31.    ])

In [15]:
train_data['Fare'].agg(['min', 'max'])

min      0.0000
max    512.3292
Name: Fare, dtype: float64

In [16]:
bins_fare = [0, 8, 15, 31, 513]
train_data['FareBand'] = pd.cut(train_data['Fare'], bins = bins_fare, labels = labels, right = False)
test_data['FareBand'] = pd.cut(test_data['Fare'], bins = bins_fare, labels = labels, right = False)

In [17]:
#using StandardScaler to normalize

scaler = StandardScaler()
features = ['Pclass', 'AgeBand', 'FareBand', 'FamilySize']
train_data[features] = scaler.fit_transform(train_data[features])
test_data[features] = scaler.fit_transform(test_data[features])

In [18]:
#encode categorical variables

#map sexes to 0 and 1

train_data['Sex'] = train_data['Sex'].map({'male':1, 'female':0})
test_data['Sex'] = test_data['Sex'].map({'male':1, 'female':0})

#use dummies to replace 'Embarked'
train_data['Embarked'] = pd.get_dummies(train_data, columns = ['Embarked'])
test_data['Embarked'] = pd.get_dummies(test_data, columns = ['Embarked'])

In [19]:
#train test split
X_train = train_data.drop(columns = ['Survived','Name','Age','SibSp','Parch','Fare','PassengerId','Ticket','Cabin'])
y_train = train_data['Survived']
X_test = test_data.drop(columns = ['Name','Age','SibSp','Parch','Fare','Ticket','PassengerId','Cabin'])

In [20]:
X_train

Unnamed: 0,Pclass,Sex,Embarked,FamilySize,personAlone,AgeBand,FareBand
0,0.825209,1,1,0.057853,0.0,0.033558,-1.289125
1,-1.572211,0,2,0.057853,0.0,0.033558,1.349475
2,0.825209,0,3,-0.561804,1.0,0.033558,-1.289125
3,-1.572211,0,4,0.057853,0.0,0.033558,1.349475
4,0.825209,1,5,-0.561804,1.0,0.033558,-0.409592
...,...,...,...,...,...,...,...
886,-0.373501,1,887,-0.561804,1.0,0.033558,-0.409592
887,-1.572211,0,888,-0.561804,1.0,0.033558,0.469942
888,0.825209,0,889,1.297169,0.0,0.033558,0.469942
889,-1.572211,1,890,-0.561804,1.0,0.033558,0.469942


In [21]:
# use KNN

knn = KNeighborsClassifier()
params = {'n_neighbors':np.arange(2, 11),
         'p':[1,2],
         'weights':['uniform', 'distance']}
#initialize CV
cv = GridSearchCV(estimator = knn, param_grid = params, scoring = 'accuracy', cv = 5)

cv.fit(X_train, y_train)
cv.best_params_

{'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}

In [22]:
# cross validation (results not very good...)
knn_best = KNeighborsClassifier(n_neighbors = 9, p = 1, weights = 'uniform')
cv_res = cross_validate(knn_best, X_train, y_train, scoring = 'accuracy', cv = 5)
cv_res['test_score'].mean()

0.5265346283247635

In [23]:
# make predictions

knn_best.fit(X_train, y_train)
y_pred = knn_best.predict(X_test)
subs = pd.read_csv('../PyLearning/titanic/gender_submission.csv')
subs['Survived'] = y_pred
subs.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [None]:
# predicts are all 0