In [27]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

In [36]:
# Load the training table (the first CSV column becomes the index) and derive
# `Deck` as the first character of every non-null `Cabin` value.
csv_path = os.path.join('data', 'train.csv')
df = pd.read_csv(csv_path, index_col=0)
df['Deck'] = df['Cabin'].dropna().str[0]

# Integer-encode the categorical columns. Missing entries are replaced by the
# literal string 'NaN' first, so they end up as a category of their own.
# The fitted encoders are kept in `encs`, keyed by column name.
encs = {}
for col in ('Sex', 'Embarked', 'Deck'):
    encoder = LabelEncoder()
    filled = df[col].fillna('NaN')
    df[col] = encoder.fit_transform(filled)
    encs[col] = encoder
    print(col, encoder.classes_)

Sex ['female' 'male']
Embarked ['C' 'NaN' 'Q' 'S']
Deck ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'NaN' 'T']


In [37]:
# Impute missing age values by sampling, with replacement, from the observed
# (non-null) ages.
# - All replacements are drawn in a single vectorized call instead of
#   re-filtering the column once per row.
# - The draw is seeded so that re-running the notebook gives the same frame.
# - Drawn ages are truncated to whole numbers, as the per-row int() did before.
age_is_missing = df['Age'].isna()
if age_is_missing.any():
    observed_ages = df['Age'].dropna()
    drawn_ages = observed_ages.sample(
        n=int(age_is_missing.sum()),
        replace=True,
        random_state=0,
    )
    # .to_numpy() drops the sampled index so values are assigned by position.
    df.loc[age_is_missing, 'Age'] = drawn_ages.to_numpy().astype(int)

In [89]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,3,7
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0,2
3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,3,7
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,3,2
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,3,7


In [38]:
# Feature matrix (selected predictor columns) and target vector (`Survived`).
X = df.loc[:, [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
    'Deck',
]]
y = df.loc[:, 'Survived']

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows (fixed seed) and score a linear-kernel SVC on them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7787114845938375

In [63]:
# 5-fold cross-validated accuracy of a linear SVM.
# `max_iter` is given as an integer: scikit-learn documents it as an int, and
# recent releases reject a float such as 1e5 during parameter validation.
clf = svm.LinearSVC(C=0.01, max_iter=100000)
scores = cross_val_score(clf, X, y, cv=5)
# Report the mean accuracy and twice the standard deviation across folds.
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.789 (+/- 0.029)


In [98]:
from sklearn.neighbors import KNeighborsClassifier

# 5-fold cross-validated accuracy of a 35-nearest-neighbours classifier.
clf = KNeighborsClassifier(n_neighbors=35)
scores = cross_val_score(clf, X, y, cv=5)
mean_accuracy = scores.mean()
two_sigma = scores.std() * 2
print("Accuracy: %0.3f (+/- %0.3f)" % (mean_accuracy, two_sigma))

Accuracy: 0.696 (+/- 0.085)


In [62]:
# 5-fold cross-validated accuracy of an RBF-kernel SVC with C=1e4.
clf = svm.SVC(kernel='rbf', C=1e4)
scores = cross_val_score(clf, X, y, cv=5)
summary = (scores.mean(), scores.std() * 2)
print("Accuracy: %0.3f (+/- %0.3f)" % summary)

Accuracy: 0.802 (+/- 0.024)


In [75]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validated accuracy of a 400-tree random forest.
# The forest is seeded (same seed as the train/test split earlier in this
# notebook) so that the reported accuracy is reproducible across runs.
clf = RandomForestClassifier(n_estimators=400, random_state=0)
scores = cross_val_score(clf, X, y, cv=5)
# Report the mean accuracy and twice the standard deviation across folds.
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.810 (+/- 0.053)


In [158]:
import lightgbm as lgbm

# 5-fold cross-validated accuracy of a LightGBM classifier.
clf = lgbm.LGBMClassifier(
    num_leaves=21,
    max_depth=-1,
    learning_rate=0.01,
    n_estimators=500,
    min_child_samples=20,
)
scores = cross_val_score(clf, X, y, cv=5)
fold_mean = scores.mean()
fold_spread = scores.std() * 2
print("Accuracy: %0.3f (+/- %0.3f)" % (fold_mean, fold_spread))

Accuracy: 0.833 (+/- 0.066)
