In [1]:
cd ..

/Users/davidtan/Code/thoughtworks/beach-projects/ai-sg-workshop/clean-code-ml


In [2]:
#source: https://www.kaggle.com/bhaveshsk/getting-started-with-titanic-dataset/data
#data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

#data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#machine learning packages
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [3]:
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")
df = pd.concat([train_df,test_df], sort=True)

df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [4]:
from src.preprocessing import add_derived_title

df = add_derived_title(df)

df['Title'] = df['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}).fillna(0)

In [5]:
freq_port = df.Embarked.dropna().mode()[0]
df['Embarked'] = df['Embarked'].fillna(freq_port)

In [6]:
# EXERCISE 2: Write a unit test and extract the following implementation into a function: 
# df = impute_nans(df, columns)

# 'Fare' column
df['Fare'] = df['Fare'].fillna(df['Fare'].dropna().median())

# 'Age' column
df['Age'] = df['Age'].fillna(df['Age'].dropna().median())

In [7]:
df['Sex'] = df['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [8]:
df['AgeBand'] = pd.cut(df['Age'], 5)

df.loc[ df['Age'] <= 16, 'Age'] = 0
df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3

df = df.drop(['AgeBand'], axis=1)

In [9]:
# EXERCISE 3: Write a unit test and extract the following implementation into a function: 
# df = add_is_alone_column(df)
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

df['IsAlone'] = 0
df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

In [10]:
# drop unused columns
df = df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
df = df.drop(['Ticket', 'Cabin'], axis=1)
df = df.drop(['Name', 'PassengerId'], axis=1)

In [11]:
df['Age*Class'] = df.Age * df.Pclass

In [12]:
df['Embarked'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [13]:
df['FareBand'] = pd.qcut(df['Fare'], 4)

df.loc[ df['Fare'] <= 7.91, 'Fare'] = 0
df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
df.loc[ df['Fare'] > 31, 'Fare'] = 3
df['Fare'] = df['Fare'].astype(int)

df = df.drop(['FareBand'], axis=1)

In [14]:
train_df = df[-df['Survived'].isna()]
test_df = df[df['Survived'].isna()]
test_df = test_df.drop('Survived', axis=1)

In [15]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()

In [16]:
# EXERCISE 1: Create a function, train_model(...), to eliminate the duplication in the next few cells
from src.preprocessing import train_model

_, acc_svc           = train_model(SVC, X_train, Y_train, gamma='auto')
_, acc_knn           = train_model(KNeighborsClassifier, X_train, Y_train, n_neighbors=3)
_, acc_gaussian      = train_model(GaussianNB, X_train, Y_train)
_, acc_perceptron    = train_model(Perceptron, X_train, Y_train)
_, acc_sgd           = train_model(SGDClassifier, X_train, Y_train)
_, acc_decision_tree = train_model(DecisionTreeClassifier, X_train, Y_train)
_, acc_random_forest = train_model(RandomForestClassifier, X_train, Y_train, n_estimators=100)

accuracy (SVC): 83.84
accuracy (KNeighborsClassifier): 84.51
accuracy (GaussianNB): 71.83
accuracy (Perceptron): 75.42
accuracy (SGDClassifier): 74.41
accuracy (DecisionTreeClassifier): 86.98
accuracy (RandomForestClassifier): 86.98


In [17]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
2,Random Forest,86.98
6,Decision Tree,86.98
1,KNN,84.51
0,Support Vector Machines,83.84
4,Perceptron,75.42
5,Stochastic Gradient Decent,74.41
3,Naive Bayes,71.83
