# Titanic Practice

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [28]:
# Load the Titanic training CSV (the path is relative to the notebook's working directory).
titanic_train = pd.read_csv('../train.csv')
# titanic_test = pd.read_csv('../test.csv')
# test.csv is intentionally not loaded: the training file is treated as the
# full data set so that the train/test splitting steps can be practised on it.

In [29]:
# One-hot encode the categorical columns Sex and Pclass; the new columns are
# named Sex_<value> and Class_<value>.
# dtype=int pins the dummies to 0/1 integers. pandas >= 2.0 would otherwise emit
# booleans, which do not mix cleanly with the float columns (Age, Fare) when the
# frame is later converted to a NumPy array.
# NOTE: this cell consumes the Sex and Pclass columns, so it must run exactly once
# after the CSV is loaded; re-running it without reloading is expected to fail
# because those columns no longer exist.
titanic_train = pd.get_dummies(
    titanic_train,
    columns=['Sex', 'Pclass'],
    prefix=['Sex', 'Class'],
    dtype=int,
)

In [30]:
# (rows, columns) after encoding: the dummy columns replace Sex and Pclass.
titanic_train.shape

(891, 15)

In [31]:
# Preview the first 10 rows to check the new Sex_* and Class_* dummy columns.
titanic_train.head(10)

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_female,Sex_male,Class_1,Class_2,Class_3
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,0,1,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,1,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,1,0,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,1,0,1,0,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,0,1,0,0,1
5,6,0,"Moran, Mr. James",,0,0,330877,8.4583,,Q,0,1,0,0,1
6,7,0,"McCarthy, Mr. Timothy J",54.0,0,0,17463,51.8625,E46,S,0,1,1,0,0
7,8,0,"Palsson, Master. Gosta Leonard",2.0,3,1,349909,21.075,,S,0,1,0,0,1
8,9,1,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,0,2,347742,11.1333,,S,1,0,0,0,1
9,10,1,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,1,0,237736,30.0708,,C,1,0,0,1,0


In [44]:
# Keep the model simple: passenger class, sex, age and fare are the predictors,
# and Survived (kept as the last column) is the label.
# Class_3 and Sex_female are left out, presumably as the baseline level of each
# dummy group.
dataset = titanic_train.loc[
    :,
    [
        'Class_1',
        'Class_2',
        'Sex_male',
        'Age',
        'Fare',
        'Survived',
    ],
]

In [45]:
# Inspect the selected columns; Age still contains NaN values at this point.
print(dataset)

     Class_1  Class_2  Sex_male   Age     Fare  Survived
0          0        0         1  22.0   7.2500         0
1          1        0         0  38.0  71.2833         1
2          0        0         0  26.0   7.9250         1
3          1        0         0  35.0  53.1000         1
4          0        0         1  35.0   8.0500         0
..       ...      ...       ...   ...      ...       ...
886        0        1         1  27.0  13.0000         0
887        1        0         0  19.0  30.0000         1
888        0        0         0   NaN  23.4500         0
889        1        0         1  26.0  30.0000         1
890        0        0         1  32.0   7.7500         0

[891 rows x 6 columns]


## Deal with missing data

In [46]:
# Report how many missing values each column of `dataset` contains.
print(" \nCount total NaN at each column in a DataFrame : \n\n", dataset.isna().sum())

 
Count total NaN at each column in a DataFrame : 

 Class_1       0
Class_2       0
Sex_male      0
Age         177
Fare          0
Survived      0
dtype: int64


In [47]:
# Drop every row of `dataset` that has a missing value. Age is the only column
# with NaNs (177 of them), so 714 of the 891 rows remain.
dataset = dataset.dropna(axis=0, how='any')

In [49]:
# Re-inspect the frame after the rows with missing values were dropped.
print(dataset)

     Class_1  Class_2  Sex_male   Age     Fare  Survived
0          0        0         1  22.0   7.2500         0
1          1        0         0  38.0  71.2833         1
2          0        0         0  26.0   7.9250         1
3          1        0         0  35.0  53.1000         1
4          0        0         1  35.0   8.0500         0
..       ...      ...       ...   ...      ...       ...
885        0        0         0  39.0  29.1250         0
886        0        1         1  27.0  13.0000         0
887        1        0         0  19.0  30.0000         1
889        1        0         1  26.0  30.0000         1
890        0        0         1  32.0   7.7500         0

[714 rows x 6 columns]


## Split into X and y

In [58]:
# Predictors are every column except the last; the last column (Survived) is the label.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [51]:
# Feature matrix: the five predictor columns (Class_1, Class_2, Sex_male, Age, Fare).
print(X)

     Class_1  Class_2  Sex_male   Age     Fare
0          0        0         1  22.0   7.2500
1          1        0         0  38.0  71.2833
2          0        0         0  26.0   7.9250
3          1        0         0  35.0  53.1000
4          0        0         1  35.0   8.0500
..       ...      ...       ...   ...      ...
885        0        0         0  39.0  29.1250
886        0        1         1  27.0  13.0000
887        1        0         0  19.0  30.0000
889        1        0         1  26.0  30.0000
890        0        0         1  32.0   7.7500

[714 rows x 5 columns]


In [52]:
# Label vector: the Survived column.
print(y)

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 714, dtype: int64


## Split Data into Train and Test

In [59]:
from sklearn.model_selection import train_test_split

# Convert to plain NumPy arrays. np.asarray accepts both a pandas object and an
# existing ndarray, so this cell can be re-run safely; `.values` only exists on
# pandas objects and fails once X and y have already been converted.
# dtype=float guarantees a numeric feature matrix, which the in-place scaling of
# the Age and Fare columns relies on.
X = np.asarray(X, dtype=float)
y = np.asarray(y)

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Scale Variables

In [66]:
from sklearn.preprocessing import StandardScaler

# Columns 0-2 are the 0/1 dummy variables and are left untouched; only Age and
# Fare (columns 3 onward) are standardized.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into the scaling.
sc = StandardScaler()
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [67]:
# Scaled training features: columns 0-2 stay 0/1, columns 3-4 (Age, Fare) are standardized.
print(X_train)

[[ 0.          0.          0.         -1.02749969 -0.3792988 ]
 [ 0.          0.          1.         -0.40808323 -0.19791373]
 [ 0.          1.          1.          0.45221741 -0.16330468]
 ...
 [ 0.          0.          1.         -0.27043513 -0.26479337]
 [ 1.          0.          0.         -0.75220349  1.05417044]
 [ 0.          0.          1.         -0.61455538 -0.50378285]]


In [68]:
# Scaled test features, transformed with the scaler that was fitted on the training split.
print(X_test)

[[ 1.00000000e+00  0.00000000e+00  0.00000000e+00 -1.02749969e+00
   3.30391365e+00]
 [ 0.00000000e+00  0.00000000e+00  1.00000000e+00 -1.57809210e+00
  -1.04843456e-01]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00  2.11333229e-01
  -1.63304681e-01]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00 -1.09632374e+00
  -8.71498152e-02]
 [ 0.00000000e+00  0.00000000e+00  1.00000000e+00 -6.14555385e-01
  -4.99105956e-01]
 [ 0.00000000e+00  1.00000000e+00  1.00000000e+00  8.30749688e-01
  -4.06503376e-01]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00 -4.76907282e-01
  -3.91692576e-01]
 [ 1.00000000e+00  0.00000000e+00  1.00000000e+00  1.79428640e+00
  -1.53015505e-01]
 [ 0.00000000e+00  1.00000000e+00  1.00000000e+00  6.24277535e-01
  -4.06503376e-01]
 [ 0.00000000e+00  1.00000000e+00  1.00000000e+00 -4.08083231e-01
  -4.06503376e-01]
 [ 0.00000000e+00  1.00000000e+00  1.00000000e+00 -2.01611078e-01
  -1.63304681e-01]
 [ 0.00000000e+00  0.00000000e+00  1.00000000e+00 -5.45731333e-01

## Train Classification Models

In [69]:
# import packages from machine learning course
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [70]:
# Logistic regression, fitted on the training split (fit returns the estimator itself).
classifier_logistic = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier_logistic

LogisticRegression(random_state=0)

In [71]:
# k-nearest neighbours: 5 neighbours, Minkowski metric with p=2 (i.e. Euclidean distance).
classifier_knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)
classifier_knn

KNeighborsClassifier()

In [72]:
# Support vector machine with a linear kernel.
classifier_svm = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
classifier_svm

SVC(kernel='linear', random_state=0)

In [73]:
# Support vector machine with a non-linear (RBF) kernel.
classifier_svm2 = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
classifier_svm2

SVC(random_state=0)

In [74]:
# Decision tree that chooses its splits by the entropy criterion.
classifier_tree = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)
classifier_tree

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [75]:
# Random forest of 10 entropy-criterion trees.
classifier_rf = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
classifier_rf

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [76]:
# Gaussian naive Bayes with default settings.
classifier_nb = GaussianNB().fit(X_train, y_train)
classifier_nb

GaussianNB()

## Accuracy on Training Data?

In [77]:
# import packages
from sklearn.metrics import confusion_matrix, accuracy_score

In [78]:
# Logistic regression on the training split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_logistic.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred)

[[288  50]
 [ 63 170]]


0.8021015761821366

In [79]:
# k-nearest neighbours on the training split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_knn.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred)

[[301  37]
 [ 51 182]]


0.8458844133099825

In [80]:
# Linear-kernel SVM on the training split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_svm.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred)

[[286  52]
 [ 72 161]]


0.7828371278458844

In [81]:
# RBF-kernel SVM on the training split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_svm2.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred)

[[306  32]
 [ 75 158]]


0.8126094570928196

In [82]:
# Decision tree on the training split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_tree.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred)

[[338   0]
 [  8 225]]


0.9859894921190894

In [83]:
# Random forest on the training split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_rf.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred)

[[329   9]
 [  7 226]]


0.9719789842381786

In [84]:
# Gaussian naive Bayes on the training split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_nb.predict(X_train)
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_train, y_pred=y_pred)

[[267  71]
 [ 60 173]]


0.7705779334500875

## But more importantly: accuracy on the test set?

In [85]:
# Logistic regression on the held-out test split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_logistic.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[75 11]
 [20 37]]


0.7832167832167832

In [86]:
# k-nearest neighbours on the held-out test split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[81  5]
 [14 43]]


0.8671328671328671

In [87]:
# Linear-kernel SVM on the held-out test split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_svm.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[74 12]
 [21 36]]


0.7692307692307693

In [88]:
# RBF-kernel SVM on the held-out test split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_svm2.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[78  8]
 [20 37]]


0.8041958041958042

In [89]:
# Decision tree on the held-out test split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_tree.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[75 11]
 [12 45]]


0.8391608391608392

In [90]:
# Random forest on the held-out test split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_rf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[75 11]
 [16 41]]


0.8111888111888111

In [91]:
# Gaussian naive Bayes on the held-out test split.
# Confusion matrix: rows are the actual class, columns the predicted class.
y_pred = classifier_nb.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[71 15]
 [19 38]]


0.7622377622377622

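K-Nearest Neighbors achieves the highest test-set accuracy of the seven models (about 0.867), ahead of the decision tree (about 0.839). Note that this ranking comes from a single 80/20 split with only 143 test rows, so the differences between models may not be stable.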