# Titanic Predictions

Here, I will try some classifiers to see which perform the best on the training data, and whichever performs the best will be the one I use for the test data.

First, I have to import the libraries.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

Import dataset

In [3]:
dataset = pd.read_csv("train.csv")

In [4]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Well, I probably don't need PassengerId, Ticket or Name for any predictions, so let's drop those columns.

I don't think Ticket, Cabin or Parch will have useful information either.

In [5]:
dataset=dataset.drop(["PassengerId","Name","Ticket","Cabin", "Parch", "SibSp"], axis=1)

In [6]:
dataset.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
Fare        float64
Embarked     object
dtype: object

Embarked has a few NaNs, so I will drop those.

In [7]:
dataset = dataset[dataset['Embarked'].notna()]

Ok, so Survived is our y. Our numerical features are Age, Fare and Sibsp, and categorical features are Pclass, Sex and Embarked 

I will first do in this file what I am used to, and in the next file look into the pipelines. In the course, everything starts with breaking it down into X and y, so not using a data frame instead applying the objects to a numpy array, which I kinda find weird now that I think about it.

In [8]:
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

In [9]:
print(X)
print(y)

[[3 'male' 22.0 7.25 'S']
 [1 'female' 38.0 71.2833 'C']
 [3 'female' 26.0 7.925 'S']
 ...
 [3 'female' nan 23.45 'S']
 [1 'male' 26.0 30.0 'C']
 [3 'male' 32.0 7.75 'Q']]
[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0
 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0
 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0
 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1
 0 

In [10]:
#Imputer to fill in missing numerical data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
#the missing values are np.nan, and we replace them with the 'mean' of other elements in the column
imputer.fit(X[:, 2:4])
X[:, 2:4] = imputer.transform(X[:, 2:4])

We have to wait to apply standard scaling until after we split the data into training and test sets.

In [11]:
#Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,1,4])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [12]:
print(X)

[[0.0 0.0 1.0 ... 1.0 22.0 7.25]
 [1.0 0.0 0.0 ... 0.0 38.0 71.2833]
 [0.0 0.0 1.0 ... 1.0 26.0 7.925]
 ...
 [0.0 0.0 1.0 ... 1.0 28.0 23.45]
 [1.0 0.0 0.0 ... 0.0 26.0 30.0]
 [0.0 0.0 1.0 ... 0.0 32.0 7.75]]


# Split into train and test sets

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Feature Scaling

In [14]:
X_train

array([[0.0, 0.0, 1.0, ..., 1.0, 28.0, 8.05],
       [0.0, 0.0, 1.0, ..., 1.0, 19.0, 10.1708],
       [0.0, 0.0, 1.0, ..., 0.0, 28.0, 7.75],
       ...,
       [0.0, 0.0, 1.0, ..., 0.0, 26.0, 14.4542],
       [0.0, 1.0, 0.0, ..., 1.0, 44.0, 26.0],
       [0.0, 0.0, 1.0, ..., 1.0, 21.0, 8.05]], dtype=object)

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, -2:] = sc.fit_transform(X_train[:, -2:])
X_test[:, -2:] = sc.transform(X_test[:, -2:])

# Classifier Models

I realized I forgot to include the "Embarked" column in my original code. I added it back in and now logistic regression is better?

In [16]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rfc.fit(X_train, y_train)
y_p1= rfc.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_p1)
print(cm)
accuracy_score(y_test, y_p1)

[[92 13]
 [22 51]]


0.8033707865168539

In [17]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 80.02 %
Standard Deviation: 2.45 %


I think this is better than logistic regression, but let's see.

In [18]:
from sklearn.linear_model import LogisticRegression
logR = LogisticRegression(random_state = 0)
logR.fit(X_train, y_train)
y_p2= logR.predict(X_test)
cm = confusion_matrix(y_test, y_p2)
print(cm)
accuracy_score(y_test, y_p2)

[[90 15]
 [18 55]]


0.8146067415730337

In [19]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 6, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)
y_p3= knn.predict(X_test)
cm = confusion_matrix(y_test, y_p3)
print(cm)
accuracy_score(y_test, y_p3)

[[95 10]
 [26 47]]


0.797752808988764

In [20]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dtc.fit(X_train, y_train)
y_p4= dtc.predict(X_test)
cm = confusion_matrix(y_test, y_p4)
print(cm)
accuracy_score(y_test, y_p4)

[[88 17]
 [22 51]]


0.7808988764044944

In [21]:
from sklearn.svm import SVC
ksvm = SVC(kernel = 'rbf', random_state = 0)
ksvm.fit(X_train, y_train)
y_p5= ksvm.predict(X_test)
cm = confusion_matrix(y_test, y_p5)
print(cm)
accuracy_score(y_test, y_p5)

[[98  7]
 [25 48]]


0.8202247191011236

In [22]:
from sklearn.svm import SVC
svm = SVC(kernel = 'linear', random_state = 0)
svm.fit(X_train, y_train)
y_p5= svm.predict(X_test)
cm = confusion_matrix(y_test, y_p5)
print(cm)
accuracy_score(y_test, y_p5)

[[93 12]
 [18 55]]


0.8314606741573034

In [23]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
y_p6= nb.predict(X_test)
cm = confusion_matrix(y_test, y_p6)
print(cm)
accuracy_score(y_test, y_p6)

[[84 21]
 [13 60]]


0.8089887640449438

## Cross Validation

In [24]:
from sklearn.model_selection import cross_val_score

We can see which is best using k-fold cross validation.

In [25]:
cl_list = [rfc,logR,knn,dtc,ksvm,svm,nb]

for clf in cl_list:
    print(clf)
    accuracies = cross_val_score(estimator = clf, X = X_train, y = y_train, cv = 10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    print()


RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
Accuracy: 80.02 %
Standard Deviation: 2.45 %

LogisticRegression(random_state=0)
Accuracy: 79.04 %
Standard Deviation: 2.59 %

KNeighborsClassifier(n_neighbors=6)
Accuracy: 80.16 %
Standard Deviation: 4.09 %

DecisionTreeClassifier(criterion='entropy', random_state=0)
Accuracy: 76.22 %
Standard Deviation: 5.00 %

SVC(random_state=0)
Accuracy: 81.71 %
Standard Deviation: 3.47 %

SVC(kernel='linear', random_state=0)
Accuracy: 77.49 %
Standard Deviation: 3.36 %

GaussianNB()
Accuracy: 75.95 %
Standard Deviation: 2.97 %



Hmmmmm, of these, KSVM with rbf is the best, but I guess I could do some parameter tweaking for logistic regression and KNN...

Wow, with the original pipe method, you can tweak the preprocessor settings too thats cool.

In [26]:
#param_grid = {
#    "preprocessor__num__imputer__strategy": ["mean", "median"],
#    "classifier__C": [0.1, 1.0, 10, 100],
#}

#grid_search = GridSearchCV(logR, param_grid, cv=10)
#grid_search

In [27]:
grid_search = GridSearchCV(logR,{"C": [0.1, 1.0, 10, 100]}, cv=10)
grid_search

In [28]:
grid_search.fit(X_train, y_train)

print("Best params:")
print(grid_search.best_params_)

Best params:
{'C': 0.1}


In [29]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[
    [
        "mean_test_score",
        "std_test_score",
        "param_C",
    ]
].head(5)

Unnamed: 0,mean_test_score,std_test_score,param_C
0,0.794621,0.03181,0.1
2,0.793192,0.023995,10.0
3,0.793192,0.023995,100.0
1,0.790376,0.025929,1.0


In [30]:
logR=LogisticRegression(C=0.1)
print(logR)
accuracies = cross_val_score(estimator = logR, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
print()

LogisticRegression(C=0.1)
Accuracy: 79.46 %
Standard Deviation: 3.18 %



Well... it went from 79.04 to 79.46 so I guess that is a slight improvement

In [31]:
grid_search = GridSearchCV(rfc,{"n_estimators": [5, 10, 15, 20, 30]}, cv=10)
grid_search.fit(X_train, y_train)

print("Best params:")
print(grid_search.best_params_)


Best params:
{'n_estimators': 20}


In [32]:
rfc = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)
print(rfc)
accuracies = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
print()

RandomForestClassifier(criterion='entropy', n_estimators=20, random_state=0)
Accuracy: 80.73 %
Standard Deviation: 2.63 %



Well, that still leaves KSVM with RBF as the best model so far. I thought it would be rfc...

In [33]:
from sklearn.svm import SVC
ksvm = SVC(kernel = 'rbf', random_state = 0)
ksvm.fit(X_train, y_train)
y_p5= ksvm.predict(X_test)
cm = confusion_matrix(y_test, y_p5)
print(cm)
accuracy_score(y_test, y_p5)

[[98  7]
 [25 48]]


0.8202247191011236

In [34]:
y_p5

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0])

So... gotta export this I guess. But what is actually expected? They have an example file so lets see what that is.

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [36]:
y_p5

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0])

In [37]:
X_test

array([[0.0, 1.0, 0.0, ..., 1.0, 0.49352544885255145,
        -0.4244901451208124],
       [0.0, 1.0, 0.0, ..., 1.0, 1.5568300491794511,
        -0.10127665229686339],
       [0.0, 0.0, 1.0, ..., 1.0, 1.4049293919898942, -0.5524279906110798],
       ...,
       [1.0, 0.0, 0.0, ..., 0.0, -0.5697791514743482, 2.6244896417658943],
       [0.0, 0.0, 1.0, ..., 1.0, -0.5697791514743482,
        -0.5475598981576237],
       [0.0, 1.0, 0.0, ..., 1.0, -0.4178784942847912,
        -0.48664658604849487]], dtype=object)