# Titanic Predictions

Here, I will try some classifiers to see which perform the best on the training data, and whichever performs the best will be the one I use for the test data.

First, I have to import the libraries.

In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Import dataset

In [34]:
dataset = pd.read_csv("train.csv")

In [35]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Well, I probably don't need PassengerId, Ticket or Name for any predictions, so let's drop those columns.

I don't think Ticket will have useful information either.

In [36]:
dataset["Ticket"]

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
             ...       
886              211536
887              112053
888          W./C. 6607
889              111369
890              370376
Name: Ticket, Length: 891, dtype: object

In [37]:
dataset=dataset.drop(["PassengerId","Name","Ticket"], axis=1)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


Now, need to convert "Sex", "Cabin" and "Embarked" into levels, normalize "Fare" and deal with NaNs

In [42]:
dataset["Cabin"].isnull().sum()

687

There are a lot of NaNs...

I'm not really sure what this means, I am going to interpret it as "None".

In [63]:
dataset["Cabin"] = dataset["Cabin"].fillna("None")
dataset["Cabin"]

0      None
1       C85
2      None
3      C123
4      None
       ... 
886    None
887     B42
888    None
889    C148
890    None
Name: Cabin, Length: 891, dtype: object

In [64]:
dataset.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin         0
Embarked      2
dtype: int64

In [58]:
dataset[ dataset["Age"].isna() ]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,0,3,male,,0,0,8.4583,,Q
17,1,2,male,,0,0,13.0000,,S
19,1,3,female,,0,0,7.2250,,C
26,0,3,male,,0,0,7.2250,,C
28,1,3,female,,0,0,7.8792,,Q
...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,,C
863,0,3,female,,8,2,69.5500,,S
868,0,3,male,,0,0,9.5000,,S
878,0,3,male,,0,0,7.8958,,S


If I just replace the age with 0, it will really skew the results...

I will here replace these with the average age.

I think a better way may be to take a sample of the rest of the ages, but I can try that later.

In [71]:
dataset["Age"] = dataset["Age"].fillna(dataset["Age"].mean())

In [72]:
dataset.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    2
dtype: int64

There are a couple that have NaN as Embarked

In [59]:
dataset[ dataset["Embarked"].isna() ]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
61,1,1,female,38.0,0,0,80.0,B28,
829,1,1,female,62.0,0,0,80.0,B28,


I will just take these off, and these are the only NaN left so I can do...

In [83]:
dataset=dataset.dropna()
dataset.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [84]:
dataset

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.000000,1,0,7.2500,,S
1,1,1,female,38.000000,1,0,71.2833,C85,C
2,1,3,female,26.000000,0,0,7.9250,,S
3,1,1,female,35.000000,1,0,53.1000,C123,S
4,0,3,male,35.000000,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,,S
887,1,1,female,19.000000,0,0,30.0000,B42,S
888,0,3,female,29.699118,1,2,23.4500,,S
889,1,1,male,26.000000,0,0,30.0000,C148,C


So all of the NaN are dealt with. We can now split the data into training and test sets.

## Splitting data into training and test sets

In [87]:
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, 0].values
#print(X)
#print(y)

In [88]:
print(X)

[[3 'male' 22.0 ... 0 7.25 'None']
 [1 'female' 38.0 ... 0 71.2833 'C85']
 [3 'female' 26.0 ... 0 7.925 'None']
 ...
 [3 'female' 29.69911764705882 ... 2 23.45 'None']
 [1 'male' 26.0 ... 0 30.0 'C148']
 [3 'male' 32.0 ... 0 7.75 'None']]


In [89]:
print(y)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0
 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0
 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0
 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1
 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0
 0 1 1 0 1 0 0 1 0 0 0 0 

Now, we have to encode the categorical information.

## Encoding Categorical Information

[[3 'male' 22.0 ... 0 7.25 'None']
 [1 'female' 38.0 ... 0 71.2833 'C85']
 [3 'female' 26.0 ... 0 7.925 'None']
 ...
 [3 'female' 29.69911764705882 ... 2 23.45 'None']
 [1 'male' 26.0 ... 0 30.0 'C148']
 [3 'male' 32.0 ... 0 7.75 'None']]


In [154]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, 0].values
#print(X)
#print(y)
print(X)
print()

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(categories="auto"), [1,6])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

[[3 'male' 22.0 ... 0 7.25 'None']
 [1 'female' 38.0 ... 0 71.2833 'C85']
 [3 'female' 26.0 ... 0 7.925 'None']
 ...
 [3 'female' 29.69911764705882 ... 2 23.45 'None']
 [1 'male' 26.0 ... 0 30.0 'C148']
 [3 'male' 32.0 ... 0 7.75 'None']]

  (0, 1)	1.0
  (0, 147)	1.0
  (0, 149)	3.0
  (0, 150)	22.0
  (0, 151)	1.0
  (0, 153)	7.25
  (1, 0)	1.0
  (1, 82)	1.0
  (1, 149)	1.0
  (1, 150)	38.0
  (1, 151)	1.0
  (1, 153)	71.2833
  (2, 0)	1.0
  (2, 147)	1.0
  (2, 149)	3.0
  (2, 150)	26.0
  (2, 153)	7.925
  (3, 0)	1.0
  (3, 56)	1.0
  (3, 149)	1.0
  (3, 150)	35.0
  (3, 151)	1.0
  (3, 153)	53.1
  (4, 1)	1.0
  (4, 147)	1.0
  :	:
  (884, 149)	2.0
  (884, 150)	27.0
  (884, 153)	13.0
  (885, 0)	1.0
  (885, 31)	1.0
  (885, 149)	1.0
  (885, 150)	19.0
  (885, 153)	30.0
  (886, 0)	1.0
  (886, 147)	1.0
  (886, 149)	3.0
  (886, 150)	29.69911764705882
  (886, 151)	1.0
  (886, 152)	2.0
  (886, 153)	23.45
  (887, 1)	1.0
  (887, 61)	1.0
  (887, 149)	1.0
  (887, 150)	26.0
  (887, 153)	30.0
  (888, 1)	1.0
  (888, 14

In [151]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, 0].values
#print(X)
#print(y)
print(X)
print()

[[3 'male' 22.0 ... 0 7.25 'None']
 [1 'female' 38.0 ... 0 71.2833 'C85']
 [3 'female' 26.0 ... 0 7.925 'None']
 ...
 [3 'female' 29.69911764705882 ... 2 23.45 'None']
 [1 'male' 26.0 ... 0 30.0 'C148']
 [3 'male' 32.0 ... 0 7.75 'None']]



In [155]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, 0].values
#print(X)
#print(y)
print(X)
print()

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(categories="auto"), [1,6])], remainder='passthrough')
print(ct.fit_transform(X))

[[3 'male' 22.0 ... 0 7.25 'None']
 [1 'female' 38.0 ... 0 71.2833 'C85']
 [3 'female' 26.0 ... 0 7.925 'None']
 ...
 [3 'female' 29.69911764705882 ... 2 23.45 'None']
 [1 'male' 26.0 ... 0 30.0 'C148']
 [3 'male' 32.0 ... 0 7.75 'None']]

  (0, 1)	1.0
  (0, 147)	1.0
  (0, 149)	3.0
  (0, 150)	22.0
  (0, 151)	1.0
  (0, 153)	7.25
  (1, 0)	1.0
  (1, 82)	1.0
  (1, 149)	1.0
  (1, 150)	38.0
  (1, 151)	1.0
  (1, 153)	71.2833
  (2, 0)	1.0
  (2, 147)	1.0
  (2, 149)	3.0
  (2, 150)	26.0
  (2, 153)	7.925
  (3, 0)	1.0
  (3, 56)	1.0
  (3, 149)	1.0
  (3, 150)	35.0
  (3, 151)	1.0
  (3, 153)	53.1
  (4, 1)	1.0
  (4, 147)	1.0
  :	:
  (884, 149)	2.0
  (884, 150)	27.0
  (884, 153)	13.0
  (885, 0)	1.0
  (885, 31)	1.0
  (885, 149)	1.0
  (885, 150)	19.0
  (885, 153)	30.0
  (886, 0)	1.0
  (886, 147)	1.0
  (886, 149)	3.0
  (886, 150)	29.69911764705882
  (886, 151)	1.0
  (886, 152)	2.0
  (886, 153)	23.45
  (887, 1)	1.0
  (887, 61)	1.0
  (887, 149)	1.0
  (887, 150)	26.0
  (887, 153)	30.0
  (888, 1)	1.0
  (888, 14

In [13]:
#Easier way above
#X = dataset.loc[:, ~dataset.columns.isin(['Survived'])].values
#y = dataset.loc[:, dataset.columns.isin(['Survived'])].values

Now, split into test and training sets.

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [20]:
print(X_train)

[[3 'male' 28.0 ... '349207' 7.8958 nan]
 [3 'female' 17.0 ... '3101281' 7.925 nan]
 [3 'male' 30.0 ... 'A/5. 3336' 16.1 nan]
 ...
 [3 'male' nan ... '334912' 7.7333 nan]
 [3 'female' 36.0 ... '345572' 17.4 nan]
 [2 'male' 60.0 ... '29750' 39.0 nan]]


In [23]:
dataset.loc[:, dataset.columns.isin(['Ticket'])]

Unnamed: 0,Ticket
0,A/5 21171
1,PC 17599
2,STON/O2. 3101282
3,113803
4,373450
...,...
886,211536
887,112053
888,W./C. 6607
889,111369
