In [1]:
#get the data
import os
os.chdir("Location")

import pandas as pd
titanic_train = pd.read_csv("train.csv")
titanic_test = pd.read_csv("test.csv")

In [2]:
#drop ticket, passengerid, name, and cabin column from both sets.
#These are not useful fields. The first three are completely nominal rather than categorical.
    #The fourth might have some categorical element in it, but I couldn't find one.
#omit the rows where a column is missing only a value or two
train_set = titanic_train.drop(['Ticket','PassengerId','Name', 'Cabin'], axis = 1).dropna(subset=['Embarked'])
test_set = titanic_test.drop(['Ticket','PassengerId','Name', 'Cabin'], axis = 1)
#Embarked IS categorical, so, in theory I could create an additinal value of "missing",
    #but I don't see the point because there are no "missing" values in that column in the test set

In [3]:
#Now split train into train and val before doing any imputation
from sklearn.model_selection import train_test_split
train_set, validation_set = train_test_split(train_set, test_size=.2, random_state=42)
train_features = train_set.drop(['Survived'], axis = 1)
train_values = train_set['Survived']
validation_features = validation_set.drop(['Survived'], axis = 1)
validation_values = validation_set['Survived']
test_features = test_set.copy()

In [4]:
#Which features are numerical and which are categorical?
train_features_num = list(train_features.drop(["Pclass", 'Embarked', 'Sex'], axis = 1))
train_features_cat = list(train_features[['Pclass', 'Embarked', 'Sex']])

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

cat_pipe = ColumnTransformer([
    ('onehot', OneHotEncoder(), train_features_cat)], remainder = 'passthrough')

train_processed = cat_pipe.fit_transform(train_features)
#When I do it this way then I lose everything else

#The structure for train_processed is now
    #pclass for 3, embarked for 3 more, sex for 2, age for 1, sibsp for 1, parch for 1, fare for 1 makes 12
#print(type(train_processed)) #ndarray that I can put into the gridsearch
#Still gotta impute age

In [6]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
#Need RandomForestClassifier (not Regressor)
from sklearn.model_selection import GridSearchCV

# Simple pipeline which has an imputer followed by regressor
pipe = Pipeline(steps=[('impute', SimpleImputer(missing_values=np.nan)),
                       ('regressor', RandomForestClassifier())])

# 3 different imputers
param_grid = {
        'impute__strategy': ["mean", "median", "most_frequent"]
        }

# Run girdsearch
search = GridSearchCV(pipe, param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 4, n_jobs = 6)
search.fit(train_processed, train_values)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   8 out of  15 | elapsed:    7.0s remaining:    6.1s
[Parallel(n_jobs=6)]: Done  12 out of  15 | elapsed:    7.2s remaining:    1.7s
[Parallel(n_jobs=6)]: Done  15 out of  15 | elapsed:    7.3s finished


Best parameter (CV score=-0.177):
{'impute__strategy': 'mean'}


In [7]:
pred_table_train = pd.crosstab(train_values, search.predict(train_processed))

In [8]:
print(pred_table_train.to_numpy())

[[438   2]
 [  8 263]]


In [9]:
print(np.diagonal(pred_table_train).sum()/np.sum(np.sum(pred_table_train)))
#Over 98 percent That's pretty good. I'd prefer 99, but I'll take what I'm getting.

0.9859353023909986


In [10]:
#But all of that's just on the training data.
#I need to run it on the validation set to get an idea of how it performs on
#unseen data
validation_features_num = list(validation_features.drop(["Pclass", 'Embarked', 'Sex'], axis = 1))
validation_features_cat = list(validation_features[['Pclass', 'Embarked', 'Sex']])

validation_processed = cat_pipe.fit_transform(validation_features)

# Run girdsearch
search = GridSearchCV(pipe, param_grid, cv = 5, scoring = 'neg_mean_squared_error', verbose = 4, n_jobs = 6)
search.fit(validation_processed, validation_values)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

pred_table_validation = pd.crosstab(validation_values, search.predict(validation_processed))
print(pred_table_validation.to_numpy())
print(np.diagonal(pred_table_validation).sum()/np.sum(np.sum(pred_table_validation)))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   8 out of  15 | elapsed:    0.8s remaining:    0.7s
[Parallel(n_jobs=6)]: Done  12 out of  15 | elapsed:    0.9s remaining:    0.1s
[Parallel(n_jobs=6)]: Done  15 out of  15 | elapsed:    1.3s finished


Best parameter (CV score=-0.236):
{'impute__strategy': 'mean'}
[[108   1]
 [  0  69]]
0.9943820224719101


In [11]:
#99.44% is better than 98.45%
#So what does Random Forest predict?
#Test predictios
test_processed = cat_pipe.fit_transform(test_features)
test_predictions = search.predict(test_processed)
print(test_predictions)
print(len(test_predictions))

[0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 0 1 0 0 0 1
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 1 0 0 1 0
 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 1 0 1 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 1 1 1 0 0 1 0
 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 0 1 1 0 0 1 0 0 0]
418


In [20]:
#Interesting. I thought that preprocessing on the train set mixed up the order.
to_submit = np.stack((titanic_test['PassengerId'],test_predictions), axis = 1)
print(to_submit.shape)
print(to_submit[:7])

(418, 2)
[[892   0]
 [893   0]
 [894   0]
 [895   1]
 [896   0]
 [897   0]
 [898   1]]


In [None]:
#Now write it out to a CSV
import os
print(os.getcwd())

import csv

header = ['PassengerId', 'Survived']

with open('Titanic_Predictions_H.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write multiple rows
    writer.writerows(to_submit)

In [None]:
#Kaggle Results

'''
43158
THILL3
0.74162
'''
#74% could use some work

In [None]:
#Suggestions
'''
Deal with numeric outliers
Consider catgorizing some numeric variables
Ticket prefix might helpUse peoples' titles (Mr, Mrs, etc.)
XGBoost
'''