# Predicting Survival on the Titanic (Kaggle Competition)
[Link to Competition on Kaggle]('https://www.kaggle.com/competitions/titanic')

Timothy Miller II

## Basics
* Set working directory
* Import basic packages
* Upload Test and Train Sets

In [2]:
# set working directory
import os

os.chdir('/Users/timothymiller/Documents/GitHub/Titanic')

In [3]:
# import numpy and pandas
import numpy as np
import pandas as pd

# import data 
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
# train data view
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# test data view
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Preprocessing

For our ML models, we will need to change the string data into numerical dummy variables to allow for the computer to analyze.

In [6]:
# import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# set LabelEncoder object
le = LabelEncoder()

# specify the columns that we need to encode 
cols_to_encode = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# create duplicate 'tidy' objects for train / test data
train_tidy = train
test_tidy = test

# encode columns to numerical data via fit_transform function
train_tidy[cols_to_encode] = train_tidy[cols_to_encode].apply(lambda col: le.fit_transform(col))
test_tidy[cols_to_encode] = test_tidy[cols_to_encode].apply(lambda col: le.fit_transform(col))


In [7]:
# check for missing values in training data
train_tidy.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [8]:
# replacing missing age values with mean age values in train
train_mean_age = train_tidy['Age'].mean() 
train_tidy['Age'].fillna(train_mean_age, inplace=True)


In [9]:
# check for missing values in the testing data
test_tidy.isnull().sum()

PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Cabin           0
Embarked        0
dtype: int64

In [10]:
# repalcing missing age and fare data with averages for respective column

test_mean_age = test_tidy['Age'].mean()
test_mean_fare = test_tidy['Fare'].mean()
test_tidy['Age'].fillna(test_mean_age, inplace=True)
test_tidy['Fare'].fillna(test_mean_fare, inplace=True)



In [11]:
# check to ensure there are no null values remaining
check1 = test_tidy.isnull().sum().sum() == 0
check2 = train_tidy.isnull().sum().sum() == 0
print(check1,check2) # should return two true vals

True True


In [12]:
# separate dependent and independent vars
X_train = train_tidy.drop('Survived', axis=1)
y_train = train_tidy['Survived']
X_test = test_tidy
# y_test is unknown

In [13]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,108,1,22.000000,1,0,523,7.2500,147,2
1,2,1,190,0,38.000000,1,0,596,71.2833,81,0
2,3,3,353,0,26.000000,0,0,669,7.9250,147,2
3,4,1,272,0,35.000000,1,0,49,53.1000,55,2
4,5,3,15,1,35.000000,0,0,472,8.0500,147,2
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,548,1,27.000000,0,0,101,13.0000,147,2
887,888,1,303,0,19.000000,0,0,14,30.0000,30,2
888,889,3,413,0,29.699118,1,2,675,23.4500,147,2
889,890,1,81,1,26.000000,0,0,8,30.0000,60,0


In [14]:
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

## Logit (Logistic Regression with Summary Stats)

In [27]:
import statsmodels.api as sm

logit = sm.Logit(y_train, X_train)
result = logit.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.481687
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               Survived   No. Observations:                  891
Model:                          Logit   Df Residuals:                      880
Method:                           MLE   Df Model:                           10
Date:                Tue, 04 Apr 2023   Pseudo R-squ.:                  0.2767
Time:                        14:52:19   Log-Likelihood:                -429.18
converged:                       True   LL-Null:                       -593.33
Covariance Type:            nonrobust   LLR p-value:                 1.600e-64
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
PassengerId     0.0008      0.000      2.445      0.014       0.000       0.001
Pclass         -0.3846    

## Logistic Regression

In [41]:
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression model
clf = LogisticRegression()

# train the model on the training data
clf.fit(X_train,y_train)

# create predictions
pred_lr = clf.predict(X_test)

# create a dataframe of predictions
pred_lr_df = pd.DataFrame(pred_lr, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
pred_lr_df = pd.concat([ids, pred_lr_df], axis=1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
# ensure there are 418 predictions
pred_lr_df.shape

(418, 2)

In [43]:
# import predictions to csv to submit
pred_lr_df.to_csv('predictions.csv', index=False)

## MLPClassifier Neural Net

In [44]:
from sklearn.neural_network import MLPClassifier
# initialize MLPClassifer Model
mlp = MLPClassifier(hidden_layer_sizes=(10000,5000), max_iter=500)
mlp.fit(X_train, y_train)

# create object to store predictions of our MLPClassifier
pred = mlp.predict(X_test)

# create a dataframe of predictions
pred_df = pd.DataFrame(pred, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
pred_df = pd.concat([ids, pred_df], axis=1)



In [None]:
# ensure there are 418 values
pred_df.shape

(418, 2)

In [None]:
# import predictions to csv for submission
pred_df.to_csv('predictions.csv', index=False)



## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# create gradient boosting classifier model
gb = GradientBoostingClassifier(n_estimators=10000000, learning_rate=0.05, max_depth=5, random_state=2)

# fit model onto training data
gb.fit(X_train,y_train)

# create predictions
pred_gb = gb.predict(X_test)


# create a dataframe of predictions
pred_gb_df = pd.DataFrame(pred_gb, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
pred_gb_df = pd.concat([ids, pred_gb_df], axis=1)

In [None]:
# ensure there are 418 predictions
pred_gb.shape

(418,)

In [None]:
# import predictions to csv for submission
pred_gb_df.to_csv('predictions.csv', index=False)

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# create decision tree classifier model
dt = DecisionTreeClassifier(
    criterion='entropy', max_depth=None, min_samples_split=10, min_samples_leaf=2,
    max_features=None, random_state=2
    )

# fit decision tree model to training data
dt.fit(X_train, y_train)

# create predictions
pred_dt = dt.predict(X_test)

# create a dataframe of predictions
pred_dt_df = pd.DataFrame(pred_dt, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
pred_dt_df = pd.concat([ids, pred_dt_df], axis=1)


In [None]:
# ensure there are 418 predictions
pred_dt_df.shape

(418, 2)

In [None]:
# import predictions to csv for submission
pred_dt_df.to_csv('predictions.csv', index=False)

## XG boost model

In [3]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(use_label_encoder=False, objective="binary:logistic", eval_metric="mlogloss")

xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_test)

# create a dataframe of predictions
xgb_pred_df = pd.DataFrame(xgb_pred, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
xgb_pred_df = pd.concat([ids, xgb_pred_df], axis=1)



NameError: name 'X_train' is not defined

In [55]:
# ensure 418 preds
xgb_pred_df.shape

(418, 2)

In [56]:
# import predictions to csv for submission
xgb_pred_df.to_csv('predictions.csv', index=False)

## KNeighborsClassifier

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameters
n_neighbors = 5 # number of nearest neighbors to consider
weights = 'uniform' # can be 'uniform' or 'distance'
algorithm = 'auto' # will try to choose most appropriate algorithm for us
p=2 # 1 = Manhattan distance, 2 = Euclidean distance

# KNN model
knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm='auto', p=2)

# fit model to the training data
knn.fit(X_train, y_train)

# create predictions
knn_pred = knn.predict(X_test)

# create a dataframe of predictions
knn_pred_df = pd.DataFrame(knn_pred, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
knn_pred_df = pd.concat([ids, knn_pred_df], axis=1)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [18]:
# make sure 418 predictions in data
knn_pred_df.shape

(418, 2)

In [19]:
# import predictions to csv for submission
knn_pred_df.to_csv('predictions.csv', index=False)

## Support Vector Machines

In [20]:
from sklearn.svm import SVC

# Hyperparameters
kernel = 'rbf'
C = 1.0
gamma = 'auto'

# svm model
svm = SVC(kernel=kernel, C=C, gamma=gamma)

# fit model to training data
svm.fit(X_train, y_train)

# create predictions
svm_pred = svm.predict(X_test)

# create a dataframe of predictions
svm_pred_df = pd.DataFrame(svm_pred, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
svm_pred_df = pd.concat([ids, svm_pred_df], axis=1)


In [21]:
# make sure 418 predictions in data
svm_pred_df.shape

(418, 2)

In [22]:
# import predictions to csv for submission
knn_pred_df.to_csv('predictions.csv', index=False)

## Random Forest Classifier

In [16]:
# first we will find the weights for each class

from collections import Counter

# count number of samples per class
class_counts = Counter(y_train)

# Calculate the total number of samples
n_samples = sum(class_counts.values())

# Compute the class weights
class_weight = {cls: n_samples / (len(class_counts) * count) for cls, count in class_counts.items()}

print("Class Weights:", class_weight)

Class Weights: {0: 0.8114754098360656, 1: 1.3026315789473684}


In [17]:
from sklearn.ensemble import RandomForestClassifier


# Hyperparameters
n_estimators = 100000
max_depth = 2
random_state = 2
class_weight = class_weight
# create model
rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state, class_weight=class_weight)

# fit model to training data
rf.fit(X_train, y_train)

# create predictions
rf_pred = rf.predict(X_test)

# create a dataframe of predictions
rf_pred_df = pd.DataFrame(rf_pred, columns=['Survived'])

# include passenger id's
ids = test_tidy[['PassengerId']]
rf_pred_df = pd.concat([ids, rf_pred_df], axis=1)

In [18]:
# make sure 418 predictions in data
rf_pred_df.shape

(418, 2)

In [19]:
# import predictions to csv for submission
rf_pred_df.to_csv('predictions.csv', index=False)

# Current Score

In [None]:
# insert highest guessing percent from kaggle
highest_perc = .77033
# calculate correct number of passengers guessed
correct = round(highest_perc*418)
print(f"Correctly guessed {correct} out 418 passengers")

Correctly guessed 322 out 418 passengers
