# Titanic project code

## Import modules

In [16]:
# import packages 
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
%matplotlib inline


# import pre-processing modules s
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel 
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


# import classifiers
# scale robust
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# scale sensitive
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


# import evaluation modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report
from sklearn.metrics import  RocCurveDisplay, roc_auc_score, auc, roc_curve, PrecisionRecallDisplay, precision_recall_curve


## Define Functions 

In [17]:
# parse title and fam name from df 
def parse_name(df):
    last_name = df['name'].apply(lambda x : x.split(', ')[0])
    temp = df['name'].apply(lambda x : x.split(', ')[1])
    title = temp.apply(lambda x : x.split('.')[0])
    return title, last_name

def cat_to_numeric(df):
    cat_columns = df.select_dtypes(['object']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])
    return df 

def pvalue_filter(target_df, features_df, alpha): # returns a list of columns that are possible drop, p_val > alpha, corr, pval
    features_columns_names = list(features_df)
    target_column_name = list(target_df)
    features_np = features_df.to_numpy()
    target_np = target_df.to_numpy()
    drop_index = []
    p_val_list =[]
    corr_list = []
    
    for i in range(len(features_columns_names)):
        corr, p_val = sp.stats.pearsonr(features_np[:,i], target_np)
        corr_list.append(round(corr,3))
        if p_val > alpha:           # accept the null hypothesis, no statisitcal significance  
            drop_index.append(i)
            p_val_list.append(p_val)
            
    drop_col = [features_columns_names[i] for i in drop_index]
    return drop_col, corr_list, p_val_list

In [18]:
test_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/ignore/test.csv")
train_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/ignore/train.csv")

df_original = pd.concat([train_set,test_set], axis=0, ignore_index=True )
df_train = (train_set.copy()).rename(columns=str.lower)
df_test = (test_set.copy()).rename(columns=str.lower)

df = (df_original.copy()).rename(columns=str.lower)


For those who would like to see the [EDA notebook](https://github.com/yona-av/titanic_kaggle/blob/main/EDA_notebook.ipynb) now

# Deal with age Nans
 this code section deals with Nan values in the age column in few ways:
- Those with the title Master are younger then 15 years, so they get the mean age of those under 15 years.
- Lone passengers. they get the age mean by their gender    
- One companion. I tried to to extract information through checking the companion's information 
- Hand fill values special cases 

In [19]:
age_nan = df[df['age'].isna()]  
print('df age nans before methods',df['age'].isna().sum())
title, last_name = parse_name(df)
df['title'] = title
df['last_name'] = last_name

###### Deal with Masters  ############################################

child_mean =  round(df[df["age"] < 15 ]['age'].mean(numeric_only=True),2)
df.loc[df['title'] == 'Master','age'] = df.loc[df['title'] == 'Master','age'].fillna(child_mean)
age_nan = df[df['age'].isna()] # update age_nan

####### Deal with lone passengers ############################################
male_mean =  round(df[df["sex"] == 'male']['age'].mean(numeric_only=True),2)
female_mean =  round(df[df["sex"] == 'female']['age'].mean(numeric_only=True),2)

df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'male'), 'age'
      ]= df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'male'), 'age'].fillna(male_mean)

df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'female'), 'age'
      ]= df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'female'), 'age'].fillna(female_mean)

age_nan = df[df['age'].isna()] # update age_nan

########## ONE companion (sibling or spouse) ############################################################

one_comp_mean = round(df[(df['sibsp'] == 1) & (df['parch'] == 0)]['age'].mean(numeric_only=True),2)
one_comp_df = df[(df['sibsp'] == 1) & (df['parch'] == 0)]
one_comp_df_nan = one_comp_df[one_comp_df['age'].isna()]

for i in range(len(one_comp_df_nan)):
    for j in range(len(one_comp_df)):
        #compare last name and make sure it isn't the same passenger
        if (one_comp_df_nan['last_name'].iloc[i] == one_comp_df['last_name'].iloc[j]  
            and one_comp_df_nan['passengerid'].iloc[i] != one_comp_df['passengerid'].iloc[j]):
            
            ix_nan = one_comp_df_nan['passengerid'].iloc[i] - 1 
            ix = one_comp_df['passengerid'].iloc[j] -1 
            df.at[ix_nan,'age'] = df.at[ix,'age']
            
age_nan = df[df['age'].isna()] # update the age_nan

########## Special cases ############################################################
# sage family
#parents
df.at[1233, 'age'] = male_mean
df.at[1256, 'age'] = female_mean
#kids
df[df['last_name'] == 'Sage']['age'] = df[df['last_name'] == 'Sage']['age'].fillna(child_mean)

# lebfre family
df.at[1023, 'age'] = female_mean
#kids
df[df['last_name'] == 'lebfre']['age'] = df[df['last_name'] == 'lebfre']['age'].fillna(child_mean)

#jhonston family
#parents
df.at[783, 'age'] = male_mean
df.at[924, 'age'] = female_mean

age_nan = df[df['age'].isna()] # update age_nan


############ deal with the rest ########################################################################################
df.loc[df['sex'] == 'male', 'age'] = df.loc[df['sex'] == 'male', 'age'].fillna(male_mean)
df.loc[df['sex'] == 'female', 'age'] = df.loc[df['sex'] == 'female', 'age'].fillna(female_mean)
age_nan = df[df['age'].isna()] # update age_nan
print('df age nans after methods',df['age'].isna().sum())



df age nans before methods 263
df age nans after methods 0


# Deal with cabin Nans
this code section deals with Nan values in the cabin column in two ways:
- By comparing the ticket numbers. If there is a similar ticket number and it has a cabin number, it seems like they share the same cabin.
- Some passengers have the same fare but a different Pclass. The second method assumes that those in the same class who paid a same fare would be in close cabins. Thus, combining the ‘Pclass’ to the ‘fare’ column creates a new sub group. e.g: pclass :2 and fare: 7.25 would create a new cabin 27.25 or 37.25 if the passenger has Pclass 3.


In [20]:
cabin = df[~df['cabin'].isna()]
cabin_nan =  df[df['cabin'].isna()]
print('df cabin nans before methods',df['cabin'].isna().sum())

################## deal with cabin nan by comparing the ticket number #########################################################
for t in range(len(cabin['ticket'])):
    for n in range(len(cabin_nan['ticket'])):
        if cabin['ticket'].iloc[t] == cabin_nan['ticket'].iloc[n] :
           
            ix = cabin['passengerid'].iloc[t] - 1    
            ix_nan = cabin_nan['passengerid'].iloc[n] - 1 
            if ix != ix_nan:
                # update the value of cabin_nan in the df
                df.at[ix_nan,'cabin'] = df.at[ix,'cabin']


cabin_nan =  df[df['cabin'].isna()] # update cabin_nan   

################## deal with rest of Nans #########################################################

# combine for each passenger 'pclass' and 'fare' and assume 
#that if they paid the same and were at the same class they were at the sme cabin

cabin_nan['cabin'] = cabin_nan['pclass'].astype('string') + cabin_nan['fare'].astype('string')
for i in range(len(cabin_nan)):
    ix = cabin_nan['passengerid'].iloc[i] - 1
    df.at[ix,'cabin'] = cabin_nan['cabin'].iloc[i]

cabin_nan =  df[df['cabin'].isna()] # update cabin_nan

########### special case #############################################################################
df.at[1043,'cabin'] = 38.05 # explain 3 is pclass and added to fare value 8.05 
cabin_nan =  df[df['cabin'].isna()] # update cabin_nan
print('df df cabin nans after methods',len(cabin_nan))    

df cabin nans before methods 1014
df df cabin nans after methods 0


# other Nans

There are some special cases in other columns :
- passenger 1044 gets the mean fare of 3rd class 
- passengers 62 and 830 get the most common embarking place 'S'


In [21]:
# fill fare nan as the average of 3 class . 

t_class_mean = round(df[df["pclass"] == 3 ]['fare'].mean(numeric_only=True),2)
df.at[1043, 'fare'] = t_class_mean

# fill nan embarked to be as most common embarking place 
df.at[61, 'embarked'] = 'S'
df.at[829, 'embarked'] = 'S'

survived = df['survived']
passenger_id = df['passengerid'].values



## Decrease the number of categories in the column 'title'

In [22]:

df['title'] = df['title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr',
                                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

df['title'] = df['title'].replace(['Mlle','Ms' ], 'Miss')
df['title'] = df['title'].replace('Mme', 'Mrs')



## Label the data numerically 

In [23]:
df = cat_to_numeric(df)

# check correlation and p-value
This section checks linear correlation between the features to the target. In some cases there might be some nonlinear correlation , but it is still good to check whether there is strong linear correlation between features.
A p-value test is performed to eliminate features that doesn't contribute and have no statistical significance  


In [24]:
corr_mat = df.corr()
corr_mat['survived'].sort_values(ascending=False)

survived       1.000000
sex            0.543351
title          0.405788
fare           0.257307
cabin          0.218673
embarked       0.106811
parch          0.081629
last_name      0.017314
passengerid   -0.005007
name          -0.005007
sibsp         -0.035322
ticket        -0.047298
age           -0.065752
pclass        -0.338481
Name: survived, dtype: float64

In [25]:
df_train = df.iloc[:890,:]

drop_col, corr_list, p_val_list = pvalue_filter(df_train['survived'], df_train.drop(columns=['survived']), 0.05)
print('drop those columns, p-value < 0.05 no statistical importance',drop_col)

drop those columns, p-value < 0.05 no statistical importance ['passengerid', 'name', 'age', 'sibsp', 'ticket', 'last_name']


In [26]:
df = df.drop(columns=['survived','passengerid', 'name', 'sibsp', 'ticket', 'last_name','age']) 

# Create a model 

## scaling

In [27]:

numerical_columns_selector = selector(dtype_exclude=int)
categorical_columns_selector = selector(dtype_include=int)

numerical_columns = numerical_columns_selector(df)
categorical_columns = categorical_columns_selector(df)

In [28]:
print('continious features that are used :', numerical_columns)
print('categorical features that are used :', categorical_columns)

continious features that are used : ['fare']
categorical features that are used : ['pclass', 'sex', 'parch', 'cabin', 'embarked', 'title']


## Data encoding
- One hot encoding for categorical features 
- Standard scaler for continuous features


In [29]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])



## Divide the data frame back to the original train and test sets   

In [30]:
# divide back into train test sets 
df_train = df.iloc[:890,:]
df_test = df.iloc[891:,:]

x_train = df_train
y_train = survived.iloc[:890]


x_test = df_test
y_test = survived.iloc[891:]

# Train few models for comparison 

## SVC 

In [31]:
model = make_pipeline(preprocessor, SVC())
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

svc_acc = np.round(cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy').mean(),6)
print('SVC_accuracy :',svc_acc)

SVC_accuracy : 0.820225


# Random Forest

In [32]:
model = make_pipeline(preprocessor, RandomForestClassifier(random_state=9) )
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

rf_acc = np.round(cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy').mean(),3)
print('RF_accuracy :', rf_acc)

RF_accuracy : 0.827


# Logistic Regression 

In [33]:
model = make_pipeline(preprocessor, LogisticRegression(random_state=9))
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

lg_acc = np.round(cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy').mean(),3)
print('Logistic_Regression_accuracy :', lg_acc)

Logistic_Regression_accuracy : 0.83


# SGD 

In [34]:
model = make_pipeline(preprocessor, SGDClassifier(max_iter=1000, tol=1e-3) )
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

sgd_acc =np.round(cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy').mean(),3)
print('SGD_accuracy :', sgd_acc)

SGD_accuracy : 0.749


# XGBoost

In [35]:
model = make_pipeline(preprocessor, XGBClassifier(n_estimators=100, objective='binary:logistic') )
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

xgb_acc = np.round(cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy').mean(),3)
print('XGB_accuracy :', xgb_acc)

XGB_accuracy : 0.82


# KNN 

In [36]:
model = make_pipeline(preprocessor, KNeighborsClassifier() )
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

knn_acc = np.round(cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy').mean(),3)
print('KNN_accuracy :', knn_acc)


KNN_accuracy : 0.804


In [37]:
models = pd.DataFrame({
    'Train score':[svc_acc, rf_acc, lg_acc, 
              sgd_acc, xgb_acc, knn_acc],
    'Kaggle score':[0.801, 0.77, 0.787, 0.791, 0.751, 0.765 ],
    
    'Model':['SVC', 'Random Forest', 'Logistic Regression', 
              'SGD', 'XGBoost', 'KNN']})

pd.pivot_table(models, index='Model', values=['Train score','Kaggle score']).sort_values(by='Kaggle score', ascending=False)


Unnamed: 0_level_0,Kaggle score,Train score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
SVC,0.801,0.820225
SGD,0.791,0.749
Logistic Regression,0.787,0.83
Random Forest,0.77,0.827
KNN,0.765,0.804
XGBoost,0.751,0.82


As it seen from the comparison of different results of the models. SVC performance was the best - gave the highest Kaggle accuracy score. So now we retrain the model with SVC and check if hyperparameters tuning, using Grid search will improve the accuracy. 

In [38]:
y_pred = cross_val_predict(model, x_train, y_train, cv=3)
cnf_mx = confusion_matrix(y_train, y_pred)
print(cnf_mx)
print(classification_report(y_train, y_pred))  

[[481  67]
 [111 231]]
              precision    recall  f1-score   support

         0.0       0.81      0.88      0.84       548
         1.0       0.78      0.68      0.72       342

    accuracy                           0.80       890
   macro avg       0.79      0.78      0.78       890
weighted avg       0.80      0.80      0.80       890



In [None]:
param_grid = [{'svc__C': [0.1,0.5, 1, 10,50,100,200,500],
               'svc__kernel':['rbf','linear','poly'],
               'svc__gamma': ['auto', 'scale'],
               'svc__decision_function_shape': ['ovo', 'ovr']}]


gd = GridSearchCV(model, param_grid=param_grid, verbose=True)
gd.fit(x_train, y_train)

print('best score after Gridsearch',gd.best_score_) 

print(gd.best_estimator_)


## retrain the model with the new parameters
After the hyperparameters tuning the accuracy on kaggle is 0.79425  

In [39]:
model = make_pipeline(preprocessor, SVC(C=500, decision_function_shape='ovo', gamma='auto'))
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

svc_acc = np.round(cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy').mean(),6)
print('SVC_accuracy :',svc_acc)

SVC_accuracy : 0.848315


In [40]:
y_pred = cross_val_predict(model, x_train, y_train, cv=3)
cnf_mx = confusion_matrix(y_train, y_pred)
print(cnf_mx)
print(classification_report(y_train, y_pred)) 

[[488  60]
 [ 85 257]]
              precision    recall  f1-score   support

         0.0       0.85      0.89      0.87       548
         1.0       0.81      0.75      0.78       342

    accuracy                           0.84       890
   macro avg       0.83      0.82      0.83       890
weighted avg       0.84      0.84      0.84       890



Even though the accuracy on the cross-validation folds was ~ 0.849 and the other metrics performed better, the accuracy on kaggle gave a slightly worse accuracy - 0.79425.
So, I retrained the model with SVC with the default parameters.


# Prediction

In [41]:
y_test_pred = model.predict(x_test)
y_test_pred = y_test_pred.astype(int)


submission_df = pd.DataFrame({'Passengerid': passenger_id[891:], 'Survived': y_test_pred})
submission_df.to_csv('titanic_prediction.csv', index = False)
