In [114]:
# import packages 
import numpy as np 
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt
import os 
import sys
import seaborn as sns
import scipy as sp
import scipy.stats as stats
from scipy.stats import norm
import xgboost
%matplotlib inline


# import pre-processing modules 
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel 
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


# import classifiers
# scale robust
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# scale sensitive
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# import evaluation modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, classification_report
from sklearn.metrics import  RocCurveDisplay, roc_auc_score, auc, roc_curve, PrecisionRecallDisplay, precision_recall_curve

In [115]:
test_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/ignore/test.csv")
train_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/ignore/train.csv")

df_original = pd.concat([train_set,test_set], axis=0, ignore_index=True )
df_train = (train_set.copy()).rename(columns=str.lower)
df_test = (test_set.copy()).rename(columns=str.lower)

df = (df_original.copy()).rename(columns=str.lower)


In [116]:
# parse title and fam name from df 
def parse_name(df):
    last_name = df['name'].apply(lambda x : x.split(', ')[0])
    temp = df['name'].apply(lambda x : x.split(', ')[1])
    title = temp.apply(lambda x : x.split('.')[0])
    return title, last_name

def cat_to_numeric(df):
    cat_columns = df.select_dtypes(['object']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])
    return df 

def pvalue_filter(target_df, features_df, alpha): # returns a list of columns that are possible drop, p_val > alpha, corr, pval
    features_columns_names = list(features_df)
    target_column_name = list(target_df)
    features_np = features_df.to_numpy()
    target_np = target_df.to_numpy()
    drop_index = []
    p_val_list =[]
    corr_list = []
    
    for i in range(len(features_columns_names)):
        corr, p_val = sp.stats.pearsonr(features_np[:,i], target_np)
        corr_list.append(round(corr,3))
        if p_val > alpha:           # accept the null hypothesis, no statisitcal significance  
            drop_index.append(i)
            p_val_list.append(p_val)
            
    drop_col = [features_columns_names[i] for i in drop_index]
    return drop_col, corr_list, p_val_list

# deal with age Nans

In [117]:
age_nan = df[df['age'].isna()]  
print('df  age nans',df['age'].isna().sum())
title, last_name = parse_name(df)
df['title'] = title
df['last_name'] = last_name

###### Deal with Masters  ############################################

child_mean =  round(df[df["age"] < 15 ]['age'].mean(numeric_only=True),2)
df.loc[df['title'] == 'Master','age'] = df.loc[df['title'] == 'Master','age'].fillna(child_mean)
age_nan = df[df['age'].isna()] # update age_nan
print('df  age nans',df['age'].isna().sum())

####### Deal with lone passengers ############################################
male_mean =  round(df[df["sex"] == 'male']['age'].mean(numeric_only=True),2)
female_mean =  round(df[df["sex"] == 'female']['age'].mean(numeric_only=True),2)

df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'male'), 'age'
      ]= df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'male'), 'age'].fillna(male_mean)

df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'female'), 'age'
      ]= df.loc[(df['sibsp'] == 0) & (df['parch'] == 0) & (df['sex'] == 'female'), 'age'].fillna(female_mean)

age_nan = df[df['age'].isna()] # update age_nan
print('df  age nans',df['age'].isna().sum())

########## ONE companion (sibling or spouse) ############################################################

one_comp_mean = round(df[(df['sibsp'] == 1) & (df['parch'] == 0)]['age'].mean(numeric_only=True),2)
one_comp_df = df[(df['sibsp'] == 1) & (df['parch'] == 0)]
one_comp_df_nan = one_comp_df[one_comp_df['age'].isna()]

for i in range(len(one_comp_df_nan)):
    for j in range(len(one_comp_df)):
        #compare last name and make sure it isn't the same passenger
        if (one_comp_df_nan['last_name'].iloc[i] == one_comp_df['last_name'].iloc[j]  
            and one_comp_df_nan['passengerid'].iloc[i] != one_comp_df['passengerid'].iloc[j]):
            
            ix_nan = one_comp_df_nan['passengerid'].iloc[i] - 1 
            ix = one_comp_df['passengerid'].iloc[j] -1 
            df.at[ix_nan,'age'] = df.at[ix,'age']
            
age_nan = df[df['age'].isna()] # update the age_nan
print('df  age nans',df['age'].isna().sum())

########## Special cases ############################################################
# sage family
#parents
df.at[1233, 'age'] = male_mean
df.at[1256, 'age'] = female_mean
#kids
df[df['last_name'] == 'Sage']['age'] = df[df['last_name'] == 'Sage']['age'].fillna(child_mean)

# lebfre family
df.at[1023, 'age'] = female_mean
#kids
df[df['last_name'] == 'lebfre']['age'] = df[df['last_name'] == 'lebfre']['age'].fillna(child_mean)

#jhonston family
#parents
df.at[783, 'age'] = male_mean
df.at[924, 'age'] = female_mean

age_nan = df[df['age'].isna()] # update age_nan
print('df  age nans',df['age'].isna().sum())

############ deal with the rest ########################################################################################
df.loc[df['sex'] == 'male', 'age'] = df.loc[df['sex'] == 'male', 'age'].fillna(male_mean)
df.loc[df['sex'] == 'female', 'age'] = df.loc[df['sex'] == 'female', 'age'].fillna(female_mean)
age_nan = df[df['age'].isna()] # update age_nan
print('df  age nans',df['age'].isna().sum())



df  age nans 263
df  age nans 255
df  age nans 56
df  age nans 49
df  age nans 44
df  age nans 0


# deal with cabin Nans

In [118]:
cabin = df[~df['cabin'].isna()]
cabin_nan =  df[df['cabin'].isna()]
print('df  cabin nans',df['cabin'].isna().sum())

################## deal with cabin nan by comparing the ticket number #########################################################
for t in range(len(cabin['ticket'])):
    for n in range(len(cabin_nan['ticket'])):
        if cabin['ticket'].iloc[t] == cabin_nan['ticket'].iloc[n] :
           
            ix = cabin['passengerid'].iloc[t] - 1    
            ix_nan = cabin_nan['passengerid'].iloc[n] - 1 
            if ix != ix_nan:
                # update the value of cabin_nan in the df
                df.at[ix_nan,'cabin'] = df.at[ix,'cabin']


cabin_nan =  df[df['cabin'].isna()] # update cabin_nan
print('df cabin nans updated',len(cabin_nan))    

################## deal with rest of Nans #########################################################

# combine for each passenger 'pclass' and 'fare' and assume 
#that if they paid the same and were at the same class they were at the sme cabin

cabin_nan['cabin'] = cabin_nan['pclass'].astype('string') + cabin_nan['fare'].astype('string')
for i in range(len(cabin_nan)):
    ix = cabin_nan['passengerid'].iloc[i] - 1
    df.at[ix,'cabin'] = cabin_nan['cabin'].iloc[i]

cabin_nan =  df[df['cabin'].isna()] # update cabin_nan
print('df cabin nans updated',len(cabin_nan))   

########### special case #############################################################################
df.at[1043,'cabin'] = 38.05 # explain 3 is pclass and added to fare value 8.05 
cabin_nan =  df[df['cabin'].isna()] # update cabin_nan
print('df cabin nans updated',len(cabin_nan))    

df  cabin nans 1014
df cabin nans updated 998
df cabin nans updated 1
df cabin nans updated 0


# other Nans

In [119]:
# fill fare nan as the average of 3 class . 

t_class_mean = round(df[df["pclass"] == 3 ]['fare'].mean(numeric_only=True),2)
df.at[1043, 'fare'] = t_class_mean

# fill nan embarked to be as most common embarking place 
df.at[61, 'embarked'] = 'S'
df.at[829, 'embarked'] = 'S'

survived = df['survived']
passenger_id = df['passengerid'].values

df = cat_to_numeric(df)

# check correlation and p-value

In [120]:
corr_mat = df.corr()
corr_mat['survived'].sort_values(ascending=False)

survived       1.000000
sex            0.543351
title          0.290260
fare           0.257307
cabin          0.218673
embarked       0.106811
parch          0.081629
last_name      0.017314
passengerid   -0.005007
name          -0.005007
sibsp         -0.035322
ticket        -0.047298
age           -0.065752
pclass        -0.338481
Name: survived, dtype: float64

In [121]:
df_train = df.iloc[:890,:]

drop_col, corr_list, p_val_list = pvalue_filter(df_train['survived'], df_train.drop(columns=['survived']), 0.05)
print('drop those columns p-value < 0.005',drop_col)

drop those columns p-value < 0.005 ['passengerid', 'name', 'age', 'sibsp', 'ticket', 'last_name']


In [122]:
df = df.drop(columns=['survived','passengerid', 'name', 'sibsp', 'ticket', 'last_name']) 
# 'name', 'sibsp', 'ticket' dropped those columns logic in unstandradized models file p value filter  

# scaling

In [123]:

numerical_columns_selector = selector(dtype_exclude=int)
categorical_columns_selector = selector(dtype_include=int)

numerical_columns = numerical_columns_selector(df)
categorical_columns = categorical_columns_selector(df)

In [124]:
print(numerical_columns)
print(categorical_columns)

['age', 'fare']
['pclass', 'sex', 'parch', 'cabin', 'embarked', 'title']


In [125]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

model = make_pipeline(preprocessor, SVC(kernel='rbf') )

In [126]:
# divide back into train test sets 
df_train = df.iloc[:890,:]
df_test = df.iloc[891:,:]

x_train = df_train
y_train = survived.iloc[:890]


x_test = df_test
y_test = survived.iloc[891:]

In [127]:
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

In [128]:
cv_5_accuracy = cross_val_score(model, x_train , y_train, cv=5, scoring='accuracy')
cv_5_precision = cross_val_score(model, x_train , y_train, cv=5, scoring='precision')
cv_5_recall = cross_val_score(model, x_train , y_train, cv=5, scoring='recall')
cv_5_f1 = cross_val_score(model, x_train , y_train, cv=5, scoring='f1')

print('cv_avg_accuracy :', np.round(cv_5_accuracy.mean(),3))
print('cv_avg_precision', np.round(cv_5_precision.mean(),3))
print('cv_avg_recall', np.round(cv_5_recall.mean(),3))
print('cv_avg_fscore :', np.round(cv_5_f1.mean(),3))

cv_avg_accuracy : 0.818
cv_avg_precision 0.827
cv_avg_recall 0.667
cv_avg_fscore : 0.737


In [129]:
y_pred = cross_val_predict(model, x_train, y_train, cv=3)
cnf_mx = confusion_matrix(y_train, y_pred)
print(cnf_mx)
print(classification_report(y_train, y_pred))  

[[494  54]
 [114 228]]
              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85       548
         1.0       0.81      0.67      0.73       342

    accuracy                           0.81       890
   macro avg       0.81      0.78      0.79       890
weighted avg       0.81      0.81      0.81       890



In [130]:
y_test_pred = model.predict(x_test)
y_test_pred = y_test_pred.astype(int)


submission_df = pd.DataFrame({'Passengerid': passenger_id[891:], 'Survived': y_test_pred})
submission_df.to_csv('titanic_pred_40.csv', index = False)
