In [100]:
# import packages 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os 
import sys
import seaborn as sns
import scipy as sp
import scipy.stats as stats
from scipy.stats import norm
import xgboost
%matplotlib inline


# import pre-processing modules 
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel 
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


# import classifiers
# scale robust
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# scale sensitive
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# import evaluation modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [101]:
test_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/ignore/test.csv")
train_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/ignore/train.csv")
df_original = pd.concat([train_set,test_set], axis=0, ignore_index=True )

df_train = (train_set.copy()).rename(columns=str.lower)
df_test = (test_set.copy()).rename(columns=str.lower)

df = (df_original.copy()).rename(columns=str.lower)


In [102]:
# parse title and fam name from df 
def parse_name(df):
    last_name = df['name'].apply(lambda x : x.split(', ')[0])
    temp = df['name'].apply(lambda x : x.split(', ')[1])
    title = temp.apply(lambda x : x.split('.')[0])
    return title, last_name

def cat_to_numeric(df):
    cat_columns = df.select_dtypes(['object']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])
    return df 

def pvalue_filter(target_df, features_df, alpha): # returns a list of columns that are possible drop, p_val > alpha, corr, pval
    features_columns_names = list(features_df)
    target_column_name = list(target_df)
    features_np = features_df.to_numpy()
    target_np = target_df.to_numpy()
    drop_index = []
    p_val_list =[]
    corr_list = []
    
    for i in range(len(features_columns_names)):
        corr, p_val = sp.stats.pearsonr(features_np[:,i], target_np)
        corr_list.append(round(corr,3))
        if p_val > alpha:           # accept the null hypothesis, no statisitcal significance  
            drop_index.append(i)
            p_val_list.append(p_val)
            
    drop_col = [features_columns_names[i] for i in drop_index]
    return drop_col, corr_list, p_val_list

In [103]:
# work on df nan and then concat with original df_train and sort by ID
age_nan = df[df['age'].isna()]
#age_nan.info()
#age_nan = age_nan.sort_values(by=['sex'],ascending=True)
#age_nan = age_nan.sort_values(by=['sibsp'],ascending=False).reset_index(drop=True)
print('df  age nans',df['age'].isna().sum())



df  age nans 263


## deal with age nans

In [104]:
###### Deal with masters  ############################################
title, last_name = parse_name(age_nan)
title = title[title == 'Master']
master_index = title.index.to_numpy()

df_young = df[df["age"] < 15 ]
child_mean = round(df_young['age'].mean(),2)
       
for i in master_index:
    df.at[i,'age'] = child_mean 

age_nan = df[df['age'].isna()] # update the age_nan

####### DEAL  with lone passengers ############################################

l_p_t = age_nan[age_nan['sibsp'] == 0]
lone_pass = l_p_t[l_p_t['parch'] == 0] # passengers who travel alone and cant determine their age by relatives
#lone_pass

# calc mean by gender
df_male = df[df['sex'] == 'male']
df_female = df[df['sex'] == 'female']
male_mean = round(df_male['age'].mean(),2)
female_mean = round(df_female['age'].mean(),2)

lone_pass_male = lone_pass[lone_pass['sex'] == 'male']
lone_pass_female = lone_pass[lone_pass['sex'] == 'female']

l_p_m_index = (lone_pass_male.index).to_numpy()
l_p_f_index = (lone_pass_female.index).to_numpy()

for m in l_p_m_index:
    df.at[m,'age'] = male_mean

for f in l_p_f_index:
    df.at[f,'age'] = female_mean
    
age_nan = df[df['age'].isna()] # update the age_nan



########## ONE companion ############################################################
# one companion in nan df # assign the age of the companion tot the nan 
one_comp_nan = age_nan[age_nan['parch'] == 0] 
one_comp_nan = one_comp_nan[one_comp_nan['sibsp'] == 1]
title_ocn, last_name_ocn = parse_name(one_comp_nan)
ocn_index = (title_ocn.index).to_numpy()


one_comp = df[df['parch'] == 0] # one companion in general df may contain istself. but some companions are both nan in age
one_comp = one_comp[one_comp['sibsp'] == 1]
#one_comp = one_comp[~one_comp['age'].isna()]
title_oc, last_name_oc = parse_name(one_comp)
oc_index = (title_oc.index).to_numpy()

for l in range(len(one_comp_nan)):
    for i in range(len(one_comp)):
        
        if (list(last_name_ocn)[l] == list(last_name_oc)[i] and ocn_index[l] != oc_index[i]): #ensures same last name and not same person
        
            if df.at[oc_index[i],'age'] != float('nan'):
                df.at[ocn_index[l],'age'] =  df.at[oc_index[i],'age'] 
            if list(title_ocn)[l] == 'Mrs' : # she and her husbnad are adults and get the adult average
                df.at[ocn_index[l],'age'] = female_mean
                df.at[oc_index[i],'age'] = male_mean
            if list(title_ocn)[l] == 'Miss': # she is a miss and I assume that travels with a sibling so get young average
                df.at[ocn_index[l],'age'] = child_mean
                df.at[oc_index[i],'age'] = child_mean
            if list(title_ocn)[l] == 'Mr' and list(title_oc)[i] != 'Mrs':
                df.at[ocn_index[l],'age'] = child_mean
                df.at[oc_index[i],'age'] = child_mean


age_nan = df[df['age'].isna()] # update the age_nan

# notice there are only two famillies with 3 siblings travelling together I assume they are all young <15.
three_siblings = age_nan[age_nan['parch'] == 0] 
three_siblings = three_siblings[three_siblings['sibsp'] == 2]
index_temp = three_siblings.index.to_numpy()
for i in index_temp:
    df.at[i,'age'] = child_mean

age_nan = df[df['age'].isna()] # update the age_nan

########### Hand fill age nans #######################################################


# hand fill, thought it will be quicker than thinking about an algo  
#parents
df.at[1233, 'age'] = male_mean
df.at[1256, 'age'] = female_mean
#kids
sages = df[df['ticket'] == 'CA. 2343']
sages = sages[sages['age'].isna()]

index_temp = sages.index.to_numpy()
for i in index_temp:
    df.at[i,'age'] = child_mean
age_nan = df[df['age'].isna()] # update the age_nan

df.at[1023, 'age'] = female_mean
lebfre = df[df['ticket'] == '4133']
lebfre = lebfre[lebfre['age'].isna()]
index_temp = lebfre.index.to_numpy()
for i in index_temp:
    df.at[i,'age'] = child_mean
age_nan = df[df['age'].isna()] # update the age_nan

#jhonston
#parents
df.at[783, 'age'] = male_mean
df.at[924, 'age'] = female_mean
df.at[888,'age'] = child_mean
age_nan = df[df['age'].isna()] # update the age_nan

df.at[1024, 'age'] = male_mean # has no sibsp as I looked
df.at[128, 'age'] = child_mean
df.at[593, 'age'] = child_mean
df.at[166, 'age'] = female_mean
df.at[533, 'age'] = female_mean
df.at[593, 'age'] = female_mean
df.at[1116, 'age'] = female_mean
df.at[140, 'age'] = female_mean
age_nan = df[df['age'].isna()] # update the age_nan


## other nans

In [105]:

# fill fare nan as the average of 3 class . 

t_class = df[df['pclass'] == 3]
t_class_mean = t_class['fare'].mean()
df.at[1043, 'fare'] = t_class_mean

# fill nan embarked to be an class of itself
df.at[61, 'embarked'] = 'M'
df.at[829, 'embarked'] = 'M'



In [106]:
title, last_name = parse_name(df) # parse names and fill name as last name to categorize it 
df['name'] = last_name 
df['cabin'] = df['cabin'].fillna(5) # all the nans in cabin are a caegory 5 

df = cat_to_numeric(df)

survived = df['survived']
passenger_id = df['passengerid'].values

df = df.drop(columns=['survived','passengerid','name', 'sibsp', 'ticket'])

In [107]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   sex       1309 non-null   int64  
 2   age       1309 non-null   float64
 3   parch     1309 non-null   int64  
 4   fare      1309 non-null   float64
 5   cabin     1309 non-null   int64  
 6   embarked  1309 non-null   int64  
dtypes: float64(2), int64(5)
memory usage: 71.7 KB


In [108]:
#df = df.drop(columns=['cabin', 'name', 'ticket']) # for now drop cabin

# scaling

In [109]:
numerical_columns_selector = selector(dtype_exclude=int)
categorical_columns_selector = selector(dtype_include=int)

numerical_columns = numerical_columns_selector(df)
categorical_columns = categorical_columns_selector(df)

In [110]:
print(numerical_columns)
print(categorical_columns)

['age', 'fare']
['pclass', 'sex', 'parch', 'cabin', 'embarked']


In [111]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [112]:
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [113]:
model = make_pipeline(preprocessor, SVC(kernel='rbf') )

In [114]:
# divide back into train test sets 
df_train = df.iloc[:890,:]
df_test = df.iloc[891:,:]

x_train = df_train
y_train = survived.iloc[:890]


x_test = df_test
y_test = survived.iloc[891:]

In [115]:
model = model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

# models

In [116]:
# # rf
# clf = LogisticRegression(random_state=9, max_iter=500, solver = 'newton-cg') 
# clf = SGDClassifier(max_iter=1000, tol=1e-3)
# clf = SVC(kernel='poly')
# clf = KNeighborsClassifier()
# clf = GradientBoostingClassifier(n_estimators=40, learning_rate=0.2,max_depth=2, random_state=0)
# clf = LinearSVC(random_state=0, tol=1e-5)

In [117]:
precision, recall, f_1, support = precision_recall_fscore_support(y_train, y_train_pred, average='binary')
accuracy = accuracy_score(y_train, y_train_pred)
print('accuracy :', np.round(accuracy,3))
print('precision :', np.round(precision,3))
print('recall :', np.round(recall,3))
print('f_score :', np.round(f_1,3))

accuracy : 0.842
precision : 0.907
recall : 0.655
f_score : 0.761


In [118]:
cv_3_accuracy = cross_val_score(model, x_train , y_train, cv=3, scoring='accuracy')
cv_3_precision = cross_val_score(model, x_train , y_train, cv=3, scoring='precision')
cv_3_recall = cross_val_score(model, x_train , y_train, cv=3, scoring='recall')
cv_3_f1 = cross_val_score(model, x_train , y_train, cv=3, scoring='f1')

print('cv_avg_accuracy :', np.round(cv_3_accuracy.mean(),3))
print('cv_avg_precision', np.round(cv_3_precision.mean(),3))
print('cv_avg_recall', np.round(cv_3_recall.mean(),3))
print('cv_avg_fscore :', np.round(cv_3_f1.mean(),3))

cv_avg_accuracy : 0.813
cv_avg_precision 0.829
cv_avg_recall 0.652
cv_avg_fscore : 0.728


# test

In [37]:
y_test_pred = model.predict(x_test)
y_test_pred = y_test_pred.astype(int)

In [38]:

submission_df = pd.DataFrame({'Passengerid': passenger_id[891:], 'Survived': y_test_pred})
submission_df.to_csv('titanic_pred_9.csv', index = False)
