In [565]:
# import packages 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os 
import sys
import seaborn as sns
import scipy as sp
import scipy.stats as stats
from scipy.stats import norm
%matplotlib inline

# import pre-processing modules 
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel 

# import classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

# import evaluation modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [566]:
# Load the Titanic train/test CSV files.
# The directory used to be a hard-coded absolute path on one machine; it can now
# be overridden with the TITANIC_DATA_DIR environment variable. The default is
# the original location, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    "TITANIC_DATA_DIR",
    r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic",
)
test_set = pd.read_csv(DATA_DIR + "/test.csv")
train_set = pd.read_csv(DATA_DIR + "/train.csv")

# Stack train on top of test with a fresh 0..n-1 index; the row labels of this
# combined frame are what the later imputation cells address.
df_original = pd.concat([train_set, test_set], axis=0, ignore_index=True)


In [567]:
# Work on copies whose column names are lower-cased, leaving the frames loaded
# from disk untouched. Order of the targets matches the order of the sources.
df_train, df_test, df = (
    raw_frame.copy().rename(columns=str.lower)
    for raw_frame in (train_set, test_set, df_original)
)


#df.info()
#df_test.info()

In [568]:
# parse title and fam name from df 
def parse_name(df):
    """Split the ``name`` column into a title and a family name.

    Names are expected to look like ``"<last name>, <title>. <given names>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column called ``name``.

    Returns
    -------
    title : pandas.Series
        For each row, the text after the first ``', '`` (up to the next
        ``', '`` if there is one), cut at its first ``'.'``.
    last_name : pandas.Series
        For each row, the text before the first ``', '``.

    Both series keep the index of ``df``. A missing name, or a name without
    ``', '``, yields NaN for the part that cannot be extracted instead of
    raising, so one malformed row no longer aborts the whole cell.
    """
    # Vectorised string ops replace the per-row lambdas; `.str[i]` returns NaN
    # when the split produced fewer than i + 1 pieces.
    pieces = df['name'].str.split(', ')
    last_name = pieces.str[0]
    # Keep object dtype so `.str` stays usable even if every entry came back NaN.
    after_comma = pieces.str[1].astype(object)
    title = after_comma.str.split('.').str[0]
    return title, last_name



In [569]:
# Collect the rows of df whose age is missing. The following cells fill these
# ages in df step by step and rebuild age_nan after each step.
missing_age_mask = df['age'].isna()
age_nan = df.loc[missing_age_mask]
print('df updated nans', missing_age_mask.sum())
print('len age_nan', len(age_nan))


df updated nans 263
len age_nan 263


In [570]:
# Rows with a missing age whose title is "Master" are treated as children:
# they receive the mean age of all passengers with a known age below 15.
title, last_name = parse_name(age_nan)
title = title.loc[title.eq('Master')]
master_index = title.index.to_numpy()

df_young = df.loc[df['age'].lt(15)]
child_mean = round(df_young['age'].mean(), 2)

# One label-based assignment instead of a row-by-row loop.
df.loc[master_index, 'age'] = child_mean

print('df updated nans', df['age'].isna().sum())
print('len age_nan before update', len(age_nan))

# Refresh the view of rows that still lack an age.
age_nan = df.loc[df['age'].isna()]

print('len age_nan', len(age_nan))

df updated nans 255
len age_nan before update 263
len age_nan 255


In [571]:
# Passengers with a missing age who travel alone (sibsp == 0 and parch == 0):
# their age cannot be inferred from relatives, so use the mean age of their sex.
l_p_t = age_nan.loc[age_nan['sibsp'].eq(0)]
lone_pass = l_p_t.loc[l_p_t['parch'].eq(0)]

# Mean age per sex over the whole of df; mean() skips rows whose age is missing.
df_male = df.loc[df['sex'].eq('male')]
df_female = df.loc[df['sex'].eq('female')]
male_mean = round(df_male['age'].mean(), 2)
female_mean = round(df_female['age'].mean(), 2)

lone_pass_male = lone_pass.loc[lone_pass['sex'].eq('male')]
lone_pass_female = lone_pass.loc[lone_pass['sex'].eq('female')]

l_p_m_index = lone_pass_male.index.to_numpy()
l_p_f_index = lone_pass_female.index.to_numpy()

# Label-based bulk assignment replaces the two per-row loops.
df.loc[l_p_m_index, 'age'] = male_mean
df.loc[l_p_f_index, 'age'] = female_mean

# Refresh the view of rows that still lack an age.
age_nan = df.loc[df['age'].isna()]

print('len age_nan', len(age_nan))


len age_nan 56


In [572]:
# One companion: passengers with a missing age who have sibsp == 1 and parch == 0.
# Goal: assign the companion's age to the passenger whose age is missing; when
# the companion's age is missing too, fall back to a title-based mean.
one_comp_nan = age_nan[age_nan['parch'] == 0]
one_comp_nan = one_comp_nan[one_comp_nan['sibsp'] == 1]
title_ocn, last_name_ocn = parse_name(one_comp_nan)
ocn_index = (title_ocn.index).to_numpy()

# Candidate companions: every passenger in df with the same sibsp/parch pattern.
# This set contains the target rows themselves, and a candidate's age may also
# be missing.
one_comp = df[df['parch'] == 0]
one_comp = one_comp[one_comp['sibsp'] == 1]
title_oc, last_name_oc = parse_name(one_comp)
oc_index = (title_oc.index).to_numpy()

# Materialise (row label, title, last name) once instead of rebuilding
# list(series) on every iteration of the nested loop.
_targets = list(zip(ocn_index, title_ocn, last_name_ocn))
_candidates = list(zip(oc_index, title_oc, last_name_oc))

# NOTE(review): companions are matched by last name only, so unrelated
# passengers sharing a surname can be paired - consider also matching on ticket.
for target_idx, target_title, target_last_name in _targets:
    for cand_idx, cand_title, cand_last_name in _candidates:
        # Stop once this passenger has an age (possibly set earlier, when they
        # were filled as someone else's companion) so it is never overwritten.
        if pd.notna(df.at[target_idx, 'age']):
            break
        # Same last name and not the same person.
        if cand_last_name != target_last_name or cand_idx == target_idx:
            continue

        companion_age = df.at[cand_idx, 'age']
        # The original test `!= float('nan')` was always True; pd.notna is the
        # correct missing-value check.
        if pd.notna(companion_age):
            # Known companion age: copy it and leave the companion's observed
            # value untouched (it used to be replaced by a group mean).
            df.at[target_idx, 'age'] = companion_age
        elif target_title == 'Mrs':
            # She and her husband are adults and get the adult averages.
            df.at[target_idx, 'age'] = female_mean
            df.at[cand_idx, 'age'] = male_mean
        elif target_title == 'Miss':
            # A "Miss" is assumed to travel with a sibling: both get the child mean.
            df.at[target_idx, 'age'] = child_mean
            df.at[cand_idx, 'age'] = child_mean
        elif target_title == 'Mr' and cand_title != 'Mrs':
            # A "Mr" whose companion is not a "Mrs" is assumed to be a sibling pair.
            df.at[target_idx, 'age'] = child_mean
            df.at[cand_idx, 'age'] = child_mean


age_nan = df[df['age'].isna()] # update the age_nan

print('len age_nan',len(age_nan))

len age_nan 29


In [573]:
# Author's observation: only two families remain with three siblings travelling
# together (sibsp == 2, parch == 0); they are all assumed to be younger than 15
# and therefore get the child mean.
three_siblings = age_nan.loc[age_nan['parch'].eq(0) & age_nan['sibsp'].eq(2)]
index_temp = three_siblings.index.to_numpy()
df.loc[index_temp, 'age'] = child_mean

# Refresh the view of rows that still lack an age.
age_nan = df.loc[df['age'].isna()]

print('len age_nan', len(age_nan))


len age_nan 23


In [575]:
# Hand fill: the author judged this quicker than designing an algorithm.
# Parents first, so they are no longer picked up by the "missing age" filter below.
df.loc[1233, 'age'] = male_mean
df.loc[1256, 'age'] = female_mean

# Kids: everyone else on ticket 'CA. 2343' whose age is still missing.
sages = df.loc[df['ticket'].eq('CA. 2343') & df['age'].isna()]
index_temp = sages.index.to_numpy()
df.loc[index_temp, 'age'] = child_mean

# Refresh the view of rows that still lack an age.
age_nan = df.loc[df['age'].isna()]


In [576]:
# Row 1023 gets the female mean first, so it is excluded from the filter below.
df.loc[1023, 'age'] = female_mean

# Remaining passengers on ticket '4133' with a missing age get the child mean.
lebfre = df.loc[df['ticket'].eq('4133') & df['age'].isna()]
index_temp = lebfre.index.to_numpy()
df.loc[index_temp, 'age'] = child_mean

# Refresh the view of rows that still lack an age.
age_nan = df.loc[df['age'].isna()]


In [577]:
# Johnston family (hand-picked row labels): two parents and one child.
johnston_fill = {
    783: male_mean,    # parent
    924: female_mean,  # parent
    888: child_mean,   # child
}
for row_label, filled_age in johnston_fill.items():
    df.at[row_label, 'age'] = filled_age

# Refresh the view of rows that still lack an age.
age_nan = df.loc[df['age'].isna()]

In [582]:
# Remaining hand-picked rows.
df.at[1024, 'age'] = male_mean  # has no sibsp, per the author's manual check
df.at[128, 'age'] = child_mean

# NOTE(review): the original first set row 593 to child_mean and then to
# female_mean, so only female_mean ever took effect. That final state is kept
# here; confirm whether one of the two 593 entries was meant to be another row.
for row_label in (166, 533, 593, 1116, 140):
    df.at[row_label, 'age'] = female_mean

# Refresh the view of rows that still lack an age.
age_nan = df.loc[df['age'].isna()]