In [27]:
# import packages 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os 
import sys
import seaborn as sns
import scipy as sp
import scipy.stats as stats
from scipy.stats import norm
%matplotlib inline

# import pre-processing modules 
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel 

# import classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

# import evaluation modules
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

In [28]:
test_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/test.csv")
train_set = pd.read_csv(r"C:/Users/Yehonatan/PycharmProject/DS/projects/titanic/train.csv")
df_original = pd.concat([train_set,test_set], axis=0, ignore_index=True )


In [29]:
df_train = (train_set.copy()).rename(columns=str.lower)
df_test = (test_set.copy()).rename(columns=str.lower)

df = (df_original.copy()).rename(columns=str.lower)


#df.info()
#df_test.info()

In [30]:
# parse title and fam name from df 
def parse_name(df):
    last_name = df['name'].apply(lambda x : x.split(', ')[0])
    temp = df['name'].apply(lambda x : x.split(', ')[1])
    title = temp.apply(lambda x : x.split('.')[0])
    return title, last_name

def cat_to_numeric(df):
    cat_columns = df.select_dtypes(['object']).columns
    df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])
    return df 

In [31]:
# work on df nan and then concat with original df_train and sort by ID
age_nan = df[df['age'].isna()]
#age_nan.info()
#age_nan = age_nan.sort_values(by=['sex'],ascending=True)
#age_nan = age_nan.sort_values(by=['sibsp'],ascending=False).reset_index(drop=True)
print('df updated nans',df['age'].isna().sum())
print('len age_nan',len(age_nan))


df updated nans 263
len age_nan 263


In [32]:
title, last_name = parse_name(age_nan)
title = title[title == 'Master']
master_index = title.index.to_numpy()

df_young = df[df["age"] < 15 ]
child_mean = round(df_young['age'].mean(),2)
       
for i in master_index:
    df.at[i,'age'] = child_mean 

print('df updated nans',df['age'].isna().sum())
print('len age_nan before update',len(age_nan))

age_nan = df[df['age'].isna()] # update the age_nan

print('len age_nan',len(age_nan))

df updated nans 255
len age_nan before update 263
len age_nan 255


In [33]:
l_p_t = age_nan[age_nan['sibsp'] == 0]
lone_pass = l_p_t[l_p_t['parch'] == 0] # passengers who travel alone and cant determine their age by relatives
#lone_pass

# calc mean by gender
df_male = df[df['sex'] == 'male']
df_female = df[df['sex'] == 'female']
male_mean = round(df_male['age'].mean(),2)
female_mean = round(df_female['age'].mean(),2)

lone_pass_male = lone_pass[lone_pass['sex'] == 'male']
lone_pass_female = lone_pass[lone_pass['sex'] == 'female']

l_p_m_index = (lone_pass_male.index).to_numpy()
l_p_f_index = (lone_pass_female.index).to_numpy()

for m in l_p_m_index:
    df.at[m,'age'] = male_mean

for f in l_p_f_index:
    df.at[f,'age'] = female_mean
    
age_nan = df[df['age'].isna()] # update the age_nan

print('len age_nan',len(age_nan))


len age_nan 56


In [34]:
# one companion in nan df # assign the age of the companion tot the nan 
one_comp_nan = age_nan[age_nan['parch'] == 0] 
one_comp_nan = one_comp_nan[one_comp_nan['sibsp'] == 1]
title_ocn, last_name_ocn = parse_name(one_comp_nan)
ocn_index = (title_ocn.index).to_numpy()


one_comp = df[df['parch'] == 0] # one companion in general df may contain istself. but some companions are both nan in age
one_comp = one_comp[one_comp['sibsp'] == 1]
#one_comp = one_comp[~one_comp['age'].isna()]
title_oc, last_name_oc = parse_name(one_comp)
oc_index = (title_oc.index).to_numpy()

for l in range(len(one_comp_nan)):
    for i in range(len(one_comp)):
        
        if (list(last_name_ocn)[l] == list(last_name_oc)[i] and ocn_index[l] != oc_index[i]): #ensures same last name and not same person
        
            if df.at[oc_index[i],'age'] != float('nan'):
                df.at[ocn_index[l],'age'] =  df.at[oc_index[i],'age'] 
            if list(title_ocn)[l] == 'Mrs' : # she and her husbnad are adults and get the adult average
                df.at[ocn_index[l],'age'] = female_mean
                df.at[oc_index[i],'age'] = male_mean
            if list(title_ocn)[l] == 'Miss': # she is a miss and I assume that travels with a sibling so get young average
                df.at[ocn_index[l],'age'] = child_mean
                df.at[oc_index[i],'age'] = child_mean
            if list(title_ocn)[l] == 'Mr' and list(title_oc)[i] != 'Mrs':
                df.at[ocn_index[l],'age'] = child_mean
                df.at[oc_index[i],'age'] = child_mean


age_nan = df[df['age'].isna()] # update the age_nan

print('len age_nan',len(age_nan))

len age_nan 29


In [35]:
# notice there are only two famillies with 3 siblings travelling together I assume they are all young <15.
three_siblings = age_nan[age_nan['parch'] == 0] 
three_siblings = three_siblings[three_siblings['sibsp'] == 2]
index_temp = three_siblings.index.to_numpy()
for i in index_temp:
    df.at[i,'age'] = child_mean

age_nan = df[df['age'].isna()] # update the age_nan

print('len age_nan',len(age_nan))


len age_nan 23


In [36]:
# hand fill, thought it will be quicker than thinking about an algo  
#parents
df.at[1233, 'age'] = male_mean
df.at[1256, 'age'] = female_mean
#kids
sages = df[df['ticket'] == 'CA. 2343']
sages = sages[sages['age'].isna()]

index_temp = sages.index.to_numpy()
for i in index_temp:
    df.at[i,'age'] = child_mean
age_nan = df[df['age'].isna()] # update the age_nan


In [37]:
df.at[1023, 'age'] = female_mean
lebfre = df[df['ticket'] == '4133']
lebfre = lebfre[lebfre['age'].isna()]
index_temp = lebfre.index.to_numpy()
for i in index_temp:
    df.at[i,'age'] = child_mean
age_nan = df[df['age'].isna()] # update the age_nan


In [38]:
#jhonston
#parents
df.at[783, 'age'] = male_mean
df.at[924, 'age'] = female_mean
df.at[888,'age'] = child_mean
age_nan = df[df['age'].isna()] # update the age_nan

In [39]:
df.at[1024, 'age'] = male_mean # has no sibsp as I looked
df.at[128, 'age'] = child_mean
df.at[593, 'age'] = child_mean
df.at[166, 'age'] = female_mean
df.at[533, 'age'] = female_mean
df.at[593, 'age'] = female_mean
df.at[1116, 'age'] = female_mean
df.at[140, 'age'] = female_mean
age_nan = df[df['age'].isna()] # update the age_nan

In [40]:
#pd.set_option('display.max_rows', len(df))
#df.sort_values(by=['cabin'],ascending=False)


In [41]:
# fill fare nan as the average of 3 class . 
t_class = df[df['pclass'] == 3]
t_class_mean = t_class['fare'].mean()
df.at[1043, 'fare'] = t_class_mean


In [42]:
df.at[61, 'embarked'] = 'M'
df.at[829, 'embarked'] = 'M'

In [43]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  1309 non-null   int64  
 1   survived     891 non-null    float64
 2   pclass       1309 non-null   int64  
 3   name         1309 non-null   object 
 4   sex          1309 non-null   object 
 5   age          1309 non-null   float64
 6   sibsp        1309 non-null   int64  
 7   parch        1309 non-null   int64  
 8   ticket       1309 non-null   object 
 9   fare         1309 non-null   float64
 10  cabin        295 non-null    object 
 11  embarked     1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [44]:
title, last_name = parse_name(df)
df['name'] = last_name 
df['cabin'] = df['cabin'].fillna(5)
df = cat_to_numeric(df)

In [45]:
#df = df.drop(columns=['cabin', 'name', 'ticket']) # for now drop cabin


In [46]:
# divide back into train test sets 
df_train = df.iloc[:890,:]
df_test = df.iloc[891:,:]

#df_train.info()
#df_test.info()

In [47]:
corr_mat = df_train.corr()
corr_mat['survived'].sort_values(ascending=False)

survived       1.000000
sex            0.543053
cabin          0.270225
fare           0.256995
embarked       0.120736
parch          0.081248
name           0.019045
passengerid   -0.003479
sibsp         -0.035760
ticket        -0.045727
age           -0.065915
pclass        -0.337996
Name: survived, dtype: float64

# train

In [48]:
x_train = df_train.drop(columns = ['survived', 'passengerid'])
y_train = df_train['survived']
x_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    890 non-null    int64  
 1   name      890 non-null    int64  
 2   sex       890 non-null    int64  
 3   age       890 non-null    float64
 4   sibsp     890 non-null    int64  
 5   parch     890 non-null    int64  
 6   ticket    890 non-null    int64  
 7   fare      890 non-null    float64
 8   cabin     890 non-null    int64  
 9   embarked  890 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 69.7 KB


In [49]:
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)

x_test = df_test.drop(columns =  ['survived', 'passengerid'])
x_test = scaler.transform(x_test)


In [243]:
# rf
clf = RandomForestClassifier(n_estimators= 100, max_depth=2, random_state=9)
clf.fit(x_train, y_train)
y_train_pred = clf.predict(x_train)

In [247]:
clf = LogisticRegression(random_state=9, max_iter=500, solver = 'newton-cg') 
clf.fit(x_train, y_train)
y_train_pred = clf.predict(x_train)

In [381]:
clf = SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(x_train, y_train)
y_train_pred = clf.predict(x_train)

In [50]:
clf = SVC(gamma='auto')
clf.fit(x_train, y_train)
y_train_pred = clf.predict(x_train)

In [51]:
precision, recall, f_1, support = precision_recall_fscore_support(y_train, y_train_pred, average='binary')
accuracy = accuracy_score(y_train, y_train_pred)
print('accuracy :', np.round(accuracy,3))
print('precision :', np.round(precision,3))
print('recall :', np.round(recall,3))
print('f_score :', np.round(f_1,3))

accuracy : 0.846
precision : 0.836
recall : 0.746
f_score : 0.788


In [52]:
cv_3_accuracy = cross_val_score(clf, x_train , y_train, cv=3, scoring='accuracy')
cv_3_precision = cross_val_score(clf, x_train , y_train, cv=3, scoring='precision')
cv_3_recall = cross_val_score(clf, x_train , y_train, cv=3, scoring='recall')
cv_3_f1 = cross_val_score(clf, x_train , y_train, cv=3, scoring='f1')

print('cv_avg_accuracy :', np.round(cv_3_accuracy.mean(),3))
print('cv_avg_precision', np.round(cv_3_precision.mean(),3))
print('cv_avg_recall', np.round(cv_3_recall.mean(),3))
print('cv_avg_fscore :', np.round(cv_3_f1.mean(),3))

cv_avg_accuracy : 0.825
cv_avg_precision 0.812
cv_avg_recall 0.716
cv_avg_fscore : 0.757


# Test

In [349]:
y_test_pred = clf.predict(x_test)
y_test_pred = y_test_pred.astype(int)

In [350]:
passenger_id = df_test['passengerid'].values
submission_df = pd.DataFrame({'Passengerid': passenger_id, 'Survived': y_test_pred})
submission_df.to_csv('titanic_pred.csv', index = False)
