### Import libraries

In [472]:
import pandas as pd
import numpy as np
import os

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

In [473]:
if os.name == 'nt':
    data_path = r"D:\Coding_pratice\_Data\titanic"

else:
    data_path = "/Users/admin/_Work/Data/Practice/titanic/"

In [474]:
train = pd.read_csv(os.path.join(data_path,"train.csv"))
test = pd.read_csv(os.path.join(data_path, "test.csv"))

In [475]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [476]:
train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [477]:
def missing_percentage(df, show=True):
    total = df.isnull().sum().sort_values(ascending= False)
    percent = round(total / len(df)* 100, 2)

    return_df = pd.concat([total, percent], axis = 1, keys=['Total NA', 'Percent'])
    if show: print(return_df)

    return return_df

def percent_value_count(df, feature):
    total = df[feature].value_counts(dropna=False)
    percent = round(df[feature].value_counts(dropna= False, normalize=True) * 100, 2)

    return_df = pd.concat([total, percent], axis=1, keys=['Total', ' Percent'])

    return return_df

_ = missing_percentage(train)
_ = missing_percentage(test)


             Total NA  Percent
Cabin             687    77.10
Age               177    19.87
Embarked            2     0.22
PassengerId         0     0.00
Survived            0     0.00
Pclass              0     0.00
Name                0     0.00
Sex                 0     0.00
SibSp               0     0.00
Parch               0     0.00
Ticket              0     0.00
Fare                0     0.00
             Total NA  Percent
Cabin             327    78.23
Age                86    20.57
Fare                1     0.24
PassengerId         0     0.00
Pclass              0     0.00
Name                0     0.00
Sex                 0     0.00
SibSp               0     0.00
Parch               0     0.00
Ticket              0     0.00
Embarked            0     0.00


## Start filling empty value
Begin withh: 
- Embarked
- Age
- Cabin

### Working on Embarked

In [478]:
display(percent_value_count(train, 'Embarked'))
display(train[train.Embarked.isna()])

Unnamed: 0,Total,Percent
S,644,72.28
C,168,18.86
Q,77,8.64
,2,0.22


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [479]:
fig = make_subplots(rows=1, cols=2, subplot_titles=["Training set", "Test set"])

df = [train, test]

for i in range(len(df)):
    for t in px.box(df[i].sort_values(by=['Embarked','Pclass']), x = 'Embarked', y = 'Fare', color = 'Pclass').data:
        fig.add_trace(t, row=(i//2) + 1, col=(i%2) + 1)

fig.update_layout(height=600, width=800 ,boxmode='group', yaxis_title= "Fare ($)"
).update_traces(
    showlegend=False, selector=lambda t: "Fare" not in t.hovertemplate)

fig.show()

## -> Fill missing Embraked value as C

In [480]:
train.Embarked.fillna('C', inplace=True)

## Working on Cabin

In [481]:
print(train.Cabin.isna().sum()/ len(train))
print(test.Cabin.isna().sum()/ len(test))

0.7710437710437711
0.7822966507177034


## Merge train and test cabin

In [482]:
train_prefix = train.drop(columns=['Survived'])
Survived = train['Survived']
all_data = pd.concat([train_prefix, test], axis=0)
all_data.Cabin.fillna('N', inplace=True)
all_data.Cabin = [i[0] for i in all_data.Cabin]
all_data.reset_index(inplace=True, drop=True)

In [483]:
percent_value_count(all_data, "Cabin")

Unnamed: 0,Total,Percent
N,1014,77.46
C,94,7.18
B,65,4.97
D,46,3.51
E,41,3.13
A,22,1.68
F,21,1.6
G,5,0.38
T,1,0.08


In [484]:
fig = px.box(all_data, y='Fare', x='Cabin')
fig.show()

## From Pclass, Fare, Embarked to Cabin

In [485]:
not_na_cabin = all_data[all_data['Cabin'] != "N"]
na_cabin = all_data[all_data['Cabin'] == "N"]
# not_na_cabin.reset_index(inplace=True, drop=True)
# print(not_na_data.loc[np.where(not_na_data['Fare'] == 0)[0]])

means = not_na_cabin.groupby("Cabin")['Fare'].mean()
std = not_na_cabin.groupby("Cabin")['Fare'].std()
upper = means + std
lower = means - std
cabins = pd.concat([std, means, lower, upper], axis=1, keys=['std', 'mean', 'lower', 'upper'])
cabins.dropna(inplace=True)
cabins

Unnamed: 0_level_0,std,mean,lower,upper
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,20.140358,41.244314,21.103956,61.384671
B,115.312993,122.383078,7.070085,237.696072
C,72.912034,107.926598,35.014564,180.838632
D,28.126283,53.007339,24.881056,81.133622
E,37.738225,54.564634,16.82641,92.302859
F,12.215124,18.079367,5.864242,30.294491
G,3.416419,14.205,10.788581,17.621419


In [486]:
def cabin_estimator(i, cabin_df):
    # Check from lower boundary
    try:
        lower_b = (i - cabin_df['lower']) > 0 # list(bool)
        mean_b = (cabin_df['mean'] - i) > 0
        upper_b = (cabin_df['upper'] - i) > 0
        
        # If in lower boundary
        in_lower = lower_b & mean_b
        # If in upper boundary
        in_upper = upper_b & ~mean_b

        # Process in lower
        if sum(in_lower) >= 1:
            close_to_mean = abs(cabin_df[in_lower]['mean'] - i)
            chosen_cabin = close_to_mean.idxmin()

        # Process out of lower
        else:
            chosen_cabin = 'G'

        # Process in upper
        if sum(in_upper) >= 1:
            close_to_mean = abs(cabin_df[in_upper]['mean'] - i)
            chosen_cabin = close_to_mean.idxmin()

        # Process out of upper
        else:
            chosen_cabin = 'B'

    except:
        print(sum(lower_b))
        print(i)

    return chosen_cabin

In [487]:
na_cabin['Cabin'] = na_cabin.Fare.apply(lambda x: cabin_estimator(x, cabins))

all_data = pd.concat([na_cabin, not_na_cabin], axis=0)
all_data.sort_values(by='PassengerId', inplace=True)
# all_data.reset_index(inplace=True, drop=True)

train = all_data[:891]
test = all_data[891:]
train['Survived'] = Survived

In [488]:
"""
    Filling empty Fare in Test
"""
display(test[test.Fare.isna()])
filling_value = all_data[
    (all_data.Pclass == 3) &
    (all_data.Sex == 'male') &
    (all_data.Embarked =='S') &
    (all_data.Cabin == 'B')
    ].Fare.mean()
filling_value

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1043,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,B,S


9.300610652920966

In [489]:
test.Fare.fillna(filling_value, inplace=True)

## Gender and Survived

In [490]:
fig = px.histogram(train, x='Sex', text_auto='True',
                    color='Survived',barmode='group'
                    )

fig.show()

## Pclass and Survived

In [491]:
temp = train[['Pclass', 'Survived', 'PassengerId']].groupby(['Pclass', 'Survived']).count().reset_index()
temp_df = pd.pivot_table(temp, values = 'PassengerId', index = 'Pclass',columns = 'Survived')
names = ['No', 'Yes']
temp_df.columns = names
idx_name = ['Upper', 'Middle', ' Lower']
temp_df.index = idx_name
for idx in temp_df.index:
    temp_df.loc[idx] = round(temp_df.loc[idx] / (temp_df.loc[idx].sum()) * 100, 2)

fig = px.histogram(temp_df, text_auto='True',
    x=temp_df.index, y=[temp_df['No'], temp_df['Yes']],
    labels={'variable':'Survived'}
    )

fig.update_layout( xaxis_title="Pclass", yaxis_title="Percentage", height=600, width=800)
fig.show()

In [492]:
group_labels=['survived', 'not survived']

fig = ff.create_distplot(
    [train.Pclass[train.Survived == 1], 
    train.Pclass[train.Survived == 0]], 
    bin_size=.3,
    group_labels=group_labels,
    # show_hist=False
)

fig.update_layout(title_text='Passenger class distribution - Survived & Non-survived',
    height=600, width=800
    )
fig.show()

In [493]:
fig = ff.create_distplot(
    [train.Fare[train.Survived == 1], 
    train.Fare[train.Survived == 0]], 
    group_labels=group_labels, bin_size=4,
    show_hist=False,
    # show_rug=False,
    curve_type='kde'
)

fig.update_layout(title='Fare distribution Survived vs Non-Survived',height=600, width=800)
fig.show()

In [494]:
temp_df = train.copy()
temp_df.dropna(inplace=True, subset='Age')

fig = ff.create_distplot(
    [temp_df.Age[temp_df.Survived == 1], 
    temp_df.Age[temp_df.Survived == 0]], 
    group_labels=group_labels, 
    # bin_size=.0125,
    show_hist=False,
    curve_type='kde'
)

fig.update_layout(
    title='Age distribution Survived vs Non-Survived',
    height=600, width=800
    )
fig.show()

In [495]:

fig = px.histogram(
    train,
    x ='Age',
    color='Survived',
    facet_col='Sex',
    facet_row='Survived',
    nbins=16, 
    # text_auto=True
)
fig.update_layout(
    bargap=0.2, title='Survived by Sex and Age',
    height=600, width=800
)
fig.show()

In [496]:

fig = px.histogram(
    train,
    x ='Age',
    color='Survived',
    facet_col='Sex',
    facet_row='Embarked',
    nbins=16, 
    # text_auto=True,
    barmode='stack'
)
fig.update_layout(
    bargap=0.2, title='Survived by Sex and Age',
    height=800, width=800
)
fig.show()

In [497]:
temp = train[['Embarked', 'Survived', 'Sex', 'PassengerId']].groupby(['Embarked', 'Sex', 'Survived']).count().reset_index()
temp_df = pd.pivot_table(temp, values = 'PassengerId', index = ['Embarked', 'Sex'],columns = ['Survived'])
for idx in temp_df.index:
    temp_df.loc[idx, 'Total'] = temp_df.loc[idx].sum()
    temp_df.loc[idx, 0:1]= round(temp_df.loc[idx, 0:1] / temp_df.loc[idx, 'Total'] * 100, 2)

temp_df


In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior)



Unnamed: 0_level_0,Survived,0,1,Total
Embarked,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C,female,12.0,88.0,75.0
C,male,69.47,30.53,95.0
Q,female,25.0,75.0,36.0
Q,male,92.68,7.32,41.0
S,female,31.03,68.97,203.0
S,male,82.54,17.46,441.0


In [498]:
temp_df = train.copy()
temp_df["Survived"] = temp_df["Survived"].astype(str)

fig = px.scatter(
    temp_df,
    x = 'Fare',
    y = 'Age',
    color='Survived',
    facet_col = 'Sex'
)

fig.update_layout(
    title='Survived by Sex, Age and Fare',
    height=500, width=900 
)
fig.show()

In [499]:
# Remove outlier Fare value
train = train[train.Fare < 500]

temp_df = train[['Parch', 'Survived', 'PassengerId']].groupby('Parch').agg(
    mean=('Survived', 'mean'),
    total_samples=('PassengerId', 'count'),
    std=('Survived', 'std')
)
temp_df.fillna(0, inplace=True)
temp_df['ste'] = temp_df['std'] / np.sqrt(temp_df['total_samples'])

fig = px.line(
    temp_df,
    x= sorted(train['Parch'].unique()),
    y= temp_df['mean'],
    error_y= temp_df['ste'],
    markers=True
)
fig.update_layout(
    xaxis_title='Parch',
    yaxis_title='Survived',
    title='Factorplot of Parent/Children survived',
    yaxis_range=[0,1]
    # height=800, width=600
)

fig.show()

In [500]:
temp_df = train[['SibSp', 'Survived', 'PassengerId']].groupby('SibSp').agg(
    mean=('Survived', 'mean'),
    total_samples=('PassengerId', 'count'),
    std=('Survived', 'std')
)
temp_df.fillna(0, inplace=True)
temp_df['ste'] = temp_df['std'] / np.sqrt(temp_df['total_samples'])

fig = px.line(
    temp_df,
    x= sorted(train['SibSp'].unique()),
    y= temp_df['mean'],
    error_y= temp_df['ste'],
    markers=True
)
fig.update_layout(
    xaxis_title='SibSp',
    yaxis_title='Survived',
    title='Factorplot of Sibling/Spouses survived',
    # yaxis_range=[0,1],
    # height=800, width=600
)

fig.show()

In [501]:
train['Sex'] = train['Sex'].apply(lambda x: 0 if x == 'female' else 1)
test['Sex'] = test['Sex'].apply(lambda x: 0 if x == 'female' else 1)

In [502]:
corr = train.corr()
corr = np.round(corr, 4)
mask = np.triu(np.ones_like(corr, dtype=bool))
df_mask = corr.mask(mask)

fig = ff.create_annotated_heatmap(z=df_mask.to_numpy(), 
                                  x=df_mask.columns.tolist(),
                                  y=df_mask.columns.tolist(),
                                  colorscale=px.colors.diverging.RdBu,
                                  hoverinfo="none", #Shows hoverinfo for null values
                                  showscale=True, ygap=1, xgap=1
                                 )

fig.update_xaxes(side="bottom")

fig.update_layout(
    title_text='Heatmap correlation', 
    title_x=0.5, 
    # width=1000, 
    # height=1000,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_zeroline=False,
    yaxis_zeroline=False,
    yaxis_autorange='reversed',
    template='plotly_white'
)

for i in range(len(fig.layout.annotations)):
    if fig.layout.annotations[i].text == 'nan':
        fig.layout.annotations[i].text = ""

fig.show()

In [503]:
import random
random.seed(123)

male = train[train.Sex == 1]
female = train[train.Sex == 0]

m_mean_samples = []
f_mean_samples = []

for i in range(50):
    m_mean_samples.append(np.mean(random.sample(list(male['Survived']), 50)))
    f_mean_samples.append(np.mean(random.sample(list(female['Survived']), 50)))

print("Male sample mean: {}".format(round(np.mean(m_mean_samples), 2)))
print("Female sample mean: {}".format(round(np.mean(f_mean_samples), 2)))
print("Difference between male and female mean sample mean: {}".format(round(np.mean(f_mean_samples) - np.mean(m_mean_samples),2)))

Male sample mean: 0.18
Female sample mean: 0.76
Difference between male and female mean sample mean: 0.57


## Feature engineering

In [504]:
def get_title(name):
    x = name.split('.')[0]
    x = x.split(',')[1]
    x = x.strip()

    return x

def get_surname(name):
    x = name.split(',')[0]

    return x

def name_conversion(x):
    result = ''
    if x in [
        'Don', 'Dona', 'Rev', 'Dr', 'Major', 'Sir', 'Col', 'Capt', 'Jonkheer', 'Lady', 'the Countess']:
        result = 'rare'

    elif x in ['Ms', 'Mlle', 'Mme']:
        result = 'Miss'

    else:
        result = x

    return result


train['title'] = train.Name.apply(lambda x: get_title(x))
test['title'] = test.Name.apply(lambda x: get_title(x))

train['surname'] = train.Name.map(get_surname)
test['surname'] = test.Name.map(get_surname)

In [505]:
fig = px.histogram(train, x='title', color='Survived')

fig.update_layout(title='Title distribution', height=600, width=800)
fig.show()

In [506]:
train['title'] = train.title.apply(lambda x: name_conversion(x))
test['title'] = test.title.apply(lambda x: name_conversion(x))

In [507]:
train['family_size'] = train.Parch + train.SibSp + 1
test['family_size'] = test.Parch + test.SibSp + 1

def family_group(x):
    f_g = ''
    if x <= 1:
        f_g = 'loner'

    elif x <= 4:
        f_g = 'small' 

    else:
        f_g = 'large'
    
    return f_g

train['family_group'] = train['family_size'].map(family_group)
test['family_group'] = test['family_size'].map(family_group)

train['is_alone'] = [1 if i < 2 else 0 for i in train.family_size]
test['is_alone'] = [1 if i < 2 else 0 for i in test.family_size]

In [508]:
fig = px.histogram(
    train,
    x= 'family_group',
    color='Survived',
    barmode='group'
)

fig.update_layout(
    title='Family group',
    bargap=0.2,
    height=600,
    width=600
)
fig.show()

In [509]:
train.drop(['Ticket'], inplace=True, axis=1)
test.drop(['Ticket'], inplace=True, axis=1)

train['calculated_fare'] = train.Fare / train.family_size
test['calculated_fare'] = test.Fare / test.family_size

In [510]:
def fare_group(fare):
    re = ''
    if fare <= 4:
        re = 'very_low'
    elif fare <= 10:
        re = 'low'
    elif fare <= 20:
        re = 'mid'
    elif fare <= 45:
        re = 'high'
    else: 
        re = 'very_high'
    
    return re

In [511]:
train['fare_group'] = train.calculated_fare.map(fare_group)
test['fare_group'] = test.calculated_fare.map(fare_group)

In [512]:
fig = px.histogram(
    train,
    x='fare_group',
    color='Survived',
    facet_row='Pclass',
    facet_col='Sex',
    category_orders=dict(fare_bins=["very_low", "low", "mid", "high", "very_high"]),
    nbins=50
)

fig.update_layout(
    bargap= .2,
    title='Fare, Pclass and Sex',
    height= 600
)
fig.show()

In [513]:
train_PassengerId = train['PassengerId']
test_PassengerId = test['PassengerId']
train.drop(['PassengerId'], axis=1, inplace=True)
test.drop(['PassengerId'], axis=1, inplace=True)

train_len = len(train)
train_prefix = train.drop(columns=['Survived'])
Survived = train['Survived']
all_data = pd.concat([train_prefix, test], axis=0)

all_data = pd.get_dummies(all_data, columns=['title',"Pclass", 'Cabin','Embarked', 'family_group', 'fare_group'], drop_first=False)
# test = pd.get_dummies(test, columns=['title',"Pclass",'Cabin','Embarked', 'family_group', 'fare_group'], drop_first=False)
all_data.drop(
    [
        'Name', 
        # 'family_size',
        # 'Fare',
        'surname'], axis=1, inplace=True)
# test.drop(
#     [
#         'Name',
#         # 'family_size',
#         # "Fare",
#         'surname'], axis=1, inplace=True)
# train = all_data[:train_len]
# test = all_data[train_len:]
# train['Survived'] = Survived
# train = pd.concat([train[["Survived", "Age", "Sex","SibSp","Parch", 'Fare', 'family_size']], train.loc[:,"is_alone":]], axis=1)
all_data = pd.concat([all_data[["Age", "Sex"]], all_data.loc[:,"SibSp":]], axis=1)

In [514]:
"""
    Filling age with a predictor
"""
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import mean_squared_error, make_scorer

from pytorch_tabnet.tab_model import TabNetRegressor

def completing_age(df, export_rs=True):
    max_depth = list(range(2,5))
    n_estimator = list(range(150, 200, 5))
    criterion = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']

    param = {
        'n_estimators': n_estimator,
        'max_depth': max_depth,
        'criterion': criterion
    }

    age_df = df.loc[:, 'Age':]
    temp_train = age_df.loc[age_df.Age.notnull()]
    temp_inference = age_df.loc[age_df.Age.isnull()]

    X, y = temp_train.loc[:, 'Sex':].values, temp_train['Age'].values
    
    rfr = RandomForestRegressor(n_estimators=1500, n_jobs=-1)
    if export_rs:
        rfr.fit(X, y)
        predicted_age = rfr.predict(temp_inference.loc[:, 'Sex':])
        df.loc[df.Age.isnull(), 'Age'] = predicted_age

        return df

In [515]:
completing_age(all_data)


X has feature names, but RandomForestRegressor was fitted without feature names



Unnamed: 0,Age,Sex,SibSp,Parch,Fare,family_size,is_alone,calculated_fare,title_Master,title_Miss,...,Embarked_Q,Embarked_S,family_group_large,family_group_loner,family_group_small,fare_group_high,fare_group_low,fare_group_mid,fare_group_very_high,fare_group_very_low
0,22.000000,1,1,0,7.2500,2,0,3.625000,0,0,...,0,1,0,0,1,0,0,0,0,1
1,38.000000,0,1,0,71.2833,2,0,35.641650,0,0,...,0,0,0,0,1,1,0,0,0,0
2,26.000000,0,0,0,7.9250,1,1,7.925000,0,1,...,0,1,0,1,0,0,1,0,0,0
3,35.000000,0,1,0,53.1000,2,0,26.550000,0,0,...,0,1,0,0,1,1,0,0,0,0
4,35.000000,1,0,0,8.0500,1,1,8.050000,0,0,...,0,1,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,29.878256,1,0,0,8.0500,1,1,8.050000,0,0,...,0,1,0,1,0,0,1,0,0,0
1305,39.000000,0,0,0,108.9000,1,1,108.900000,0,0,...,0,0,0,1,0,0,0,0,1,0
1306,38.500000,1,0,0,7.2500,1,1,7.250000,0,0,...,0,1,0,1,0,0,1,0,0,0
1307,29.878256,1,0,0,8.0500,1,1,8.050000,0,0,...,0,1,0,1,0,0,1,0,0,0


In [516]:
fig = ff.create_distplot(
    [all_data['Age']],
    group_labels=['Train'],
    
)
fig.update_layout(title='Age distribution')
fig.show()

In [517]:
def age_group(age):
    re = ''
    if age <= 1: re = 'infant'
    elif age <= 4: re = 'toddler'
    elif age <= 13: re = 'child'
    elif age <= 18: re = 'teenager'
    elif age <= 30: re = 'young_adult'
    elif age <= 45: re = 'adult'
    elif age <= 55: re = 'middle_aged'
    elif age <= 65: re = 'senior_citizen'
    else: re = 'old'

    return re

all_data['age_group'] = all_data.Age.map(age_group)
all_data = pd.get_dummies(all_data, columns=['age_group'], drop_first=True)

In [518]:
train = all_data[:train_len]
test = all_data[train_len:]
train['Survived'] = Survived
# train = pd.concat([train[["Survived", "Age", "Sex","SibSp","Parch"]], train.loc[:,"is_alone":]], axis=1)
# test = pd.concat([test[["Age", "Sex"]], test.loc[:,"SibSp":]], axis=1)

In [519]:
X = train.drop(labels=['Survived'], axis=1)
y = train['Survived']

In [520]:
def fill_missing_columns(train, test):
    missing_cols = list(
        set(train.columns) - set(test.columns)
    )
    for col in missing_cols:
        test[col] = 0
    test = test[train.columns]
    return test

# test = fill_missing_columns(X, test)

In [521]:
X, X_test, y, y_test = train_test_split(X, y , test_size=0.25, random_state=123)

In [522]:
from sklearn.preprocessing import StandardScaler

def features_scaling(df_train, df_test, scaling_method=StandardScaler()):

    for feature in df_train.columns:
        # Apply scaling on continuous data only
        if len(df_train[feature].unique()) > 2:
            df_train[feature] = scaling_method.fit_transform(df_train[feature].values.reshape(-1, 1))
            df_test[feature] = scaling_method.transform(df_test[feature].values.reshape(-1, 1))

    return df_train, df_test

X, test = features_scaling(X, test)

## Feature Important

In [523]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear', random_state=123)

C_vals = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,16.5,17,17.5,18]
penalties = ['l1','l2']
cv = StratifiedShuffleSplit(n_splits=10, test_size = .25, random_state=123)
param = {'penalty': penalties, "C":C_vals}

grid_= GridSearchCV(
    logreg, param, scoring='accuracy', cv=cv
)
grid_.fit(X,y)

display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

logreg_grid = grid_.best_estimator_
logreg_grid.fit(X, y)
display(logreg_grid.score(X_test, y_test))

0.8311377245508982

{'C': 0.5, 'penalty': 'l1'}

0.5990990990990991

In [524]:
from sklearn.neighbors import KNeighborsClassifier

k_range = range(1,31)
weight_options = ['uniform', 'distance']
param = {
    'n_neighbors': k_range,
    'weights': weight_options
}
grid_ = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param,
    cv=cv
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

knn_grid = grid_.best_estimator_
display(knn_grid.score(X_test, y_test))

0.8167664670658683

{'n_neighbors': 17, 'weights': 'uniform'}

0.5675675675675675

In [548]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', probability=True)
Cs = [0.001, 0.01, 0.1, 1,1.5,2,2.5,3,4,5, 10]
gammas = [0.0001,0.001, 0.01, 0.1, 1]   
param = {
    'C': Cs,
    'gamma': gammas
}
cv = StratifiedShuffleSplit(n_splits=10, test_size=.25, random_state=123)
grid_ = GridSearchCV(
    svm,
    param_grid=param,
    cv=cv
)
grid_.fit(X, y)

display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

svm_grid = grid_.best_estimator_    
svm_grid.fit(X, y)
display(svm_grid.score(X_test, y_test))

0.8353293413173652

{'C': 2, 'gamma': 0.01}

0.6126126126126126

In [549]:
from sklearn.tree import DecisionTreeClassifier

max_depth = range(1,30)
max_features = list(range(len(X.columns)// 2, len(X.columns))) + ['sqrt']
criterion = ['entropy', 'gini']

param = {
    'max_depth': max_depth,
    'max_features': max_features,
    'criterion': criterion
}

grid_ = GridSearchCV(
    DecisionTreeClassifier(random_state=123),
    param_grid=param,
    cv=cv
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

dtree_grid = grid_.best_estimator_    
dtree_grid.fit(X,y)
display(dtree_grid.score(X_test, y_test))

0.8311377245508982

{'criterion': 'gini', 'max_depth': 3, 'max_features': 42}

0.8243243243243243

In [550]:
from sklearn.metrics import accuracy_score

def permutation_feature(df, feature):
    permuted_df = df.copy(deep=True)

    np.random.seed(123)
    permuted_feature = np.random.permutation(permuted_df[feature])
    permuted_df[feature] = permuted_feature

    return permuted_df

def permutation_importance(X, y, model, metric):
    importances = pd.DataFrame(index=['importance'], columns=X.columns)

    baseline_performance = metric(y, model.predict(X))

    for feature in importances.columns:
        perm_X = permutation_feature(X, feature)
        feature_performance = metric(y, model.predict(perm_X))

        importances[feature]['importance'] = np.abs(baseline_performance - feature_performance)

    return importances

In [551]:
importances = permutation_importance(X, y, dtree_grid, accuracy_score)
importances = importances.loc[:, (importances != 0).any(axis=0)]

fig = px.bar(
    x=importances.columns,
    y=importances.values[0]
)

fig.update_layout(
    title= "Feature importance",
    xaxis={'categoryorder':'total descending'}, 
    xaxis_title='Features',
    yaxis_title='Value'
    )
fig.show()

In [552]:
feature_importants = pd.DataFrame(
    dtree_grid.feature_importances_,
    index = X.columns,
    columns=['Important']
)
feature_importants.sort_values(by='Important', ascending=False).head(10)

Unnamed: 0,Important
title_Mr,0.645038
Pclass_3,0.138809
family_group_large,0.079617
calculated_fare,0.065265
title_rare,0.050587
Cabin_D,0.020684
fare_group_very_high,0.0
Embarked_S,0.0
family_group_loner,0.0
family_group_small,0.0


In [530]:
from sklearn.ensemble import RandomForestClassifier

max_depth = list(range(1,10))
n_estimator = list(range(160, 200, 5))
criterion = ['entropy', 'gini']
param = {
    'n_estimators': n_estimator,
    'max_depth': max_depth,
    'criterion': criterion
}
grid_ = GridSearchCV(
    estimator=RandomForestClassifier(max_features='sqrt', random_state=123),
    param_grid = param,
    cv=cv
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

rforest_grid = grid_.best_estimator_    

display(rforest_grid.score(X_test, y_test))

0.8365269461077844

{'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 165}

0.8198198198198198

In [531]:
feature_importants = pd.DataFrame(
    rforest_grid.feature_importances_,
    index = X.columns,
    columns=['Important']
)
feature_importants.sort_values(by='Important', ascending=False).head(10)

Unnamed: 0,Important
title_Mr,0.17424
Sex,0.149642
Fare,0.084076
calculated_fare,0.072457
Age,0.062001
title_Miss,0.057174
Pclass_3,0.048401
title_Mrs,0.041223
family_size,0.027226
Pclass_1,0.026564


In [541]:
from sklearn.ensemble import BaggingClassifier

n_estimators = list(range(345, 400, 5))
param = {
    "n_estimators": n_estimators
}
grid_ = GridSearchCV(
    BaggingClassifier(base_estimator=None, random_state=123,
    bootstrap=False),
    param_grid = param,
    cv=cv
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

bagging_grid = grid_.best_estimator_    

display(bagging_grid.score(X_test, y_test))

0.7796407185628742

{'n_estimators': 345}

0.8018018018018018

In [534]:
from sklearn.ensemble import AdaBoostClassifier

n_estimators = list(range(80, 129, 5))
learning_r = [0.01, 0.0125, 0.025 ,0.05, 0.075 ,0.1, 0.125,0.5]
param = {
    'n_estimators': n_estimators,
    'learning_rate': learning_r
}
grid_ = GridSearchCV(
    AdaBoostClassifier(random_state=123),
    param_grid=param,
    cv=cv
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

adaboost_grid = grid_.best_estimator_    

display(adaboost_grid.score(X_test, y_test))

0.8323353293413174

{'learning_rate': 0.1, 'n_estimators': 100}

0.7927927927927928

In [535]:
from sklearn.ensemble import ExtraTreesClassifier

max_depth = range(1,30)
# max_features = list(range(len(X.columns)// 2, len(X.columns))) + ['auto']
criterion = ['entropy', 'gini']

param = {
    'max_depth': max_depth,
    # 'max_features': max_features,
    'criterion': criterion
}

grid_ = GridSearchCV(
    ExtraTreesClassifier(random_state=123),
    param_grid=param,
    cv=cv
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

xtree_grid = grid_.best_estimator_    

display(xtree_grid.score(X_test, y_test))

0.8311377245508982

{'criterion': 'entropy', 'max_depth': 6}

0.8198198198198198

In [536]:
from sklearn.ensemble import GradientBoostingClassifier

n_estimators = list(range(500, 1501, 500))
learning_r = [0.0125, 0.05, 0.075 ,0.1, 0.125, 0.25,0.5]
losses = ['log_loss', 'deviance', 'exponential']
param = {
    'n_estimators': n_estimators,
    'learning_rate': learning_r
}
grid_ = GridSearchCV(
    GradientBoostingClassifier(random_state=123),
    param_grid=param,
    cv=cv
)
grid_.fit(X, y)
display(grid_.best_score_)
display(grid_.best_params_)
display(grid_.best_estimator_)

gradboost_grid = grid_.best_estimator_    

display(gradboost_grid.score(X_test, y_test))

0.8233532934131738

{'learning_rate': 0.0125, 'n_estimators': 500}

0.7387387387387387

In [554]:
from pytorch_tabnet.tab_model import TabNetClassifier
import itertools

def tabnet_CV_train(params, X_train, y_train, X_val, y_val):
    
    # estimator_score = []
    # for i, (train_index, val_index) in enumerate(cv.get_n_splits(X, y)):
    #     X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    #     X_val, y_val = X.iloc[val_index], y.loc[val_index]

    tabnet = TabNetClassifier()
    params['n_a'] = params['n_d']
    tabnet.set_params(**params, verbose=0)
    tabnet.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[( X_val, y_val)], eval_name=['valid'],
        max_epochs=500, eval_metric=['accuracy'], patience=20
    )
    return tabnet, params

def holdout_grid_search(model_train, X_train, y_train, X_val, y_val, hyperparam, verbose):
    best_estimator = None
    best_hyperparam = {}
    best_score = 0

    hyper_param_l = list(hyperparam.values())
    combination_l_of_t = list(itertools.product(*hyper_param_l))
    combination_l_of_d = []

    for val_tuple in combination_l_of_t:
        param_d = {}

        for i, k in enumerate(hyperparam):
            param_d[k] = val_tuple[i]
        combination_l_of_d.append(param_d)

    for param_d in combination_l_of_d:
        estimator, param_d = model_train(param_d, X_train.values, y_train.values, X_val.values, y_val.values)

        y_prob = estimator.predict(X_val.values)
        estimator_score = accuracy_score(y_val, y_prob)
        
        if estimator_score > best_score:
            best_score = estimator_score
            best_estimator = estimator
            best_hyperparam = param_d

    if verbose:
        print("hyperparam:")
        display(hyperparam)
        
        print("hyper_param_l")
        display(hyper_param_l)
        
        print("combination_l_of_t")
        display(combination_l_of_t)
        
        print(f"combination_l_of_d")
        display(combination_l_of_d)
        
    print(f"best_hyperparam")
    display(best_hyperparam)
    print(f"best_score: {best_score:.4f}")

    return best_estimator, best_hyperparam


In [555]:
param_grid = dict(n_d = [8, 16],
                  n_a = [8],
                  n_steps = [3, 4, 5],
                  optimizer_params = [dict(lr=0.01), dict(lr=0.02)],
                  gamma = [1, 1.5, 2],
                  lambda_sparse = [1e-2, 1e-3, 1e-4],
                  momentum = [0.3, 0.4, 0.5],
                  n_shared = [2],
                  n_independent = [2],
                  clip_value = [2.],   
)
tabnet_grid, _ = holdout_grid_search(tabnet_CV_train,  X, y, X_test, y_test, param_grid, False)

Device used : cpu

Early stopping occurred at epoch 94 with best_epoch = 74 and best_valid_accuracy = 0.78378
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 101 with best_epoch = 81 and best_valid_accuracy = 0.77928
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 94 with best_epoch = 74 and best_valid_accuracy = 0.77928
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 32 with best_epoch = 12 and best_valid_accuracy = 0.62162
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 125 with best_epoch = 105 and best_valid_accuracy = 0.72523
Best weights from best epoch are automatically used!
Device used : cpu

Early stopping occurred at epoch 32 with best_epoch = 12 and best_valid_accuracy = 0.63964
Best weights from best epoch are automatically used!
Device used : cpu



{'n_d': 8,
 'n_a': 8,
 'n_steps': 3,
 'optimizer_params': {'lr': 0.02},
 'gamma': 1,
 'lambda_sparse': 0.01,
 'momentum': 0.4,
 'n_shared': 2,
 'n_independent': 2,
 'clip_value': 2.0}

best_score: 0.8108


In [538]:
from sklearn.ensemble import VotingClassifier

voting_classifier = VotingClassifier(estimators=[
    ('lr_grid', logreg_grid),
    ('svc', svm_grid),
    ('random_forest', rforest_grid),
    ('gradient_boosting', gradboost_grid),
    ('decision_tree_grid',dtree_grid),
    ('knn_classifier', knn_grid),
#     ('XGB_Classifier', XGBClassifier),
    ('bagging_classifier', bagging_grid),
    ('adaBoost_classifier',adaboost_grid),
    ('ExtraTrees_Classifier', xtree_grid)
],voting='hard')

voting_cls = voting_classifier.fit(X, y)

In [539]:
all_models = [
    logreg_grid,
    svm_grid,
    rforest_grid,
    gradboost_grid,
    dtree_grid,
    knn_grid,
    bagging_grid,
    adaboost_grid,
    xtree_grid,
    # tabnet_grid,
    voting_cls
]
c = {}

for i in all_models:
    r = i.predict(X_test)
    s = accuracy_score(y_test, r)
    c[i] = s

In [557]:
# result = (max( c, key=c.get)).predict(test)
result = tabnet_grid.predict(test.values)
print(len(result))
submission = pd.DataFrame(
    {
        "PassengerId": test_PassengerId,
        "Survived":result
    }
)

submission.Survived = submission.Survived.astype('int')
csv_path = os.path.join(data_path, 'Tabnet_CLS_submission.csv')
submission.to_csv(csv_path, index=False)

418
