# Introduction

Titanic



# The Package

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px # For histograms
import plotly.graph_objects as go # For pie charts

from scipy.stats import chi2_contingency, kruskal, f_oneway, normaltest, bartlett

# Explore Dataset

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

display(train_df.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
DROP = ['Name', 'PassengerId','Cabin']    #irrelevant 
train_df = train_df.drop(columns=DROP)

In [5]:
display(train_df.head())

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,female,35.0,1,0,113803,53.1,S
4,0,3,male,35.0,0,0,373450,8.05,S


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


Now that in Age columns there are some null values, which must be dropped or filled with a mean value of the entire existing column. 

In [7]:
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df = train_df.dropna(subset=['Embarked'])

train_df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [8]:
train_df.shape

(889, 9)

In [14]:
CAT_FEATURES = ['Sex','Ticket','Embarked']
NUM_FEATURES = ['Pclass','Age','SibSp','Parch','Fare']

LABEL = 'Survived'
FEATURES = train_df.columns.to_list()
FEATURES.remove(LABEL)

In [15]:
for feature in NUM_FEATURES:
    box_by_label = px.box(train_df, x=LABEL, y=feature, title=feature + " by survive/die")
    box_by_label.show()

Among the numerical features, SibSp does not look like it is useful to predict survival output. 

In [18]:
train_df.corr(method="spearman")

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.336917,-0.041027,0.090944,0.140126,0.320772
Pclass,-0.336917,1.0,-0.314204,-0.045279,-0.024696,-0.686166
Age,-0.041027,-0.314204,1.0,-0.143847,-0.2166,0.121971
SibSp,0.090944,-0.045279,-0.143847,1.0,0.449607,0.450749
Parch,0.140126,-0.024696,-0.2166,0.449607,1.0,0.413106
Fare,0.320772,-0.686166,0.121971,0.450749,0.413106,1.0


#Correlation

Only Pclass and Fare seem to be correlated with the survival result as the absolute values of the outputs are over 0.357 and 0.33 respectively.

The Fare is highly correlated with the Pclass, followed by SibSp and Parch. 

SibSp is also correlated with Parch

In [19]:
SIG = 0.05
MOD_SIG = 0.1

survival = train_df[train_df[LABEL] == 1]
death = train_df[train_df[LABEL] == 0]

In [22]:
# Find out which features can be assessed using ANOVA
for col in NUM_FEATURES:
    pop1 = survival[col]
    pop2 = death[col]
    stat1, p1 = normaltest(pop1)
    stat2, p2 = normaltest(pop2)
    if p1 > SIG and p2 > SIG:
        stat, p = bartlett(pop1, pop2)
        if p > SIG:
            print(col, "meets ANOVA assumptions")
        else:
            print(col, "--> Kruskal-Wallis, variance is unequal:", p)
    else:
        print(col,  "--> Kruskal-Wallis, not normally distributed:", p1, p2)


Pclass --> Kruskal-Wallis, not normally distributed: 0.0 4.465405162782553e-20
Age --> Kruskal-Wallis, not normally distributed: 0.08338285310821116 9.34541840078704e-15
SibSp --> Kruskal-Wallis, not normally distributed: 1.5799326846744402e-33 3.7991122810189133e-94
Parch --> Kruskal-Wallis, not normally distributed: 1.0694692584470984e-27 7.8105068599282235e-90
Fare --> Kruskal-Wallis, not normally distributed: 7.892442029445252e-68 1.4147916196682132e-118


In [23]:
for col in NUM_FEATURES:
    pop1 = survival[col]
    pop2 = death[col]
    stat, p = f_oneway(pop1, pop2)
    if p <= SIG:
        print(col, "and label are not independent - keep, p =", p)
    elif p <= MOD_SIG:
        print(col, "and label may have some relationship - maybe keep, p =", p)
    else:
        print(col, "and label are independent - drop, p =", p)

Pclass and label are not independent - keep, p = 7.776916288562695e-25
Age and label are not independent - keep, p = 0.037395225426059005
SibSp and label are independent - drop, p = 0.31067537341133067
Parch and label are not independent - keep, p = 0.013136766201093403
Fare and label are not independent - keep, p = 1.0797887540536882e-14


In [25]:
for col in FEATURES:
    pop1 = survival[col]
    pop2 = death[col]
    stat, p = kruskal(pop1, pop2)
    if p <= SIG:
        print(col, "and label are not independent - keep, p =", p)
    elif p <= MOD_SIG:
        print(col, "and label may have some relationship - maybe keep, p =", p)
    else:
        print(col, "and label are independent - drop, p =", p)

Pclass and label are not independent - keep, p = 1.0178648698620555e-23
Sex and label are not independent - keep, p = 1.3601519476403678e-58
Age and label are independent - drop, p = 0.2214856745252243
SibSp and label are not independent - keep, p = 0.006726926101742026
Parch and label are not independent - keep, p = 2.9711077819070433e-05
Ticket and label are not independent - keep, p = 1.9663317989359804e-06
Fare and label are not independent - keep, p = 1.191429108987401e-21
Embarked and label are not independent - keep, p = 1.319392291772178e-06


# Rescale and Dummy encode train_df

In [None]:
def rescale(x, MIN, MAX):
    return (x-MIN)/(MAX-MIN)

def rescale_df(df,num_features):
    for feature in num_features:
        min_val = df[feature].min()
        max_val = df[feature].max()
        df[feature] = df[feature].apply(lambda x: rescale(x,min_val, max_val))
    return df

In [None]:
rescaled_train_df = rescale_df(train_df, NUM_FEATURES)

preprocessed_df = pd.get_dummies(rescaled_train_df, columns= CAT_FEATURES)

display(preprocessed_df.head())
preprocessed_df.describe()