### Import libraries

In [2]:
import pandas as pd
import numpy as np
import os

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

In [5]:
if os.name == 'nt':
    data_path = ""

else:
    data_path = "/Users/admin/_Work/Data/Practice/titanic/"

In [6]:
train = pd.read_csv(os.path.join(data_path,"train.csv"))
test = pd.read_csv(os.path.join(data_path, "test.csv"))

In [3]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [5]:
sum((train.Sex =='female'))

314

In [6]:
def missing_percentage(df, show=True):
    total = df.isnull().sum().sort_values(ascending= False)
    percent = round(total / len(df)* 100, 2)

    return_df = pd.concat([total, percent], axis = 1, keys=['Total', 'Percent'])
    if show: print(return_df)

    return return_df

def percent_value_count(df, feature):
    total = df[feature].value_counts(dropna=False)
    percent = round(df[feature].value_counts(dropna= False, normalize=True) * 100, 2)

    return_df = pd.concat([total, percent], axis=1, keys=['Total', ' Percent'])

    return return_df

_ = missing_percentage(train)
_ = missing_percentage(test)


             Total  Percent
Cabin          687    77.10
Age            177    19.87
Embarked         2     0.22
PassengerId      0     0.00
Survived         0     0.00
Pclass           0     0.00
Name             0     0.00
Sex              0     0.00
SibSp            0     0.00
Parch            0     0.00
Ticket           0     0.00
Fare             0     0.00
             Total  Percent
Cabin          327    78.23
Age             86    20.57
Fare             1     0.24
PassengerId      0     0.00
Pclass           0     0.00
Name             0     0.00
Sex              0     0.00
SibSp            0     0.00
Parch            0     0.00
Ticket           0     0.00
Embarked         0     0.00


## Start filling empty value
Begin withh: 
- Embarked
- Age
- Cabin

### Working on Embarked

In [7]:
train['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [8]:
percent_value_count(train, 'Embarked')

Unnamed: 0,Total,Percent
S,644,72.28
C,168,18.86
Q,77,8.64
,2,0.22


In [9]:
train[train.Embarked.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [10]:
fig = make_subplots(rows=1, cols=2, subplot_titles=["Training set", "Test set"])

df = [train, test]

for i in range(len(df)):
    for t in px.box(df[i].sort_values(by=['Embarked','Pclass']), x = 'Embarked', y = 'Fare', color = 'Pclass').data:
        fig.add_trace(t, row=(i//2) + 1, col=(i%2) + 1)

fig.update_layout(height=500, width=800 ,boxmode='group', yaxis_title= "Fare ($)"
).update_traces(
    showlegend=False, selector=lambda t: "Fare" not in t.hovertemplate)

fig.show()

## -> Fill missing Embraked value as C

In [11]:
train.Embarked.fillna('C', inplace=True)

## Working on Cabin

In [12]:
print(train.Cabin.isna().sum()/ len(train))
print(test.Cabin.isna().sum()/ len(test))

0.7710437710437711
0.7822966507177034


In [13]:
train.Cabin.sample(10)

836        NaN
715      F G73
773        NaN
435    B96 B98
723        NaN
710        C90
777        NaN
466        NaN
719        NaN
324        NaN
Name: Cabin, dtype: object

## Merge train and test cabin

In [14]:
train_prefix = train.drop(columns=['Survived'])
Survived = train['Survived']
all_data = pd.concat([train_prefix, test], axis=0)
all_data.Cabin.fillna('N', inplace=True)
all_data.Cabin = [i[0] for i in all_data.Cabin]
all_data.reset_index(inplace=True, drop=True)

In [15]:
all_data[889:893]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C,C
890,891,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,N,Q
891,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,N,Q
892,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,N,S


## From Pclass, Fare, Embarked to Cabin

In [16]:
not_na_cabin = all_data[all_data['Cabin'] != "N"]
na_cabin = all_data[all_data['Cabin'] == "N"]
not_na_cabin.reset_index(inplace=True, drop=True)
# print(not_na_data.loc[np.where(not_na_data['Fare'] == 0)[0]])

means = not_na_cabin.groupby("Cabin")['Fare'].mean()
std = not_na_cabin.groupby("Cabin")['Fare'].std()
upper = means + std
lower = means - std
cabins = pd.concat([std, means, lower, upper], axis=1, keys=['std', 'mean', 'lower', 'upper'])
cabins.dropna(inplace=True)
cabins

Unnamed: 0_level_0,std,mean,lower,upper
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,20.140358,41.244314,21.103956,61.384671
B,115.312993,122.383078,7.070085,237.696072
C,72.912034,107.926598,35.014564,180.838632
D,28.126283,53.007339,24.881056,81.133622
E,37.738225,54.564634,16.82641,92.302859
F,12.215124,18.079367,5.864242,30.294491
G,3.416419,14.205,10.788581,17.621419


In [17]:
def cabin_estimator(i, cabin_df):
    # Check from lower boundary
    try:
        lower_b = (i - cabin_df['lower']) > 0 # list(bool)
        mean_b = (cabin_df['mean'] - i) > 0
        upper_b = (cabin_df['upper'] - i) > 0
        
        # If in lower boundary
        in_lower = lower_b & mean_b
        # If in upper boundary
        in_upper = upper_b & ~mean_b

        # Process in lower
        if sum(in_lower) >= 1:
            close_to_mean = abs(cabin_df[in_lower]['mean'] - i)
            chosen_cabin = close_to_mean.idxmin()

        # Process out of lower
        else:
            chosen_cabin = 'G'

        # Process in upper
        if sum(in_upper) >= 1:
            close_to_mean = abs(cabin_df[in_upper]['mean'] - i)
            chosen_cabin = close_to_mean.idxmin()

        # Process out of upper
        else:
            chosen_cabin = 'B'

    except:
        print(sum(lower_b))
        print(i)

    return chosen_cabin

In [18]:
na_cabin['Cabin'] = na_cabin.Fare.apply(lambda x: cabin_estimator(x, cabins))

all_data = pd.concat([na_cabin, not_na_cabin], axis=0)
all_data.sort_values(by='PassengerId', inplace=True)
all_data.reset_index(inplace=True, drop=True)

train = all_data.loc[:890]
test = all_data.loc[890:]
train.loc[:,'Survived'] = Survived



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [19]:
missing_value = all_data[
    (all_data.Pclass == 3) &
    (all_data.Sex == 'male') &
    (all_data.Embarked =='S') &
    (all_data.Cabin == 'B')
    ].Fare.mean()
missing_value

9.300610652920966

In [20]:
test.Fare.fillna(missing_value, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [21]:
sum((train.Sex =='female') * (train.Survived == 1))

233

## Gender and Survived

In [26]:
fig = px.histogram(train, x='Sex',
                    # y ='Survived',
                    pattern_shape='Survived',
                    color='Sex',
                    barmode="relative",
                    # histnorm='probability',
                    # barnorm='percent',
                    orientation='v',
                    category_orders={'Sex':['female', 'male']})

fig2 = fig.update(barnorm='percent')

fig.show()
fig2.show()

KeyError: 'barnorm'

In [None]:
fig = px.

fig.show()