# Presets

In [155]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from IPython.display import display, HTML
 

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

In [121]:
class DataFrameExtended(pd.DataFrame):
    def basic_descriptives(self):
        display(HTML("<p>Head:</p>"))
        display(HTML(self.head().to_html()))
        display(HTML("<p>Summary stats:</p>"))
        display(HTML(self.describe().to_html()))
        print(
            "\nNrows: ", self.shape[0], "\n",
            "\nNcols: ", self.shape[1], "\n",
            "\nData types:\n", self.dtypes.to_string(), "\n",
            sep=''
        )
    def show_basic_plots(self, vars_subset=None):
        # Presets:
        if vars_subset is None:
            vars_subset = self.columns.tolist()
    
        color_background = '#F5F5F5'
        color_gridlines = '#DCDCDC'
        colors_in_use = ['#2C3E50', '#537EA2', '#858F84', '#42A593',
                         '#873E23', '#CFD1A1', '#6A744F', '#BDBDC5',
                         '#7EA253', '#EDB676', '#C26D40']+px.colors.qualitative.Safe
    
        dtypes_num = ['int64', 'int32', 'int16', 'float64', 'float32', 'float16']
        dtypes_str = ['object', 'category']
        vars_num = self.loc[:, vars_subset].select_dtypes(include=dtypes_num).columns
        vars_str = self.loc[:, vars_subset].select_dtypes(include=dtypes_str).columns
    
        # For categorical variables:
        fig_str_lst = [None]*len(vars_str)
        for i in range(0, len(vars_str)):
            fig = go.Figure()
            fig.add_trace(go.Histogram(x=self.loc[:, vars_str[i]],
                                       name=vars_str[i],
                                       showlegend=True))
            fig.update_traces(marker_color=colors_in_use[0],
                              marker_line_color='rgb(8,48,107)',
                              marker_line_width=1.5,
                              opacity=0.8)
            fig.update_layout(xaxis_type='category',
                              xaxis_title=vars_str[i],
                              paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines)
            fig.update_xaxes(linecolor=color_gridlines)
            fig_str_lst[i] = fig
        
        # For 2 categorical variables:
        fig_str_lst_mix = []
        for i in range(0, len(vars_str)):
            for j in range(0, len(vars_str)):
                if i == j:
                    continue
                else:
                    fig = px.histogram(self, x=vars_str[i], color=vars_str[j],
                                       color_discrete_sequence=colors_in_use[1:])
                    fig.update_traces(marker_line_color='rgb(8,48,107)',
                                      marker_line_width=1.5,
                                      opacity=0.8)
                    fig.update_layout(xaxis_type='category',
                                      xaxis_title=vars_str[i],
                                      paper_bgcolor=color_background,
                                      plot_bgcolor=color_background)
                    fig.update_yaxes(gridcolor=color_gridlines, title='')
                    fig.update_xaxes(linecolor=color_gridlines)
                    fig_str_lst_mix.append(fig)
            
        # For numerical variables:
        fig_num_lst_dist = [None]*len(vars_num)
        for i in range(0, len(vars_num)):
            fig = ff.create_distplot(hist_data=[self[vars_num[i]].dropna()],
                                     group_labels=[vars_num[i]],
                                     show_hist=False,
                                     show_rug=False,
                                     colors = colors_in_use)
            fig.update_layout(paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines,
                             zerolinecolor=color_gridlines,
                             title='')
            fig.update_xaxes(gridcolor=color_gridlines,
                             title=vars_num[i])
            fig_num_lst_dist[i] = fig
        
        fig_num_lst_violin = [None]*len(vars_num)
        for i in range(0, len(vars_num)):
            fig = px.violin(self, y=vars_num[i], box=True, points='outliers')
            fig.update_traces(marker_color = colors_in_use[0],
                              opacity=0.8,
                              name=vars_num[i],
                              showlegend=True)
            fig.update_layout(paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines,
                             zerolinecolor=color_gridlines,
                             title='')
            fig.update_xaxes(title=vars_num[i])
            fig_num_lst_violin[i] = fig
    
        # For 2 numerical variables:
        fig_num_lst_mix = []
        fig_num_lst_scat = []
        for i in range(0, len(vars_num)):
            for j in range(0, len(vars_num)):
                if i == j:
                    continue
                else:
                    fig = go.Figure()
                    fig.add_trace(go.Histogram2dContour(x = self[vars_num[i]],
                                                        y = self[vars_num[j]],
                                                        colorscale='deep'))
                    fig.update_layout(paper_bgcolor=color_background,
                                      plot_bgcolor=color_background)
                    fig.update_yaxes(title=vars_num[j])
                    fig.update_xaxes(title=vars_num[i])
                    fig_num_lst_mix.append(fig)
                
                    fig = px.scatter(data,x=data[vars_num[i]],
                                     y=data[vars_num[j]],
                                     trendline='ols')
                    fig.update_layout(paper_bgcolor=color_background,
                                      plot_bgcolor=color_background)
                    fig.update_traces(marker_color=colors_in_use[0],
                                      opacity=0.8)
                    fig.update_yaxes(gridcolor=color_gridlines,
                                     zerolinecolor=color_gridlines,
                                     title=vars_num[j])
                    fig.update_xaxes(gridcolor=color_gridlines,
                                     zerolinecolor=color_gridlines,
                                     title=vars_num[i])
                    fig_num_lst_scat.append(fig)

        # For 1 categorical, 2 numeric: -- to do
        # For 2 categorical, 1 numeric: -- to do
        # For 2 categorical, 2 numeric: --- to do
        
        # 1 categorical, 1 numeric:
        fig_all_violin_mix = []
        for i in range(0, len(vars_str)):
            for j in range(0, len(vars_num)):
                fig = px.violin(self, y=vars_num[j], color=vars_str[i],
                                color_discrete_sequence=colors_in_use,
                                box=True, points='outliers')
                fig.update_traces(opacity=0.8)
                fig.update_layout(xaxis_title=vars_num[j],
                                  showlegend=True,
                                  paper_bgcolor=color_background,
                                  plot_bgcolor=color_background)
                fig.update_yaxes(gridcolor=color_gridlines, title='')
                fig.update_xaxes(linecolor=color_gridlines)
                fig_all_violin_mix.append(fig)
            
        final_dict = {
            # categorical+target
            'dist_cat': fig_str_lst,
            'dist_mlt_cat': fig_str_lst_mix,
            # numeric+target
            'dist_num': fig_num_lst_dist,
            'violin_num': fig_num_lst_violin,
            'scatter_num': fig_num_lst_scat,
            'dist_mlt_num': fig_num_lst_mix,
            # all
            'violin_mix': fig_all_violin_mix
            #'scatter_mix': None,
            #'dist_mlt_mix': None,
        }
        return(final_dict)
    def corr_heatmap(self):
        color_background = '#F5F5F5'
        color_gridlines = '#DCDCDC'
        
        fig = px.imshow(self.corr().round(3), text_auto=True, color_continuous_scale='deep')
        fig.update_traces(opacity=0.8)
        fig.update_layout(coloraxis_showscale=False,
                          paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines, title='')
        fig.update_xaxes(linecolor=color_gridlines)
        return(fig)

In [79]:
data = pd.read_csv('train.csv')
data = DataFrameExtended(data)

# Initial data exploration

In [5]:
data.basic_descriptives()  

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292



Nrows: 891

Ncols: 12

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object



In [6]:
tmp = data.show_basic_plots()

In [12]:
print(tmp.keys()) # options what to show

dict_keys(['dist_cat', 'dist_mlt_cat', 'dist_num', 'violin_num', 'scatter_num', 'dist_mlt_num', 'dist_mix', 'violin_mix', 'bar_mix', 'scatter_mix', 'dist_mlt_mix', 'heatmap'])


In [19]:
for i in tmp['dist_num']: 
    i.show()

In [None]:
fig.update_traces(marker_color=colors_in_use[0],
                        marker_line_color='rgb(8,48,107)',
                          marker_line_width=1.5,
                          opacity=0.8)
    fig.update_layout(xaxis_type='category',
                      xaxis_title=vars_str[i],
                          paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines)
        fig.update_xaxes(linecolor=color_gridlines)
        fig_str_lst[i] = fig

Main technical clues here:
- Name, PassengerId are rather unimportant as predictors (or at least it is hard to see some reasons to think so)
- Pclass, Survided, Parch, SibSp are actually character variables
- Distribuion of Cabins and Tickets are almost unimodal

From the analytical perspective:
- Most of passangers embarked in S (Southampton)
- There were more men than women on the board
- Fare have some big outliers
- Age have a lot of missings

So, recode & drop to explore better:

In [17]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# EDA continuation

In [122]:
data_cleared = data.copy()
data_cleared = DataFrameExtended(data_cleared)
data_cleared['Survived'] = data_cleared.Survived.astype(str)
data_cleared['Pclass'] = data_cleared.Pclass.astype(str)
data_cleared['SibSp'] = data_cleared.SibSp.astype(str)
data_cleared['Parch'] = data_cleared.Parch.astype(str)
data_cleared.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'], inplace=True)

In [23]:
data_cleared.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [114]:
data_cleared.corr_heatmap()

In [123]:
data_corr = data_cleared
data_corr = DataFrameExtended(data_corr)
data_corr['Survived'] = data_corr.Survived.astype(np.int64)
data_corr['Pclass'] = data_corr.Pclass.astype(np.int64)
data_corr['SibSp'] = data_corr.SibSp.astype(np.int64)
data_corr['Parch'] = data_corr.Parch.astype(np.int64)
data_corr.corr_heatmap()

In [103]:
data_cleared_plots = data_cleared.show_basic_plots()
print(data_cleared_plots.keys())

dict_keys(['dist_cat', 'dist_mlt_cat', 'dist_num', 'violin_num', 'scatter_num', 'dist_mlt_num', 'violin_mix', 'bar_mix', 'scatter_mix', 'dist_mlt_mix', 'heatmap'])


In [104]:
for i in data_cleared_plots['violin_mix']: 
    i.show()

Main clues:
- Most passengers travelled 3rd class
- Mostly, passengers travelled alone (no siblings, spouces) or maximum with 1 spouce/child. Other examples are rather outliers that (maybe) can be grouped
- Among the survived, the proportion of women was much bigger. Moreover, the proportion of survation between Pclass also differs
- Fare/Age distribution almost uniform
- Fare of people survived was bigger
- More uniform distribution of Age in the 1st class, mean Age is also upper
- Numeric variables are not correlated between each other
- Parch & SibSp are pretty much correlated

# Features preparation

In [145]:
dtypes_num = ['int64', 'int32', 'int16', 'float64', 'float32', 'float16']
dtypes_str = ['object', 'category']
numeric_features = data_cleared.select_dtypes(include=dtypes_num).columns.to_list()
categorical_features = data_cleared.select_dtypes(include=dtypes_str).columns.to_list()

In [158]:
print(numeric_features)
print(categorical_features)

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Sex', 'Embarked']


In [150]:
# add 0-1 column indicating NA in Age -- as NA may potentially indicate not survived only
data_cleared['AgeNA'] = np.where(np.isnan(data_cleared['Age']), 1, 0)

In [153]:
# Standardize and impute NA with median, one-hot, unite for all
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [None]:
X = data_cleared.drop(columns=['Survived'])
y = data_cleared['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
X_train.head()

In [None]:
print(y_train)

# Feature importance

In [126]:
model_rf = RandomForestRegressor(random_state=0)
model_rf.fit(X_train, y_train)

ValueError: could not convert string to float: 'male'

In [None]:
f_i = list(zip(features,rf.feature_importances_))
f_i.sort(key = lambda x : x[1])
plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
plt.show()

# Train-test aproach for basic models

In [None]:
# logit
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

In [None]:
# decision tree
# random forests
# xgboost
# bart
# SVM
# 

# Cross-validation approach for basic models

# Outliers detection

# Stacking (ensembling)

# Neural networks

# Final model and prediction on new data (for Kaggle submission)