# Analysing the titanic dataset

1. Basic data cleaning and exploration
2. Exploratory data analysis
3. Model Experimentation

In [1]:
# import relevant packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from IPython.display import display, HTML

## Basic data exploration

1. Import the data
2. Look at summary statistics
3. Evaluate null values if any


In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
# Function to create scrollable table within a small window
def create_scrollable_table(df, table_id, title):
    html = f'<h3>{title}</h3>'
    html += f'<div id="{table_id}" style="height:200px; overflow:auto;">'
    html += df.to_html()
    html += '</div>'
    return html

In [4]:
df_head = train_df.head()
html_df_head = create_scrollable_table(df_head, 'df_head', 'View the structure of data in the dataframe ')

display(HTML(html_df_head))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### Now check for NULL values

In [6]:
# null values in each column
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# percentage null values

round((train_df.isna().sum() / len(train_df)) * 100, 2)

PassengerId     0.00
Survived        0.00
Pclass          0.00
Name            0.00
Sex             0.00
Age            19.87
SibSp           0.00
Parch           0.00
Ticket          0.00
Fare            0.00
Cabin          77.10
Embarked        0.22
dtype: float64

We will not use the age column and cabin column for this analysis since their proportion of null values is large

In [8]:
train_df.shape 

(891, 12)

In [9]:
# # view the distribution of the data
# fig0 =px.histogram(train_df, x='Survived', nbins=5)
# fig0.show()

## What questions do I want to ask of the data

1. How passenger class correlates with survival rate
2. How ports of embarkation correlates with survival rate
3. Distribution of male and female
4. How age affected survival 
5. How passenger class relates with cabin

In [10]:
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [11]:
def go_data(data, label, colour):
    bar_obj = go.Bar(
        x=data.index,
        y=data.values,
        name=label,
        marker=dict(color=colour),
        text=data.values,
        textposition='outside',
        textfont=dict(color='white')
    ) 
    return bar_obj

def go_bar_layout(title, xlabel, ylabel):
    layout = go.Layout(
        title=title,
        xaxis=dict(title=xlabel),
        yaxis=dict(title=ylabel),
        plot_bgcolor='rgba(34, 34, 34, 1)',
        paper_bgcolor='rgba(34, 34, 34, 1)',
        font=dict(color='white')
    )
    return layout

In [12]:
# 1. How passenger class corelates with survival

# Calculate the counts for each class and survival status
survived_counts = train_df[train_df['Survived']==1]['Pclass'].value_counts()
not_survived_counts = train_df[train_df['Survived']==0]['Pclass'].value_counts()

# Define the data as a list of traces
data = [go_data(survived_counts, 'Survived', 'green'), go_data(not_survived_counts, 'Not Survived', 'orange')]

# Define the layout
layout =go_bar_layout('Survival by Passenger Class', 'Passenger Class', 'Count')

# Create the figure object
fig1 = go.Figure(data=data, layout=layout)

# Show the figure
fig1.show()

In [13]:
# 2. How ports of embarkation correlates with survival rate

# Define the data as a list of traces
embarked_survived = train_df[train_df['Survived']==1]['Embarked'].value_counts()
embarked_not_survived = train_df[train_df['Survived']==0]['Embarked'].value_counts()

data = [go_data(embarked_survived, 'Survived', 'green'), go_data(embarked_not_survived, 'Not Survived', 'orange')]


# Define the layout
layout = go_bar_layout('Survival by port of Embarkation', 'Embarkation port', 'Count')

# Create the figure object
fig2 = go.Figure(data=data, layout=layout)

# Show the figure
fig2.show()

In [14]:
#  3. Distribution of male and female
age_df = train_df.groupby(['Age', 'Survived'], as_index=False).size()
print(age_df)

# creating the scatter plot
fig3 =px.scatter(age_df, x='Age', y='size', color='Survived', title='Relation between age and suvival', color_continuous_scale=['green', 'orange'])
fig3.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))

fig3.show()

       Age  Survived  size
0     0.42         1     1
1     0.67         1     1
2     0.75         1     2
3     0.83         1     2
4     0.92         1     1
..     ...       ...   ...
137  70.00         0     2
138  70.50         0     1
139  71.00         0     2
140  74.00         0     1
141  80.00         1     1

[142 rows x 3 columns]


In [15]:
from scipy.stats import chi2_contingency

# 5. How passenger class relates with cabin
train_df['Pclass_cat'] = train_df['Pclass'].astype('category')
train_df['Cabin'] = train_df['Cabin'].astype('category')

# Create a contingency table of the two categorical variables
cont_table = pd.crosstab(train_df['Pclass'], train_df['Cabin'])

# Perform the chi-square test
chi2, pval, dof, expected = chi2_contingency(cont_table)


print('Chi-square statistic:', chi2)
print('P-value:', pval)

# Create the heatmap

# fig, ax = plt.subplots(figsize=(20, 10))
# sns.heatmap(cont_table, cmap='Blues', ax=ax)
# plt.show()
# pclass_cabin_corr = train_df['Pclass'].corr(train_df['Cabin'])
# print(f'Correlation between passenger class and cabin: {pclass_cabin_corr}')

# # Create a scatter plot to visualize the relationship between Living Area and Sale Price
# fig4 = px.scatter(train_df, x='GrLivArea', y='SalePrice', title='Living Area (above grade) vs Sale Price', color='GrLivArea', color_continuous_scale=px.colors.sequential.Purp) 

# fig4.update_layout(plot_bgcolor='rgb(30,30,30)', paper_bgcolor='rgb(30,30,30)', font=dict(color='white'))

# fig4.show()


Chi-square statistic: 408.00000000000006
P-value: 8.266465537786716e-06


## Creating Data Pipeline 

In [16]:
# Importing useful models
from sklearn.impute import SimpleImputer

# defining numeric and categorical imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')

# Update categorical and numerical columns
categorical_columns = train_df.select_dtypes(include=['object', 'category']).columns
numerical_columns = train_df.select_dtypes(include=['int64', 'float64']).columns

# Impute categorical and numeric columns separately
df_num = num_imputer.fit_transform(train_df[numerical_columns])
df_cat = cat_imputer.fit_transform(train_df[categorical_columns])

df_num = pd.DataFrame(df_num, columns=numerical_columns)
df_cat = pd.DataFrame(df_cat, columns=categorical_columns)

# Combine the imputed categorical and numeric columns
train_imputed_df = pd.concat([df_num, df_cat], axis=1)

# Splitting features and labels
X = train_imputed_df.drop(['Survived'], axis=1)
y = train_imputed_df['Survived']
print(X.head())

   PassengerId  Pclass   Age  SibSp  Parch     Fare  \
0          1.0     3.0  22.0    1.0    0.0   7.2500   
1          2.0     1.0  38.0    1.0    0.0  71.2833   
2          3.0     3.0  26.0    0.0    0.0   7.9250   
3          4.0     1.0  35.0    1.0    0.0  53.1000   
4          5.0     3.0  35.0    0.0    0.0   8.0500   

                                                Name     Sex  \
0                            Braund, Mr. Owen Harris    male   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2                             Heikkinen, Miss. Laina  female   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   
4                           Allen, Mr. William Henry    male   

             Ticket    Cabin Embarked Pclass_cat  
0         A/5 21171  missing        S          3  
1          PC 17599      C85        C          1  
2  STON/O2. 3101282  missing        S          3  
3            113803     C123        S          1  
4            373450  missing 

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Define transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse = False))
])

In [18]:
# Update categorical and numerical columns
categorical_columns = train_df.select_dtypes(include=['object', 'category']).columns
numerical_columns = train_df.select_dtypes(include=['int64', 'float64']).columns

# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('Survived')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],remainder = 'passthrough')

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])

# Apply the pipeline to your dataset
X = train_df.drop('Survived', axis=1)
y = train_df['Survived'] 
X_preprocessed = pipeline.fit_transform(X)


`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.



### Fit and Tune parameters of model

In [19]:
# Importing useful models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

models = {
    'LinearRegression': LinearRegression(),
    'RandomForestClassifier': RandomForestClassifier(random_state=34),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=34), 
}

param_grids ={
    'LinearRegression': {},
    'RandomForestClassifier': {
        'n_estimators': [100, 200, 500, 600],
        'max_depth': [None, 10, 20, 30]
    },
    'GradientBoostingClassifier': {
        'learning_rate': [0.001, 0.01, 0.1, 0.7],
        'max_depth': [None, 10, 20, 30],
        'n_estimator': [100, 200, 500, 600],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=42)