# Imports & configs

In [1]:
import warnings
from typing import Any, Dict, List, Tuple

# Data handling
import numpy as np
import pandas as pd

# Data visualizations
import seaborn as sns
import matplotlib as mpl
import plotly.express as px
import matplotlib.pyplot as plt

# Documentations
from IPython.display import Markdown, display

In [2]:
# Notebook-wide display configuration
pd.set_option('display.max_columns', 999)  # wide frames: show (practically) every column
sns.set_style('whitegrid')
warnings.simplefilter('ignore')            # silence every warning for cleaner cell output

# Data and initial lookup

Contents of `data` directory

- `sample_submission.csv`
- `test.csv`
- `train.csv`

Submission sample...

In [3]:
# Kaggle input location; for a local run the file lives at ../data/sample_submission.csv
sample_submission_path = '../input/spaceship-titanic/sample_submission.csv'
pd.read_csv(sample_submission_path).head(3)

Reading the train and test datasets

In [4]:
# Kaggle input directory; for a local run point this at '../data' instead
input_dir = '../input/spaceship-titanic'

df = pd.read_csv(input_dir + '/train.csv')
test_df = pd.read_csv(input_dir + '/test.csv')

The training dataset, with the dependent variable (`Transported`)

In [5]:
# First three rows of the training frame
df.head(3)

Test set, which has no `Transported` column (so one column fewer)

In [6]:
# First three rows of the test frame
test_df.head(3)

In [7]:
# (rows, columns) of each dataset
for label, frame in (('Train dataset', df), ('Test dataset', test_df)):
    print(label, frame.shape)

Some basic stats...

In [8]:
# Summary statistics of the training frame
df.describe()

Nulls, nulls, nulls

In [9]:
# Side-by-side missing-value maps, one panel per dataset
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

null_panels = (
    (df, 'Nulls in train dataset'),
    (test_df, 'Nulls in test dataset'),
)
for panel, (frame, title) in zip(ax, null_panels):
    sns.heatmap(frame.isna(), ax=panel)
    panel.set_title(title)

plt.show()

Percentage of nulls in different columns..

First the training dataset

In [10]:
def show_null_perc(dataframe: pd.DataFrame, dfname: str) -> None:
    '''
    Displays the share of missing values per column as a markdown table.

    Parameters
    ----------
    dataframe : frame to summarise; one table row is emitted per column.
    dfname    : label used in the heading ("<dfname> dataset Nulls").

    The "Type" column is a heuristic only: a column with more than 10
    distinct non-null values is reported as `Continuous`, anything else
    as `Categorical`.
    '''
    strings = [
        f'<big><b>{dfname} dataset Nulls</b></big>',
        '',
        '|#|Column|Percentage of nulls|Type|',
        # every delimiter cell carries at least one hyphen so the row is a
        # valid GFM table delimiter
        '|:-|:-----|:------------------|:---|'
    ]

    n_rows = len(dataframe)
    for idx, c in enumerate(dataframe.columns, start=1):
        # count the nulls directly instead of materialising a filtered copy
        # of the whole frame for every column
        n_missing = int(dataframe[c].isna().sum())
        # an empty frame has no nulls; avoid dividing by zero
        perc = round((n_missing / n_rows) * 100, 3) if n_rows else 0.0
        kind = 'Continuous' if dataframe[c].nunique() > 10 else 'Categorical'
        strings.append('|{}|{}|{}%|`{}`|'.format(idx, c, perc, kind))

    display(Markdown('\n'.join(strings)))

In [11]:
show_null_perc(dataframe=df, dfname='Train')

And the test dataset

In [12]:
show_null_perc(dataframe=test_df, dfname='Test')

# Feature engineering

## `PassengerId`

A unique Id for each passenger. Each Id takes the form `gggg_pp`, where `gggg` indicates the group the passenger is travelling with and `pp` is their number within the group. People in a group are often family members, but not always.

In [13]:
# Looking at individual columns.
# PassengerId takes the form gggg_pp: gggg indicates the group the passenger
# is travelling with and `pp` is their number within that group.
df['PassengerId'].head()

In [14]:
# `gggg_pp`: the part before the underscore is the group id (GID),
# the part after it is the passenger's number within the group (PID)
df['GID'] = [pid.split('_')[0] for pid in df['PassengerId']]
df['PID'] = [pid.split('_')[1] for pid in df['PassengerId']]

In [15]:
# Same GID / PID split for the test set
test_df['GID'] = [pid.split('_')[0] for pid in test_df['PassengerId']]
test_df['PID'] = [pid.split('_')[1] for pid in test_df['PassengerId']]

## `LastName`

In [16]:
def get_last_name(name: str) -> str:
    '''
    Returns the second space-separated token of `name` (the family name in
    "First Last"), or '' when the name is missing or has no second token.
    '''
    if pd.isna(name):
        return ''
    parts = name.split(' ')
    # a single-token (or empty) name has no surname: return '' rather than
    # raising IndexError
    return parts[1] if len(parts) > 1 else ''

In [17]:
# Surname column for both datasets (a missing name becomes '')
df['LastName'] = df['Name'].apply(get_last_name)
test_df['LastName'] = test_df['Name'].apply(get_last_name)

## `Cabin`

The cabin number where the passenger is staying. Takes the form `deck/num/side`, where side can be either `P` for Port or `S` for Starboard.

In [18]:
# Breaking cabin into deck, num & side

def _split_cabin(cabins: pd.Series):
    '''
    Splits `deck/num/side` cabin strings into three parallel lists
    (deck, num, side). A row whose cabin is missing, or which lacks one of
    the parts, gets a null in the corresponding position(s).
    '''
    # Vectorised split instead of a Python-level iterrows loop.
    # astype(object) keeps the .str accessor usable on an all-null column;
    # reindex guarantees exactly three columns whatever the split produced.
    parts = (
        cabins.astype(object)
        .str.split('/', expand=True)
        .reindex(columns=range(3))
    )
    return tuple(parts[i].tolist() for i in range(3))

# Separating the values from the training dataset
d, n, s = _split_cabin(df.Cabin)

# Separating the values from the test dataset
td, tn, ts = _split_cabin(test_df.Cabin)

In [19]:
for column, values in (('Deck', d), ('Num', n), ('Side', s)):
    df[column] = values

In [20]:
for column, values in (('Deck', td), ('Num', tn), ('Side', ts)):
    test_df[column] = values

# Handling nulls

Now before we go ahead with our feature engineering we need to make sure we handle the nulls (our other independent variables) properly.

## Imports necessary

In [82]:
from sklearn.preprocessing import (LabelEncoder, 
                                   MinMaxScaler)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import (train_test_split, 
                                     KFold, cross_val_score)
from sklearn.metrics import (accuracy_score,
                             confusion_matrix,
                             classification_report,
                             precision_score,
                             recall_score,
                             f1_score,)
from sklearn.metrics import (mean_squared_error,
                             mean_absolute_error)
import statsmodels.api as sm
from typing import List, Dict

## Writing the imputation methods

Creating a dictionary containing label encoders, for all the columns we'd have to encode

In [22]:
def get_encoder_dictionary(df: pd.DataFrame, test_df: pd.DataFrame, encode_cols: List[str], **kwargs) -> Dict[str, LabelEncoder]:
    '''
    Fits one LabelEncoder per column named in `encode_cols`.

    Each encoder is fitted on the non-null values of the column in `df`,
    plus the non-null values of the same column in `test_df` whenever that
    column exists there, so labels that only occur in the test set can
    still be transformed later. `kwargs` are forwarded to the LabelEncoder
    constructor.

    Returns a dictionary mapping column name -> fitted encoder.
    '''
    encoders: Dict[str, LabelEncoder] = {}

    for column in encode_cols:
        encoder = LabelEncoder(**kwargs)
        non_null_values: List[Any] = df[column].dropna().tolist()
        # Columns missing from the test frame (the `Transported` target in
        # this notebook) are fitted on the training values alone; checking
        # for the column avoids hard-coding the target's name.
        if column in test_df.columns:
            non_null_values.extend(test_df[column].dropna().tolist())
        encoder.fit(non_null_values)
        encoders[column] = encoder

    return encoders

Let's use the method to make sure everything is working as we want it to

In [23]:
# Columns we'd like to encode
encode_cols = (
    'HomePlanet', 'CryoSleep', 'Destination',
    'VIP', 'Transported', 'Deck', 'Side',
    'LastName', 'GID',
)

encoders = get_encoder_dictionary(df=df, test_df=test_df, encode_cols=encode_cols)

In [24]:
# Column name -> fitted LabelEncoder
encoders

Now for the method that'll impute the missing values. Note that we're imputing values of the independent variables, so the performance of these helper models matters less than that of the final model. Still, to see how they are doing, we'll include some visualizations as well.

After analyzing the types of missing values there are usually 3 main ways to impute missing values in a dataset.

- Imputing with central tendencies (mean, median, mode etc)
- Dropping the nulls (Dropping the rows containing nulls outright)
- Predicting the values of the nulls using other independent variables in the dataset.

Usually the third one is not seen that often in the wild since it is a very involved process. However, a model that sees all the associations will generally predict better values for the nulls than we would get from manual conditional imputations.

In [25]:
def test_model(model, X, y, ax, target, type_):
    '''
    Fits `model` on an 85/15 hold-out split of (X, y) and draws a
    diagnostic for the held-out rows on `ax`.

    type_ == 'C' : annotated confusion-matrix heatmap, accuracy in the title
    type_ == 'R' : QQ-plot of the residuals, RMSE and MAE in the title
    Any other `type_` fits and predicts but draws nothing.
    '''
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.15, random_state=42
    )
    model.fit(X_fit, y_fit)
    holdout_preds = model.predict(X_holdout)

    if type_ == 'C':
        sns.heatmap(
            confusion_matrix(y_holdout, holdout_preds),
            annot=True, fmt='.0f', ax=ax,
            linewidth=2, cbar=False,
            annot_kws={'size': 15}
        )
        accuracy = round(accuracy_score(y_holdout, holdout_preds), 2)
        ax.set_title(
            '[{}] Confusion-matrix (Accuracy: {})'.format(target, accuracy),
            fontsize=25
        )
        return

    if type_ == 'R':
        rmse = round(np.sqrt(mean_squared_error(y_holdout, holdout_preds)), 2)
        mae = round(mean_absolute_error(y_holdout, holdout_preds), 2)
        residuals = y_holdout - holdout_preds
        sm.qqplot(residuals, fit=True, line='45', ax=ax, markersize=10)
        ax.set_title(
            '[{}] QQ-Plot (RMSE: {} - MAE: {})'.format(target, rmse, mae),
            fontsize=25
        )

In [26]:
def impute_with_model(
    df: pd.DataFrame,
    test_df: pd.DataFrame,
    target: str,
    encoders: Dict[str, LabelEncoder],
    exclude_cols: List[str],
    type_: str='C',
    test: bool=True,
    ax=None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    '''
    Fills the nulls of `target` in both frames with model predictions.

    A model is trained on the fully populated rows of `df` (every column
    not listed in `exclude_cols` is a feature) and used to predict `target`
    wherever it is missing and all features are present. Rows that cannot
    be predicted because a feature is missing fall back to the frame's own
    mode (classification) or median (regression).

    Parameters
    ----------
    df, test_df  : train / test frames; both are modified in place.
    target       : name of the column to impute.
    encoders     : column -> fitted LabelEncoder, applied to every feature
                   (and to the target) that has an entry.
    exclude_cols : columns that must not be used as features.
    type_        : 'C' trains a DecisionTreeClassifier, anything else a
                   LinearRegression.
    test, ax     : when `test` is true and an axis is given, the hold-out
                   diagnostic of `test_model` is drawn on `ax`.

    Returns
    -------
    The same `(df, test_df)` objects that were passed in.
    '''

    # Exit condition: nothing to impute in either frame
    if not df[target].isna().any() and not test_df[target].isna().any():
        return df, test_df

    # Seeded so the imputed values are reproducible across runs
    model = DecisionTreeClassifier(random_state=42) if type_ == 'C' else LinearRegression()

    # Columns that're to be used as features for this model
    feature_cols = [c for c in df.columns if c not in exclude_cols and c != target]

    # Training rows: features and target all present
    train_rows = df[feature_cols + [target]].dropna()
    for c in train_rows.columns:
        if c in encoders:
            train_rows[c] = encoders[c].transform(train_rows[c])

    X, y = train_rows[feature_cols], train_rows[target]

    if test and ax is not None:
        test_model(model, X, y, ax, target, type_)

    # Final training with every complete row
    model.fit(X, y)

    for frame in (df, test_df):
        # Rows to predict: target missing, every feature present
        to_fill = frame.loc[frame[target].isna(), feature_cols].dropna()
        if to_fill.empty:
            # nothing predictable here; predicting on zero rows would raise
            continue

        for column in feature_cols:
            if column in encoders:
                to_fill[column] = encoders[column].transform(to_fill[column])

        preds = model.predict(to_fill)
        if type_ == 'C':
            # back from integer codes to the original labels
            preds = encoders[target].inverse_transform(preds)
        # For regression the raw predictions are the imputed values.
        # NOTE(review): a linear model's output is unbounded, so e.g. a
        # negative spend or age is possible - clip downstream if that matters.

        # One vectorised write-back instead of a boolean scan per row
        frame.loc[to_fill.index, target] = preds

    # Fallback for rows that could not be predicted (a feature was missing)
    for frame in (df, test_df):
        still_missing = frame[target].isna()
        if still_missing.any():
            fallback = frame[target].mode().iloc[0] if type_ == 'C' else frame[target].median()
            frame.loc[still_missing, target] = fallback

    return df, test_df

In [27]:
# Columns to exclude (passed to impute_with_model as `exclude_cols`)

exclude_columns = [
    'PassengerId', 'Cabin', 'PID',
    'Name', 'Transported', 'GID',
]

In [28]:
# Column name -> fitted LabelEncoder (handed to impute_with_model below)
encoders

In [29]:
NROWS, NCOLS = 5, 2
_, ax = plt.subplots(nrows=NROWS, ncols=NCOLS, figsize=(20, 35))

# (column, model type): 'C' -> classifier, 'R' -> regressor
cols_to_impute = [
    ('HomePlanet', 'C'), ('CryoSleep', 'C'), ('Destination', 'C'),
    ('Age', 'R'), ('VIP', 'C'), ('RoomService', 'R'), ('FoodCourt', 'R'), 
    ('ShoppingMall', 'R'), ('Spa', 'R'), ('VRDeck', 'R')
]

# ax.flat walks the grid row by row, so the k-th column is drawn on the
# k-th panel. Columns are imputed in order, so later models see the
# already-filled values of earlier columns.
for (col, type_), panel in zip(cols_to_impute, ax.flat):
    df, test_df = impute_with_model(
        df, test_df,
        col,
        encoders,
        exclude_columns,
        type_=type_,
        test=True,
        ax=panel
    )

plt.suptitle('Imputation model performances', x=.5, y=1.02, fontsize=30)
plt.tight_layout()
plt.show()

In [31]:
# Cabin parts that are still missing get the column's most frequent value
for cabin_col in ('Deck', 'Num', 'Side'):
    df.loc[df[cabin_col].isna(), cabin_col] = df[cabin_col].mode().iloc[0]

In [32]:
# Same for the test set, using the test set's own most frequent values
for cabin_col in ('Deck', 'Num', 'Side'):
    test_df.loc[test_df[cabin_col].isna(), cabin_col] = test_df[cabin_col].mode().iloc[0]

In [33]:
# Missing-value maps again, to check what the imputation left behind
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))

for panel, frame, title in (
    (ax[0], df, 'Nulls in train dataset'),
    (ax[1], test_df, 'Nulls in test dataset'),
):
    sns.heatmap(frame.isna(), ax=panel)
    panel.set_title(title, fontsize=30)
    panel.set_xticklabels(panel.get_xticklabels(), fontsize=20)


plt.show()

# Feature engineering

## Creating new attribute `TotalSpent`

Using the attributes `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` to compute the `TotalSpent` values

In [34]:
# Amenity bills that together make up a passenger's total spend
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Vectorised row sum instead of a Python-level apply over every row.
# skipna=False keeps the semantics of the built-in sum(): a missing bill
# makes the total missing instead of silently counting as 0.
df['TotalSpent'] = df[spend_cols].sum(axis=1, skipna=False)

test_df['TotalSpent'] = test_df[spend_cols].sum(axis=1, skipna=False)

In [35]:
# First three rows, now including the engineered columns
df.head(3)

# Encoding the categorical columns

In [36]:
# Replace every column that has an encoder with its integer codes
for attribute, encoder in encoders.items():
    df[attribute] = encoder.transform(df[attribute])

In [37]:
# Same for the test set; the `Transported` target is skipped here
for attribute, encoder in encoders.items():
    if attribute != 'Transported':
        test_df[attribute] = encoder.transform(test_df[attribute])

# Correlation

In [38]:
# Restrict to numeric/bool columns explicitly: since pandas 2.0 `corr()` no
# longer drops non-numeric columns silently and raises on strings instead.
df_corr = df.select_dtypes(include=['number', 'bool']).corr()

In [39]:
# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# the lower triangle already carries every pairwise correlation.
mask = np.triu(np.ones(df_corr.shape, dtype=bool))

plt.figure(figsize=(10, 8))
# Reuse the matrix computed above, so the mask is guaranteed to match its shape
sns.heatmap(df_corr, square=True, linewidths=0, vmax=.3, annot=True, mask=mask)

plt.title('Correlation among the attributes')
plt.show()

In [40]:
# Correlation of every attribute with the target (its self-correlation removed)
plt.figure(figsize=(10, 6))
target_corr = df_corr['Transported'].drop('Transported')
target_corr.plot(kind='bar')
plt.show()

# Final preprocessing

In [61]:
# Raw identifier / text columns that are not fed to the model
drop_cols = ['PassengerId', 'Cabin', 'Name',]

# Features and target from the training frame
X = df.drop(columns=drop_cols + ['Transported']).copy(deep=True)
y = df['Transported']

# Matching feature frame for the test set
test = test_df.drop(columns=drop_cols).copy(deep=True)

## Scaling the dataframes

In [91]:
# Fit the scaler on the labelled features, then apply it to both frames
scaler = MinMaxScaler().fit(X)

# Scaling the dataframes
feature_names = X.columns
X = pd.DataFrame(scaler.transform(X), columns=feature_names)
test = pd.DataFrame(scaler.transform(test), columns=feature_names)

## Train test split

In [92]:
# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
)

X_train.shape, y_test.shape, test.shape

# Methods

In [93]:
# Method

# One [model_class, accuracy, f1, msg] row is appended per getModelReport call
performances = []

def show_confusion_matrix(conf_matrix, **kwargs):
    '''
    Draws `conf_matrix` as an annotated heatmap on a new 7x7 figure.

    Extra keyword arguments are forwarded to `sns.heatmap`. Nothing is
    shown or returned; the caller adds a title and calls `plt.show()`.
    '''
    # Enlarge the fonts only while drawing. Calling sns.set(font_scale=...)
    # would reset the whole seaborn theme and discard the notebook-wide
    # style chosen in the configuration cell.
    with sns.plotting_context('notebook', font_scale=1.7):
        plt.figure(figsize=(7, 7))
        sns.heatmap(conf_matrix, annot=True, cbar=False, fmt='.0f', **kwargs)

def getModelReport(model, X_train, y_train, X_test, y_test, msg=np.nan):
    '''
    Prints and plots an evaluation report for an already fitted classifier.

    Parameters
    ----------
    model            : fitted estimator; scored on (X_test, y_test).
    X_train, y_train : used for the 10-fold cross-validation score only
                       (cross_val_score refits clones of `model`).
    X_test, y_test   : hold-out rows the metrics are computed on.
    msg              : free-form note stored alongside the scores.

    Side effects: appends [model_class, accuracy, f1, msg] to the global
    `performances` list, prints the report and shows the confusion matrix.
    '''

    preds = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred) in that order. Swapping them
    # leaves accuracy unchanged but transposes the confusion matrix and
    # exchanges precision and recall in the classification report.
    accuracy = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    conf_matrix = confusion_matrix(y_test, preds)
    clf_report = classification_report(y_test, preds)
    model_class = model.__class__.__name__
    
    # Saving the performances
    performances.append(
        [model_class, accuracy, f1, msg]
    )
    
    # Creating Cross validation scores
    kfold_vc = KFold(n_splits=10)
    results = cross_val_score(model, X_train, y_train, cv=kfold_vc)
    
    print('*'*60)
    print('{}'.format(model_class).rjust(40, ' '))
    print('*'*60)
    print('ACCURACY K-Fold:', results.mean())
    print('ACCURACY:', accuracy)
    print('F1 SCORE:', f1)
    print('CLASSIFICATION REPORT: ')
    print(clf_report)
    print('*'*60)
    
    show_confusion_matrix(conf_matrix)
    # confusion_matrix(y_true, y_pred): rows are true labels, columns predictions
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion matrix of {}'.format(model_class))
    plt.show()

# GradientBoostingClassifier

In [94]:
from sklearn.ensemble import GradientBoostingClassifier

In [95]:
# Default-parameter gradient boosting, fitted on the training split
grad = GradientBoostingClassifier().fit(X_train, y_train)

In [96]:
getModelReport(
    model=grad,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)

# Submission

Training with the full dataset

In [97]:
# Refit on every labelled row (hold-out split included) before predicting the test set
grad = grad.fit(X, y)

In [98]:
# Decode the encoded predictions back to the original labels before writing the file
predicted_codes = grad.predict(test)
test_df['Transported'] = encoders['Transported'].inverse_transform(predicted_codes)

submission = test_df[['PassengerId', 'Transported']]
submission.to_csv('submission.csv', index=False)