**In this notebook, I'll be tackling the Spaceship Titanic problem, which involves predicting which passengers were transported to an alternate dimension after the spaceship collided with a spacetime anomaly. The goal is to help rescue crews and retrieve the lost passengers, using records recovered from the spaceship's damaged computer system.**

# **<span style="color:#4B0082;">Library import, data reading and basic data information</span>**

In [320]:
import time
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from fancyimpute import KNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from lazypredict.Supervised import LazyClassifier
from sklearn.ensemble import VotingClassifier

In [321]:
import warnings
warnings.filterwarnings('ignore')

In [322]:
sns.set()
sns.set(rc = {'figure.figsize':(20,10)})
# sns.set_palette("PiYG")
sns.set_style("whitegrid")
colors = [matplotlib.colors.to_hex(sns.color_palette("PiYG")[0]), matplotlib.colors.to_hex(sns.color_palette("PiYG")[-1]), '#ffce30', '#990099', '#ff9900', '#0000ff']
my_palette = sns.color_palette(colors)
sns.set_palette(my_palette)
my_palette

In [538]:
train_data = pd.read_csv('input/train.csv')
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [539]:
test_data = pd.read_csv('input/test.csv')
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [548]:
train_data['train'] = True
test_data['train'] = False
data = pd.concat([train_data, test_data], ignore_index = True)
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,train
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.000000000,False,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,Maham Ofracculy,False,True
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.000000000,False,109.000000000,9.000000000,25.000000000,549.000000000,44.000000000,Juanna Vines,True,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.000000000,True,43.000000000,3576.000000000,0.000000000,6715.000000000,49.000000000,Altark Susent,False,True
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.000000000,False,0.000000000,1283.000000000,371.000000000,3329.000000000,193.000000000,Solam Susent,False,True
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.000000000,False,303.000000000,70.000000000,151.000000000,565.000000000,2.000000000,Willy Santantines,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.000000000,False,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,Jeron Peter,,False
12966,9269_01,Earth,False,,TRAPPIST-1e,42.000000000,False,0.000000000,847.000000000,17.000000000,10.000000000,144.000000000,Matty Scheron,,False
12967,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.000000000,0.000000000,0.000000000,0.000000000,0.000000000,Jayrin Pore,,False
12968,9273_01,Europa,False,D/297/P,,,False,0.000000000,2680.000000000,0.000000000,0.000000000,523.000000000,Kitakan Conale,,False


In [549]:
data['Group'] = data['PassengerId'].str.split('_').str[0].astype(int)
data['Group_size']=data['Group'].apply(lambda x: data[data['Group']==x].shape[0])

data.loc[data['Name'].notna(), 'Surname']=data.loc[data['Name'].notna(), 'Name'].str.split().str[-1]
data.loc[data['Surname'].notna(), 'Family_size']=data['Surname'].loc[data['Surname'].notna()].apply(lambda x: data[data['Surname']==x].shape[0])
data['Family_size'] = data['Family_size'].astype('Int64')

data.loc[data['Cabin'].notna(), 'Cabin_deck'] = data.loc[data['Cabin'].notna(), 'Cabin'].str.split('/').str[0]
data.loc[data['Cabin'].notna(), 'Cabin_number'] = data.loc[data['Cabin'].notna(), 'Cabin'].str.split('/').str[1].astype(int)
data.loc[data['Cabin'].notna(), 'Cabin_side'] = data.loc[data['Cabin'].notna(), 'Cabin'].str.split('/').str[-1]
data.loc[data['Cabin_deck'].notna(), 'Cabin_deck_size'] = data['Cabin_deck'].loc[data['Cabin_deck'].notna()].apply(lambda x: data[data['Cabin_deck']==x].shape[0])
data['Cabin_deck_size'] = data['Cabin_deck_size'].astype('Int64')
data.loc[data['Cabin_number'].notna(), 'Cabin_number_size'] = data.loc[data['Cabin_deck'].notna()].apply(lambda row: data[(data['Cabin_deck'] == row['Cabin_deck']) & (data['Cabin_number'] == row['Cabin_number'])].shape[0], axis=1)
data['Cabin_number_size'] = data['Cabin_number_size'].astype('Int64')

data.drop(columns=['Cabin', 'Name'], inplace=True)

data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,train,Group,Group_size,Surname,Family_size,Cabin_deck,Cabin_number,Cabin_side,Cabin_deck_size,Cabin_number_size
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,...,True,1,1,Ofracculy,3,B,0.0,P,1141,3
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,...,True,2,1,Vines,4,F,0.0,S,4239,2
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,...,True,3,2,Susent,7,A,0.0,S,354,4
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,...,True,3,2,Susent,7,A,0.0,S,354,4
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,...,True,4,1,Santantines,9,F,1.0,S,4239,2


In [550]:
data['TotalCosts'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall']  + data['Spa'] + data['VRDeck']

In [543]:
columns_for_grouping = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group', 'Surname', 'Cabin_deck', 'Cabin_number', 'Cabin_side', 'TotalCosts']
couples_to_group = []
for i in range(len(columns_for_grouping)):
    for j in range(i+1, len(columns_for_grouping)):
        couples_to_group.append([columns_for_grouping[i], columns_for_grouping[j]])
len(couples_to_group)

120

In [545]:
all_unique = []
not_all_unique = []
counter = 0
for index in range(len(couples_to_group)):
    grouped_data = data.groupby([couples_to_group[index][0],couples_to_group[index][1]])[couples_to_group[index][0]].size().unstack().fillna(0)
    if (len((grouped_data>0).sum(axis=1).unique())==1) and ((grouped_data>0).sum(axis=1).unique()[0]==1):
        dict_groupes = {}
        for column in grouped_data.columns:
            dict_groupes[column] = list(grouped_data[grouped_data[column]>0].index)
        all_unique.append([couples_to_group[index][::-1], dict_groupes])
    elif (len((grouped_data>0).sum(axis=0).unique())==1) and ((grouped_data>0).sum(axis=0).unique()[0]==1):
        dict_groupes = {}
        grouped_data = grouped_data.T
        for column in grouped_data.columns:
            dict_groupes[column] = list(grouped_data[grouped_data[column]>0].index)
        all_unique.append([couples_to_group[index], dict_groupes])
    else:
        if 1 in (grouped_data>0).sum(axis=1).unique():
            dict_groupes = {}
            grouped_data = grouped_data[(grouped_data>0).sum(axis=1) == 1]
            for column in grouped_data.columns:
                if len(list(grouped_data[grouped_data[column]>0].index))>0:
                    dict_groupes[column] = list(grouped_data[grouped_data[column]>0].index)
            not_all_unique.append([couples_to_group[index][::-1], dict_groupes])
        elif 1 in (grouped_data>0).sum(axis=0).unique():
            grouped_data = grouped_data.T
            dict_groupes = {}
            grouped_data = grouped_data[(grouped_data>0).sum(axis=1) == 1]
            for column in grouped_data.columns:
                if len(list(grouped_data[grouped_data[column]>0].index))>0:
                    dict_groupes[column] = list(grouped_data[grouped_data[column]>0].index)
            not_all_unique.append([couples_to_group[index], dict_groupes])
print(len(not_all_unique))
len(all_unique)

104


3

In [551]:
for two_columns in all_unique:
    old_amount = data[two_columns[0][0]].isna().sum()
    for key in two_columns[1].keys():
        data.loc[(data[two_columns[0][0]].isna())&(data[two_columns[0][1]].isin(two_columns[1][key])), two_columns[0][0]] = key
    new_amount = data[two_columns[0][0]].isna().sum()
    if new_amount < old_amount:
        print(f'> Amount of missing values in "{two_columns[0][0]}" column before filling by "{two_columns[0][1]}" column = {old_amount}')
        print(f'After = {new_amount}')

for two_columns in not_all_unique[::-1]:
    old_amount = data[two_columns[0][0]].isna().sum()
    for key in two_columns[1].keys():
        data.loc[(data[two_columns[0][0]].isna())&(data[two_columns[0][1]].isin(two_columns[1][key])), two_columns[0][0]] = key
    new_amount = data[two_columns[0][0]].isna().sum()
    if new_amount < old_amount:
        print(f'> Amount of missing values in "{two_columns[0][0]}" column before filling by "{two_columns[0][1]}" column = {old_amount}')
        print(f'After = {new_amount}')

> Amount of missing values in "HomePlanet" column before filling by "Group" column = 288
After = 157
> Amount of missing values in "HomePlanet" column before filling by "Surname" column = 157
After = 13
> Amount of missing values in "Cabin_side" column before filling by "Group" column = 299
After = 162
> Amount of missing values in "Cabin_side" column before filling by "TotalCosts" column = 162
After = 148
> Amount of missing values in "TotalCosts" column before filling by "Cabin_number" column = 1363
After = 1290
> Amount of missing values in "Cabin_deck" column before filling by "TotalCosts" column = 299
After = 226
> Amount of missing values in "TotalCosts" column before filling by "Surname" column = 1290
After = 1179
> Amount of missing values in "Cabin_side" column before filling by "Surname" column = 148
After = 102
> Amount of missing values in "Cabin_number" column before filling by "Surname" column = 299
After = 270
> Amount of missing values in "Cabin_deck" column before fill

In [552]:
data_original = data.copy()
imputer = IterativeImputer(random_state = 18)
columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
imputed = imputer.fit_transform(data[columns])
data[columns] = pd.DataFrame(np.round(imputed), columns=columns)
data['TotalCosts'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall']  + data['Spa'] + data['VRDeck']

In [555]:
encoder = OrdinalEncoder()
imputer = IterativeImputer(random_state = 18)

cat_cols = ['HomePlanet','CryoSleep','Destination','VIP','Surname', 'Cabin_deck', 'Cabin_side']

def encode(data):
    nonulls = np.array(data.dropna())
    impute_reshape = nonulls.reshape(-1,1)
    impute_ordinal = encoder.fit_transform(impute_reshape)
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data
for columns in cat_cols:
    encode(data[columns])

In [556]:
data = pd.DataFrame(np.round(imputer.fit_transform(data)),columns = data.columns)
data['PassengerId'] = data_original['PassengerId']

In [557]:
data.loc[data['Surname'].notna(), 'Family_size']=data['Surname'].loc[data['Surname'].notna()].apply(lambda x: data[data['Surname']==x].shape[0])
data['Family_size'] = data['Family_size'].astype('int64')
data.loc[data['Cabin_deck'].notna(), 'Cabin_deck_size'] = data['Cabin_deck'].loc[data['Cabin_deck'].notna()].apply(lambda x: data[data['Cabin_deck']==x].shape[0])
data['Cabin_deck_size'] = data['Cabin_deck_size'].astype('int64')
data.loc[data['Cabin_number'].notna(), 'Cabin_number_size'] = data.loc[data['Cabin_deck'].notna()].apply(lambda row: data[(data['Cabin_deck'] == row['Cabin_deck']) & (data['Cabin_number'] == row['Cabin_number'])].shape[0], axis=1)
data['Cabin_number_size'] = data['Cabin_number_size'].astype('int64')

In [558]:
data.isna().sum()

PassengerId          0
HomePlanet           0
CryoSleep            0
Destination          0
Age                  0
VIP                  0
RoomService          0
FoodCourt            0
ShoppingMall         0
Spa                  0
VRDeck               0
Transported          0
train                0
Group                0
Group_size           0
Surname              0
Family_size          0
Cabin_deck           0
Cabin_number         0
Cabin_side           0
Cabin_deck_size      0
Cabin_number_size    0
TotalCosts           0
dtype: int64

In [559]:
data['Adult'] = 1
data.loc[data['Age']<18, 'Adult'] = 0

In [560]:
data['AgeGroup'] = data['Age'] // 10

In [561]:
data['ZeroExpenses'] = 1
data.loc[data['TotalCosts'] > 0, 'ZeroExpenses'] = 0

In [565]:
train_data = data[data['train'] == True]
test_data = data[data['train'] == False]
train_data.drop(columns = ['train'], inplace = True)
test_data.drop(columns = ['Transported', 'train'], inplace = True)
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,Family_size,Cabin_deck,Cabin_number,Cabin_side,Cabin_deck_size,Cabin_number_size,TotalCosts,Adult,AgeGroup,ZeroExpenses
0,0001_01,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,...,4,1.0,0.0,0.0,1168,4,0.0,1,3.0,1
1,0002_01,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,...,4,5.0,0.0,1.0,4365,2,736.0,1,2.0,0
2,0003_01,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,...,7,0.0,0.0,1.0,361,4,10383.0,1,5.0,0
3,0003_02,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,...,7,0.0,0.0,1.0,361,4,5176.0,1,3.0,0
4,0004_01,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,...,9,5.0,1.0,1.0,4365,2,1091.0,0,1.0,0


In [566]:
train_data['Transported'] = train_data['Transported'].astype(int)

X = train_data.drop(columns = ['PassengerId', 'Transported'])
y = train_data['Transported']


X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 18, test_size = 0.2)
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,Family_size,Cabin_deck,Cabin_number,Cabin_side,Cabin_deck_size,Cabin_number_size,TotalCosts,Adult,AgeGroup,ZeroExpenses
0,0001_01,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,...,4,1.0,0.0,0.0,1168,4,0.0,1,3.0,1
1,0002_01,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,...,4,5.0,0.0,1.0,4365,2,736.0,1,2.0,0
2,0003_01,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,...,7,0.0,0.0,1.0,361,4,10383.0,1,5.0,0
3,0003_02,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,...,7,0.0,0.0,1.0,361,4,5176.0,1,3.0,0
4,0004_01,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,...,9,5.0,1.0,1.0,4365,2,1091.0,0,1.0,0


In [568]:
start_time = time.time()
xgb_table = pd.DataFrame(columns = ['n_estimators', 'max_depth', 'learning_rate', 'gamma', 'train_accuracy', 'valid_accuracy', 'train_precision_score', 'valid_precision_score', 'train_recall_score', 'valid_recall_score', 'train_f1_score', 'valid_f1_score', 'train_log_loss', 'valid_log_loss'])

for n_estimators in range(25, 226, 50):
    for max_depth in range(3, 7):
        for learning_rate in [1, 0.7, 0.5, 0.3, 0.1, 0.01, 0.05]:
            for gamma in [0, 0.25, 0.5, 1]:
                xgb = XGBClassifier(gamma = gamma, n_estimators = n_estimators, max_depth = max_depth, learning_rate = learning_rate, objective="binary:logistic", use_label_encoder=False, random_state = 18).fit(X_train, y_train)

                xgb_train_predicted = xgb.predict(X_train)
                xgb_valid_predicted = xgb.predict(X_valid)

                xgb_table = xgb_table.append(dict(zip(xgb_table.columns,[n_estimators, max_depth, learning_rate, gamma, accuracy_score(y_train, xgb_train_predicted), accuracy_score(y_valid, xgb_valid_predicted), precision_score(y_train, xgb_train_predicted), precision_score(y_valid, xgb_valid_predicted), recall_score(y_train, xgb_train_predicted), recall_score(y_valid, xgb_valid_predicted), f1_score(y_train, xgb_train_predicted), f1_score(y_valid, xgb_valid_predicted), log_loss(y_train, xgb_train_predicted), log_loss(y_valid, xgb_valid_predicted)])), ignore_index=True)

print('XGB table time:', time.time() - start_time)
xgb_table.sort_values(by = 'valid_accuracy', ascending=False).head(5)

XGB table time: 469.91160893440247


Unnamed: 0,n_estimators,max_depth,learning_rate,gamma,train_accuracy,valid_accuracy,train_precision_score,valid_precision_score,train_recall_score,valid_recall_score,train_f1_score,valid_f1_score,train_log_loss,valid_log_loss
549,225.0,6.0,0.1,0.25,0.947656025,0.828637148,0.947010485,0.819112628,0.94970162,0.838183935,0.948354143,0.82853855,1.807918339,5.918736344
485,225.0,4.0,0.5,0.25,0.977423066,0.826912018,0.969815539,0.828235294,0.98607559,0.819557625,0.977877977,0.82387361,0.779792098,5.978314224
321,125.0,6.0,0.3,0.25,0.978861087,0.826336975,0.974662162,0.820483314,0.983802217,0.830034924,0.979210861,0.825231481,0.730122533,5.998180108
540,225.0,6.0,0.5,0.0,0.999712396,0.825186889,0.99943198,0.829964328,1.0,0.812572759,0.999715909,0.821176471,0.009933729,6.037896703
209,75.0,6.0,0.3,0.25,0.950819672,0.825186889,0.949872557,0.821552723,0.953111679,0.825378347,0.951489362,0.823461092,1.698648699,6.037901761


In [570]:
def make_transported_boolean():
    test_data.replace({'Transported': {1 : True, 0: False}}, inplace = True)

In [572]:
xgb_clf = XGBClassifier(gamma = 0.25, n_estimators = 225, max_depth = 6, learning_rate = 0.1, objective="binary:logistic", use_label_encoder=False, random_state = 18).fit(X, y)

test_data['Transported'] = xgb_clf.predict(test_data.drop(columns = ['PassengerId']))
test_data.replace({'Transported': {1 : True, 0: False}}, inplace = True)
test_data[['PassengerId', 'Transported']].to_csv('submission.csv',index=False)