# Spaceship Titanic

The task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. More details here: https://www.kaggle.com/competitions/spaceship-titanic/overview

# 0. Necessary Imports

In [1]:
import pandas as pd
import numpy as np

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# **1. Data Preparation**

## **1.1. Train data cleaning**

In [2]:
# Load the training dataset
titanic_train_df = pd.read_csv('train.csv')

# Drop Name: it is not used as a feature
titanic_train_df = titanic_train_df.drop(columns=['Name'])

# Make PassengerId relevant: split it on the first "_" into a group part and
# a number part, then drop the original string column
titanic_train_df[['PassengerGroup', 'PassengerNumber']] = titanic_train_df["PassengerId"].str.split("_", n=1, expand=True)
titanic_train_df = titanic_train_df.drop(columns=['PassengerId'])

# Make Cabin relevant: split it on "/" into Deck, Num and Side; only Deck and
# Side are kept as features
titanic_train_df[['Deck', 'Num', 'Side']] = titanic_train_df['Cabin'].str.split('/', expand=True)
titanic_train_df = titanic_train_df.drop(columns=['Cabin', 'Num'])

# Fill VIP: a passenger with unknown VIP status is marked VIP when they spend
# more than the known-VIP average on more than two of the five services
service_column = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
vip_mean = titanic_train_df[titanic_train_df['VIP'] == 1][service_column].mean()
service_condition = (titanic_train_df[service_column] > vip_mean).sum(axis=1)
vip_missing = titanic_train_df['VIP'].isnull()
titanic_train_df.loc[vip_missing, 'VIP'] = np.where(service_condition[vip_missing] > 2, 1, 0)

# Fill CryoSleep: impute ONLY the missing flags (True when every service
# amount is exactly 0; a NaN amount compares unequal to 0, so it yields False).
# Recorded CryoSleep values are kept as they are instead of being overwritten.
is_cryo = (titanic_train_df[service_column] == 0).all(axis=1)
cryo_missing = titanic_train_df['CryoSleep'].isnull()
titanic_train_df.loc[cryo_missing, 'CryoSleep'] = is_cryo[cryo_missing]

# Fill services: replace missing amounts with the column mean of the
# passenger's VIP class, then force every amount to 0 for cryo-sleepers
for column in service_column:
    for boolean in [True, False]:
        is_vip = titanic_train_df['VIP'] == boolean
        mean = titanic_train_df.loc[is_vip, column].mean()
        titanic_train_df.loc[is_vip, column] = titanic_train_df.loc[is_vip, column].fillna(mean)
# `== True` is kept on purpose: the column may still have object dtype here
titanic_train_df.loc[titanic_train_df['CryoSleep'] == True, service_column] = 0.0

# Fill HomePlanet: missing values become 'Europa' for VIPs, 'Earth' otherwise
is_vip = (titanic_train_df['VIP'] == True)
titanic_train_df.loc[titanic_train_df['HomePlanet'].isnull() & is_vip, 'HomePlanet'] = 'Europa'
titanic_train_df.loc[titanic_train_df['HomePlanet'].isnull() & ~is_vip, 'HomePlanet'] = 'Earth'

# Fill Deck: missing decks get a fixed default chosen by home planet
titanic_train_df.loc[(titanic_train_df['HomePlanet'] == 'Earth') & (titanic_train_df['Deck'].isnull()), 'Deck'] = 'G'
titanic_train_df.loc[(titanic_train_df['HomePlanet'] == 'Mars') & (titanic_train_df['Deck'].isnull()), 'Deck'] = 'F'
titanic_train_df.loc[(titanic_train_df['HomePlanet'] == 'Europa') & (titanic_train_df['Deck'].isnull()), 'Deck'] = 'C'

# Fill Side with the most frequent value
titanic_train_df['Side'] = titanic_train_df['Side'].fillna(titanic_train_df['Side'].mode()[0])

# Fill Destination with the most frequent value
titanic_train_df['Destination'] = titanic_train_df['Destination'].fillna(titanic_train_df['Destination'].mode()[0])

# Fill Age with the column mean
titanic_train_df['Age'] = titanic_train_df['Age'].fillna(titanic_train_df['Age'].mean())

# Manage column types and encode categorical variables
titanic_train_df['PassengerGroup'] = titanic_train_df['PassengerGroup'].astype(int)
titanic_train_df['PassengerNumber'] = titanic_train_df['PassengerNumber'].astype(int)
titanic_train_df['VIP'] = titanic_train_df['VIP'].astype(bool)
titanic_train_df['CryoSleep'] = titanic_train_df['CryoSleep'].astype(bool)
# Side becomes a boolean flag: 'S' -> True, 'P' -> False. A direct comparison
# avoids writing integers into a string column before casting.
titanic_train_df['Side'] = titanic_train_df['Side'] == 'S'
titanic_train_df = pd.get_dummies(titanic_train_df, columns=['HomePlanet', 'Destination', 'Deck'])

In [3]:
# Load the test dataset
# NOTE(review): every imputation statistic below (means, modes) is computed
# from the test frame itself, not reused from the training frame — confirm
# this is intended.
titanic_test_df = pd.read_csv('test.csv')

# Drop Name: it is not used as a feature
titanic_test_df = titanic_test_df.drop(columns=['Name'])

# Make PassengerId relevant: split it on the first "_" into a group part and
# a number part, then drop the original string column
titanic_test_df[['PassengerGroup', 'PassengerNumber']] = titanic_test_df["PassengerId"].str.split("_", n=1, expand=True)
titanic_test_df = titanic_test_df.drop(columns=['PassengerId'])

# Make Cabin relevant: split it on "/" into Deck, Num and Side; only Deck and
# Side are kept as features
titanic_test_df[['Deck', 'Num', 'Side']] = titanic_test_df['Cabin'].str.split('/', expand=True)
titanic_test_df = titanic_test_df.drop(columns=['Cabin', 'Num'])

# Fill VIP: a passenger with unknown VIP status is marked VIP when they spend
# more than the known-VIP average on more than two of the five services
service_column = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
vip_mean = titanic_test_df[titanic_test_df['VIP'] == 1][service_column].mean()
service_condition = (titanic_test_df[service_column] > vip_mean).sum(axis=1)
vip_missing = titanic_test_df['VIP'].isnull()
titanic_test_df.loc[vip_missing, 'VIP'] = np.where(service_condition[vip_missing] > 2, 1, 0)

# Fill CryoSleep: impute ONLY the missing flags (True when every service
# amount is exactly 0; a NaN amount compares unequal to 0, so it yields False).
# Recorded CryoSleep values are kept as they are instead of being overwritten.
is_cryo = (titanic_test_df[service_column] == 0).all(axis=1)
cryo_missing = titanic_test_df['CryoSleep'].isnull()
titanic_test_df.loc[cryo_missing, 'CryoSleep'] = is_cryo[cryo_missing]

# Fill services: replace missing amounts with the column mean of the
# passenger's VIP class, then force every amount to 0 for cryo-sleepers
for column in service_column:
    for boolean in [True, False]:
        is_vip = titanic_test_df['VIP'] == boolean
        mean = titanic_test_df.loc[is_vip, column].mean()
        titanic_test_df.loc[is_vip, column] = titanic_test_df.loc[is_vip, column].fillna(mean)
# `== True` is kept on purpose: the column may still have object dtype here
titanic_test_df.loc[titanic_test_df['CryoSleep'] == True, service_column] = 0.0

# Fill HomePlanet: missing values become 'Europa' for VIPs, 'Earth' otherwise
is_vip = (titanic_test_df['VIP'] == True)
titanic_test_df.loc[titanic_test_df['HomePlanet'].isnull() & is_vip, 'HomePlanet'] = 'Europa'
titanic_test_df.loc[titanic_test_df['HomePlanet'].isnull() & ~is_vip, 'HomePlanet'] = 'Earth'

# Fill Deck: missing decks get a fixed default chosen by home planet
titanic_test_df.loc[(titanic_test_df['HomePlanet'] == 'Earth') & (titanic_test_df['Deck'].isnull()), 'Deck'] = 'G'
titanic_test_df.loc[(titanic_test_df['HomePlanet'] == 'Mars') & (titanic_test_df['Deck'].isnull()), 'Deck'] = 'F'
titanic_test_df.loc[(titanic_test_df['HomePlanet'] == 'Europa') & (titanic_test_df['Deck'].isnull()), 'Deck'] = 'C'

# Fill Side with the most frequent value
titanic_test_df['Side'] = titanic_test_df['Side'].fillna(titanic_test_df['Side'].mode()[0])

# Fill Destination with the most frequent value
titanic_test_df['Destination'] = titanic_test_df['Destination'].fillna(titanic_test_df['Destination'].mode()[0])

# Fill Age with the column mean
titanic_test_df['Age'] = titanic_test_df['Age'].fillna(titanic_test_df['Age'].mean())

# Manage column types and encode categorical variables
titanic_test_df['PassengerGroup'] = titanic_test_df['PassengerGroup'].astype(int)
titanic_test_df['PassengerNumber'] = titanic_test_df['PassengerNumber'].astype(int)
titanic_test_df['VIP'] = titanic_test_df['VIP'].astype(bool)
titanic_test_df['CryoSleep'] = titanic_test_df['CryoSleep'].astype(bool)
# Side becomes a boolean flag: 'S' -> True, 'P' -> False. A direct comparison
# avoids writing integers into a string column before casting.
titanic_test_df['Side'] = titanic_test_df['Side'] == 'S'
titanic_test_df = pd.get_dummies(titanic_test_df, columns=['HomePlanet', 'Destination', 'Deck'])

# **2. Fitting the Models and Classification**

In [4]:
# Split into train and validation
features = titanic_train_df.drop(['Transported'], axis=1)
target = titanic_train_df['Transported']
X_train, X_val, Y_train, Y_val = train_test_split(features, target, test_size=0.2, random_state=0)

# Align the test frame with the training feature columns (same set, same
# order). The two frames were one-hot encoded separately, so a category seen
# in only one of them would otherwise shift or break the columns fed to the
# scaler. Indicator columns absent from the test frame are filled with 0, and
# any extra column (such as a PassengerId left by a previous run of this cell)
# is dropped.
test_features = titanic_test_df.reindex(columns=features.columns, fill_value=0)

# Normalize train, validation and test data with statistics fitted on the
# training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test_features)

# Rebuild the "gggg_pp" submission id once, outside the model loop, since it
# does not depend on the model
titanic_test_df['PassengerId'] = titanic_test_df['PassengerGroup'].astype(str).str.zfill(4) + '_' + \
                                 titanic_test_df['PassengerNumber'].astype(str).str.zfill(2)

models = {
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=600),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "SVC": SVC(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "RandomForestClassifier": RandomForestClassifier()
}

for model_name, model in models.items():
    model.fit(X_train, Y_train)

    # Report held-out accuracy so the submissions can be compared; the
    # validation split was previously created but never used
    val_accuracy = model.score(X_val, Y_val)
    print(f"{model_name}: validation accuracy = {val_accuracy:.4f}")

    predictions = model.predict(X_test)

    predictions_df = pd.DataFrame({
        'PassengerId': titanic_test_df['PassengerId'],
        'Transported': predictions
    })

    # One submission file per model, written to the working directory
    predictions_df.to_csv(f"{model_name}_predictions.csv", index=False)