# Spaceship Titanic

## Setup

In [160]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [161]:
df = pd.read_csv("data/train.csv")
df.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
4925,5252_01,Earth,False,F/1008/S,TRAPPIST-1e,35.0,False,202.0,1309.0,60.0,0.0,0.0,Elanie Langatein,True
7844,8371_01,Earth,False,F/1605/S,55 Cancri e,52.0,False,0.0,0.0,638.0,115.0,0.0,Everly Lewinez,True
1541,1633_01,Earth,True,G/261/P,TRAPPIST-1e,4.0,False,0.0,0.0,0.0,0.0,,Ferry Thony,True
7436,7950_03,Earth,False,E/516/P,55 Cancri e,22.0,False,10.0,0.0,624.0,0.0,85.0,Done Ingston,True
882,0944_02,Earth,False,G/137/P,55 Cancri e,8.0,False,0.0,0.0,0.0,0.0,0.0,Terrie Weaverays,False


In [162]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Feature Engineering

In [163]:
def process_data(df):
    df["CryoSleep"] = df["CryoSleep"].astype(bool)
    df["VIP"] = df["VIP"].astype(bool)
    featuresToDrop = ["PassengerId", "Destination", "Name", "Cabin", "HomePlanet"]
    
    deck, side = [], []
    for row in df.itertuples():
        if not pd.isnull(row.Cabin):
            deck.append(row.Cabin.split("/")[0])
            side.append(row.Cabin.split("/")[2])
        else:
            deck.append(np.NaN)
            side.append(np.NaN)
    
    df.drop(columns=featuresToDrop, inplace=True)

    #df["Deck"] = deck
    #df["Side"] = side

process_data(df)
df.sample(5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
501,True,30.0,False,0.0,0.0,0.0,0.0,0.0,True
6590,False,28.0,True,1998.0,0.0,13.0,87.0,0.0,False
7904,False,25.0,False,1.0,0.0,1.0,14.0,1491.0,False
7683,False,39.0,False,19.0,3603.0,0.0,421.0,517.0,True
1655,False,67.0,False,26.0,809.0,0.0,39.0,26.0,False


## Train

In [164]:
y = df["Transported"]
df.drop(columns=["Transported"], inplace=True)
x = df

In [165]:
best_model, best_acc = None, -1

for i in tqdm(range(100)):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x_train.head()
    train = xgb.DMatrix(x_train, label=y_train)
    param = {
        "max_depth": 3,
        "eta": 0.05,
        "objective": "multi:softprob",
        "num_class": 3
    }
    epochs = 25

    model = xgb.train(param, train, epochs)
    predictions = np.argmax(model.predict(xgb.DMatrix(x)), axis=1)
    acc = accuracy_score(predictions, y)
    if acc >= best_acc:
        best_acc = acc
        best_model = model
        
best_model.save_model("model.json") 
print("Best model found with an accuracy of: " + str(best_acc))
print("Model created and saved!!!")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00,  9.46it/s]


Best model found with an accuracy of: 0.7853445300816749
Model created and saved!!!


## Run Model w/ Test Set

In [184]:
test_set = pd.read_csv("data/test.csv")
predictions_series = test_set["PassengerId"]
process_data(test_set)

model = xgb.Booster()
model.load_model("model.json")
survived = model.predict(xgb.DMatrix(test_set))
prediction_list = np.argmax(survived, axis=1).astype(bool).tolist()
predictions = pd.DataFrame(predictions_series)
predictions["Transported"] = prediction_list
predictions.head()
predictions.to_csv("results.csv", index=False)
print("Ran model & saved results to results.csv!!!")

Ran model & saved results to results.csv!!!
