# Spaceship Titanic

## Setup

In [20]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
df = pd.read_csv("data/train.csv")
df.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
158,0179_03,Earth,False,G/26/P,TRAPPIST-1e,3.0,False,0.0,0.0,0.0,0.0,0.0,Sallyl Webstephrey,True
5472,5836_01,Europa,True,B/222/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Phah Irintious,True
5203,5550_01,Earth,False,G/895/S,TRAPPIST-1e,33.0,False,127.0,745.0,9.0,0.0,0.0,Robyna Poperez,True
2546,2730_03,Earth,True,G/433/S,TRAPPIST-1e,16.0,False,0.0,,0.0,0.0,0.0,Elancy Douglasen,True
1333,1412_01,Earth,False,G/217/S,55 Cancri e,15.0,False,0.0,0.0,,0.0,832.0,Andona Winsley,False


In [22]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


## Feature Engineering

In [23]:
def process_data(df):
    featuresToDrop = ["PassengerId", "Destination", "Name", "Cabin", "HomePlanet", "Num"]
    df[['Deck', 'Num', 'Side']] = df.Cabin.str.split('/', expand=True)  
    df.drop(columns=featuresToDrop, inplace=True)
    
    df["CryoSleep"] = df["CryoSleep"].astype(bool)
    df["VIP"] = df["VIP"].astype(bool)
    df["Deck"] = df["Deck"].astype("category")
    df["Side"] = df["Side"].astype("category")


process_data(df)
df.sample(5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
1548,False,21.0,False,305.0,0.0,1.0,618.0,2.0,True,G,P
2980,False,,False,16.0,0.0,799.0,0.0,37.0,False,G,P
2316,False,14.0,False,148.0,0.0,34.0,808.0,0.0,False,E,S
3737,False,17.0,False,3.0,83.0,927.0,0.0,1.0,True,G,P
2199,False,13.0,False,880.0,0.0,0.0,0.0,0.0,False,F,S


## Train

In [24]:
y = df["Transported"]
df.drop(columns=["Transported"], inplace=True)
x = df

In [32]:
best_model, best_acc = None, -1

for i in tqdm(range(100)):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x_train.head()
    train = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
    param = {
        "max_depth": 4,
        "eta": 0.05,
        "objective": "multi:softprob",
        "num_class": 3
    }
    epochs = 75

    model = xgb.train(param, train, epochs)
    predictions = np.argmax(model.predict(xgb.DMatrix(x, enable_categorical=True)), axis=1)
    acc = accuracy_score(predictions, y)
    if acc >= best_acc:
        best_acc = acc
        best_model = model
        
best_model.save_model("model.json") 
print("Best model found with an accuracy of: " + str(best_acc))
print("Model created and saved!!!")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:45<00:00,  2.22it/s]

Best model found with an accuracy of: 0.8131830208213505
Model created and saved!!!





## Run Model w/ Test Set

In [33]:
test_set = pd.read_csv("data/test.csv")
predictions_series = test_set["PassengerId"]
process_data(test_set)

model = xgb.Booster()
model.load_model("model.json")
survived = model.predict(xgb.DMatrix(test_set, enable_categorical=True))
prediction_list = np.argmax(survived, axis=1).astype(bool).tolist()
predictions = pd.DataFrame(predictions_series)
predictions["Transported"] = prediction_list
predictions.head()
predictions.to_csv("results.csv", index=False)
print("Ran model & saved results to results.csv!!!")

Ran model & saved results to results.csv!!!
