# tabular classification - Spaceship Titanic

In [1]:
is_first = False

if is_first:
    import zipfile
    file = zipfile.ZipFile("spaceship-titanic.zip")
    file.extractall()

In [2]:
import pandas as pd
import ydata_profiling
import numpy as np

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [6]:
# train.profile_report()

In [7]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# group id
train['GroupId'] = train['PassengerId'].apply(lambda x: int(x.split('_')[0]))

In [4]:
# split cabin
for idx in range(3):
    train.loc[train['Cabin'].notna(), 'Cabin_' + str(idx)] = train.loc[train['Cabin'].notna(), 'Cabin'].astype(str).apply(lambda x: list(x.split('/'))[idx])

In [10]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupId,Cabin_0,Cabin_1,Cabin_2
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,F,1,S


In [5]:
# drop
drop_cols = ['PassengerId', 'Cabin', 'Name']

train = train.drop(columns=drop_cols, axis=1)

In [6]:
train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupId,Cabin_0,Cabin_1,Cabin_2
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,F,1,S


In [7]:
# dummies 

dum_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'GroupId', 'Cabin_0', 'Cabin_1', 'Cabin_2']

train = pd.get_dummies(train, columns=dum_cols, drop_first=True)

In [8]:
from sklearn.model_selection import train_test_split

target = 'Transported'

X = train.drop(target, axis=1)
y = train.loc[:, target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y)

In [15]:
y_train.head()

5505     True
1512     True
6430     True
3772    False
2573    False
Name: Transported, dtype: bool

In [9]:
# missing values

from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

In [10]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((6085, 8052), (6085,), (2608, 8052), (2608,))

In [18]:
y_train = y_train.replace({True:1, False:0})
y_val = y_val.replace({True:1, False:0})

In [19]:
y_train = np.expand_dims(y_train.values, axis=0)
y_val = np.expand_dims(y_val.values, axis=0)

In [20]:
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

In [21]:
y_train.shape

(6085, 1)

In [22]:
train_set = pd.DataFrame(np.concatenate((X_train, y_train), axis=1))
val_set = pd.DataFrame(np.concatenate((X_val, y_val), axis=1))

In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [27]:
# modelling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

lr = LogisticRegression()
lr.fit(X_train, y_train)

dt = DecisionTreeClassifier(max_depth=10)
dt.fit(X_train, y_train)

rf = RandomForestClassifier(max_depth=10)
rf.fit(X_train, y_train)

xgb = XGBClassifier(max_depth=10)
xgb.fit(X_train, y_train)

lgbm = LGBMClassifier(max_depth=10)
lgbm.fit(X_train, y_train)

cb = CatBoostClassifier(max_depth=10)
cb.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.022275
0:	learn: 0.6809220	total: 112ms	remaining: 1m 51s
1:	learn: 0.6690927	total: 160ms	remaining: 1m 19s
2:	learn: 0.6567626	total: 208ms	remaining: 1m 9s
3:	learn: 0.6438232	total: 256ms	remaining: 1m 3s
4:	learn: 0.6343444	total: 302ms	remaining: 1m
5:	learn: 0.6245069	total: 347ms	remaining: 57.4s
6:	learn: 0.6151587	total: 394ms	remaining: 55.9s
7:	learn: 0.6065087	total: 443ms	remaining: 55s
8:	learn: 0.5987069	total: 491ms	remaining: 54.1s
9:	learn: 0.5920635	total: 518ms	remaining: 51.3s
10:	learn: 0.5849536	total: 566ms	remaining: 50.9s
11:	learn: 0.5784847	total: 618ms	remaining: 50.9s
12:	learn: 0.5697856	total: 668ms	remaining: 50.7s
13:	learn: 0.5632198	total: 717ms	remaining: 50.5s
14:	learn: 0.5563856	total: 766ms	remaining: 50.3s
15:	learn: 0.5522948	total: 784ms	remaining: 48.2s
16:	learn: 0.5472070	total: 833ms	remaining: 48.2s
17:	learn: 0.5417539	total: 880ms	remaining: 48s
18:	learn: 0.5367570	total: 928ms	remaining: 47.9s
19:	learn: 0.533

<catboost.core.CatBoostClassifier at 0x7f6dcb0b12e0>

In [28]:
from sklearn.metrics import accuracy_score

lr_pred = lr.predict(X_val)
dt_pred = dt.predict(X_val)
rf_pred = rf.predict(X_val)
xgb_pred = xgb.predict(X_val)
lgbm_pred = lgbm.predict(X_val)
cb_pred = cb.predict(X_val)

print(accuracy_score(y_val, lr_pred))
print(accuracy_score(y_val, dt_pred))
print(accuracy_score(y_val, rf_pred))
print(accuracy_score(y_val, xgb_pred))
print(accuracy_score(y_val, lgbm_pred))
print(accuracy_score(y_val, cb_pred))

0.7074386503067485
0.7783742331288344
0.7511503067484663
0.8059815950920245
0.8094325153374233
0.8086656441717791


In [25]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

params = {
    'n_estimator' : range(10, 101, 10),
    'max_depth' : range(10, 15, 5)
}

xgb_grid = GridSearchCV(XGBClassifier(), params, scoring='accuracy', n_jobs=-1, verbose=2)

xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


: 

: 