In [1]:
import pandas as pd

In [2]:
full_df = pd.read_csv('dataset/train.csv')
df = full_df.copy()
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2)

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
name_imputer = SimpleImputer(strategy='constant', fill_value='Unknown Unknown')
num_imputer = SimpleImputer(strategy='median')
spend_imputer = SimpleImputer(strategy='constant', fill_value=0)
cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

In [6]:
num_col = ['Age']
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

df_train.loc[:, 'Name'] = name_imputer.fit_transform(df_train.loc[:, 'Name'].to_frame())
df_train.loc[:, num_col] = num_imputer.fit_transform(df_train.loc[:, num_col])
df_train.loc[:, spend_cols] = spend_imputer.fit_transform(df_train.loc[:, spend_cols])
df_train.loc[:, cat_cols] = cat_imputer.fit_transform(df_train.loc[:, cat_cols])

In [7]:
def update_df(dataframe): 
    df = dataframe.copy()

    df.loc[:, 'FirstName'] = df.loc[:, 'Name'].apply(lambda x: x.split()[0])
    df.loc[:, 'LastName'] = df.loc[:, 'Name'].apply(lambda x: x.split()[1])

    df.loc[:, 'PassengerGGGG'] = df.loc[:, 'PassengerId'].apply(lambda x: int(x.split('_')[0]))
    df.loc[:, 'PassengerPP'] = df.loc[:, 'PassengerId'].apply(lambda x: int(x.split('_')[1]))
    
    df.loc[:, 'NumberOfFellows'] = df.groupby('PassengerGGGG')['PassengerGGGG'].transform('count')
    df.loc[:, 'IsAlone'] = df.loc[:, 'NumberOfFellows'].apply(lambda x: 1 if x == 1 else 0)

    df['cabin_deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if x != 'Unknown' else 'U')
    df['cabin_num'] = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if x != 'Unknown' else -1)
    df['cabin_side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if x != 'Unknown' else 'U')

    df.loc[:, 'TotalSpend'] = df.loc[:, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    df['IsSpendingZero'] = (df['TotalSpend'] == 0).astype(int)
    
    df.drop(['Cabin', 'Name', 'PassengerId'], inplace=True, axis=1)

    return df

In [8]:
df_train_updated = update_df(df_train)

In [9]:
lastnames_have_impact_mask = df_train_updated.groupby('LastName')['Transported'].mean() >= 0.5
lastnames_have_impact_mask = lastnames_have_impact_mask[lastnames_have_impact_mask].index.tolist()

firstnames_have_impact_mask = df_train_updated.groupby('FirstName')['Transported'].mean() >= 0.5
firstnames_have_impact_mask = firstnames_have_impact_mask[firstnames_have_impact_mask].index.tolist()

passenger_gggg_have_impact_mask = df_train_updated.groupby('PassengerGGGG')['Transported'].mean() >= 0.5
passenger_gggg_have_impact_mask = passenger_gggg_have_impact_mask[passenger_gggg_have_impact_mask].index.tolist()


df_train_updated.loc[:, 'FirstNameHaveImpact'] = df_train_updated.loc[:, 'FirstName'].apply(lambda x: 1 if x in firstnames_have_impact_mask else 0)
df_train_updated.loc[:, 'LastNameHaveImpact'] = df_train_updated.loc[:, 'LastName'].apply(lambda x: 1 if x in lastnames_have_impact_mask else 0)
df_train_updated.loc[:, 'PassengerGGGGHaveImpact'] = df_train_updated.loc[:, 'PassengerGGGG'].apply(lambda x: 1 if x in passenger_gggg_have_impact_mask else 0)

df_train_updated.drop(['FirstName', 'LastName', 'PassengerGGGG'], inplace=True, axis=1)

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
num_scaler = StandardScaler()

In [12]:
num_cols = df_train_updated.select_dtypes('float').columns.tolist()
df_train_updated.loc[:, num_cols] = num_scaler.fit_transform(df_train_updated.loc[:, num_cols])

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
train_cat_data = df_train_updated.select_dtypes('object')
train_cat_data.loc[:, ['CryoSleep', 'VIP']] = train_cat_data.loc[:, ['CryoSleep', 'VIP']].astype('str')

one_hot_encoder = OneHotEncoder(drop='first', dtype=int)

train_cat_data_encoded = one_hot_encoder.fit_transform(train_cat_data)
train_encoded_feature_names = one_hot_encoder.get_feature_names_out(train_cat_data.columns)

df_train_cat_encoded = pd.DataFrame(train_cat_data_encoded.toarray(), columns = train_encoded_feature_names, index=train_cat_data.index)

df_train_non_cat = df_train_updated.drop(train_cat_data.columns, axis=1)
df_train_encoded = pd.concat([df_train_non_cat, df_train_cat_encoded], axis=1)

In [15]:
##### TEST PART

In [16]:
num_col = ['Age']
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

df_test.loc[:, 'Name'] = name_imputer.transform(df_test.loc[:, 'Name'].to_frame())
df_test.loc[:, num_col] = num_imputer.transform(df_test.loc[:, num_col])
df_test.loc[:, spend_cols] = spend_imputer.transform(df_test.loc[:, spend_cols])
df_test.loc[:, cat_cols] = cat_imputer.transform(df_test.loc[:, cat_cols])

In [17]:
df_test_updated = update_df(df_test)

In [18]:
df_test_updated.loc[:, 'FirstNameHaveImpact'] = df_test_updated.loc[:, 'FirstName'].apply(lambda x: 1 if x in firstnames_have_impact_mask else 0)
df_test_updated.loc[:, 'LastNameHaveImpact'] = df_test_updated.loc[:, 'LastName'].apply(lambda x: 1 if x in lastnames_have_impact_mask else 0)
df_test_updated.loc[:, 'PassengerGGGGHaveImpact'] = df_test_updated.loc[:, 'PassengerGGGG'].apply(lambda x: 1 if x in passenger_gggg_have_impact_mask else 0)

df_test_updated.drop(['FirstName', 'LastName', 'PassengerGGGG'], inplace=True, axis=1)

In [19]:
num_cols = df_test_updated.select_dtypes('float').columns.tolist()
df_test_updated.loc[:, num_cols] = num_scaler.transform(df_test_updated.loc[:, num_cols])

In [20]:
test_cat_data = df_test_updated.select_dtypes('object')
test_cat_data.loc[:, ['CryoSleep', 'VIP']] = test_cat_data.loc[:, ['CryoSleep', 'VIP']].astype('str')

test_cat_data_encoded = one_hot_encoder.transform(test_cat_data)
test_encoded_feature_names = one_hot_encoder.get_feature_names_out(test_cat_data.columns)

df_test_cat_encoded = pd.DataFrame(test_cat_data_encoded.toarray(), columns = test_encoded_feature_names, index=test_cat_data.index)

df_test_non_cat = df_test_updated.drop(test_cat_data.columns, axis=1)
df_test_encoded = pd.concat([df_test_non_cat, df_test_cat_encoded], axis=1)

In [21]:
X_train = df_train_encoded.drop('Transported', axis=1)
y_train = df_train_encoded.loc[:, 'Transported']

X_test = df_test_encoded.drop('Transported', axis=1)
y_test = df_test_encoded.loc[:, 'Transported']

In [22]:
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

models

  0%|          | 0/31 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 3505, number of negative: 3449
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1955
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504026 -> initscore=0.016106
[LightGBM] [Info] Start training from score 0.016106


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GaussianNB,0.74,0.74,0.74,0.74,0.04
KNeighborsClassifier,0.7,0.7,0.7,0.7,0.11
LabelPropagation,0.68,0.68,0.68,0.67,1.5
LabelSpreading,0.68,0.68,0.68,0.67,1.98
NearestCentroid,0.68,0.68,0.68,0.67,0.01
PassiveAggressiveClassifier,0.67,0.67,0.67,0.65,0.02
BernoulliNB,0.66,0.66,0.66,0.65,0.02
LogisticRegression,0.64,0.64,0.64,0.62,0.02
AdaBoostClassifier,0.64,0.64,0.64,0.61,0.26
Perceptron,0.63,0.63,0.63,0.61,0.01


In [23]:
df_train_encoded

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerPP,NumberOfFellows,IsAlone,...,cabin_deck_B,cabin_deck_C,cabin_deck_D,cabin_deck_E,cabin_deck_F,cabin_deck_G,cabin_deck_T,cabin_deck_U,cabin_side_S,cabin_side_U
6072,0.15,-0.33,-0.29,1.17,-0.27,-0.26,True,1,1,1,...,0,0,0,1,0,0,0,0,1,0
2163,2.10,-0.33,1.13,-0.28,3.26,-0.25,False,3,3,0,...,0,0,1,0,0,0,0,0,0,0
6353,-0.05,-0.33,-0.29,-0.28,-0.27,-0.26,True,1,3,0,...,0,0,0,1,0,0,0,0,1,0
1302,-0.82,-0.30,-0.29,0.87,-0.04,-0.25,False,1,4,0,...,0,0,0,1,0,0,0,0,1,0
1923,2.73,-0.33,-0.10,-0.28,1.24,-0.25,False,3,4,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4329,-1.03,-0.33,-0.18,0.72,-0.27,-0.26,True,1,1,1,...,0,0,0,0,0,1,0,0,1,0
7418,-0.54,-0.32,-0.29,0.75,-0.27,-0.26,False,1,3,0,...,0,0,0,0,1,0,0,0,0,0
2263,-0.19,-0.33,-0.29,-0.28,-0.27,-0.26,True,1,1,1,...,1,0,0,0,0,0,0,0,1,0
7136,-0.19,-0.33,-0.29,-0.28,-0.27,-0.26,True,1,1,1,...,0,0,0,0,1,0,0,0,1,0


PYTORCH BITCH

In [32]:
TARGET_COL = 'Transported'
FEATURES   = [c for c in df_train_encoded.columns if c != TARGET_COL]

X_train = df_train_encoded[FEATURES].to_numpy(dtype="float32")
y_train = df_train_encoded[TARGET_COL].to_numpy(dtype="float32").reshape(-1, 1)

X_test  = df_test_encoded[FEATURES].to_numpy(dtype="float32")
y_test  = df_test_encoded[TARGET_COL].to_numpy(dtype="float32").reshape(-1, 1)

In [33]:
import torch
from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = TabularDataset(X_train, y_train)
test_ds  = TabularDataset(X_test,  y_test)

In [34]:
BATCH_SIZE = 64
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

In [35]:
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)          # 1 logit → binary outcome
        )
    def forward(self, x):
        return self.net(x).squeeze(1)  # shape (batch,)


In [38]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model  = MLP(in_features=X_train.shape[1]).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 100
for epoch in range(1, EPOCHS + 1):
    model.train()
    epoch_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits = model(xb)            # raw scores
        loss   = criterion(logits, yb.squeeze(1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * xb.size(0)

    avg_loss = epoch_loss / len(train_ds)
    print(f"Epoch {epoch:02d} | train‑loss = {avg_loss:.4f}")

Epoch 01 | train‑loss = 0.7219
Epoch 02 | train‑loss = 0.5508
Epoch 03 | train‑loss = 0.6481
Epoch 04 | train‑loss = 0.3514
Epoch 05 | train‑loss = 0.2756
Epoch 06 | train‑loss = 0.2478
Epoch 07 | train‑loss = 0.2402
Epoch 08 | train‑loss = 0.2165
Epoch 09 | train‑loss = 0.2573
Epoch 10 | train‑loss = 0.2265
Epoch 11 | train‑loss = 0.1987
Epoch 12 | train‑loss = 0.2087
Epoch 13 | train‑loss = 0.2989
Epoch 14 | train‑loss = 0.1905
Epoch 15 | train‑loss = 0.1815
Epoch 16 | train‑loss = 0.1899
Epoch 17 | train‑loss = 0.2391
Epoch 18 | train‑loss = 0.2120
Epoch 19 | train‑loss = 0.1757
Epoch 20 | train‑loss = 0.2029
Epoch 21 | train‑loss = 0.2085
Epoch 22 | train‑loss = 0.1869
Epoch 23 | train‑loss = 0.2044
Epoch 24 | train‑loss = 0.2147
Epoch 25 | train‑loss = 0.1581
Epoch 26 | train‑loss = 0.1603
Epoch 27 | train‑loss = 0.1484
Epoch 28 | train‑loss = 0.1845
Epoch 29 | train‑loss = 0.1494
Epoch 30 | train‑loss = 0.1490
Epoch 31 | train‑loss = 0.1660
Epoch 32 | train‑loss = 0.1908
Epoch 33

In [39]:
from sklearn.metrics import accuracy_score, roc_auc_score

model.eval()
all_logits, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        logits = model(xb)
        all_logits.append(torch.sigmoid(logits).cpu())
        all_labels.append(yb)

y_prob = torch.cat(all_logits).numpy()
y_true = torch.cat(all_labels).numpy()

print("Accuracy :", accuracy_score(y_true, y_prob > 0.5))
print("ROC AUC  :", roc_auc_score(y_true, y_prob))


Accuracy : 0.5899942495687176
ROC AUC  : 0.6458239089545487
