In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
from sklearn.model_selection import train_test_split


X = df.drop('Transported', axis=1)
y = df.loc[:, 'Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.impute import SimpleImputer

num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
num_imputer = SimpleImputer(strategy='median')

X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.transform(X_test[num_cols])

In [5]:
cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])

In [6]:
def update_dataset(dataframe):
    df_cleaned = dataframe.copy()

    df_cleaned['cabin_deck'] = df_cleaned['Cabin'].apply(lambda x: x.split('/')[0] if x != 'Unknown' else 'U')
    df_cleaned['cabin_num'] = df_cleaned['Cabin'].apply(lambda x: int(x.split('/')[1]) if x != 'Unknown' else -1)
    df_cleaned['cabin_side'] = df_cleaned['Cabin'].apply(lambda x: x.split('/')[2] if x != 'Unknown' else 'U')

    df_cleaned['passenger_gggg'] = df_cleaned['PassengerId'].apply(lambda x: int(x.split('_')[0]))
    df_cleaned['passenger_pp'] = df_cleaned['PassengerId'].apply(lambda x: int(x.split('_')[1]))

    df_cleaned = df_cleaned.drop(['PassengerId', 'Name', 'Cabin'], axis=1)

    return df_cleaned

In [7]:
X_train_cleaned = update_dataset(X_train)
X_test_cleaned = update_dataset(X_test)

In [8]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer


categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_deck', 'cabin_side']
numerical_cols = [col for col in X_train_cleaned.columns if col not in categorical_cols]

for col in categorical_cols:
    X_train_cleaned[col] = X_train_cleaned[col].astype(str)
    X_test_cleaned[col] = X_test_cleaned[col].astype(str)

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

column_transformer = ColumnTransformer(transformers=[
    ('num', MinMaxScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', dtype=int), categorical_cols)
])

X_train_processed = column_transformer.fit_transform(X_train_cleaned)
X_test_processed = column_transformer.transform(X_test_cleaned)

cat_feature_names = column_transformer.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(cat_feature_names)

X_train_final = pd.DataFrame(X_train_processed.toarray() if hasattr(X_train_processed, 'toarray') else X_train_processed,
                             columns=all_feature_names)
X_test_final = pd.DataFrame(X_test_processed.toarray() if hasattr(X_test_processed, 'toarray') else X_test_processed,
                            columns=all_feature_names)

In [9]:
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train_final, X_test_final, y_train, y_test)

models

  0%|          | 0/31 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1930
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.81,0.81,0.81,0.81,0.07
RandomForestClassifier,0.8,0.8,0.8,0.8,0.61
XGBClassifier,0.79,0.79,0.79,0.79,0.17
ExtraTreesClassifier,0.79,0.79,0.79,0.79,0.43
SVC,0.79,0.79,0.79,0.79,0.8
NuSVC,0.79,0.79,0.79,0.79,1.02
AdaBoostClassifier,0.79,0.79,0.79,0.79,0.24
LogisticRegression,0.78,0.78,0.78,0.78,0.02
BaggingClassifier,0.78,0.78,0.78,0.78,0.25
LinearSVC,0.78,0.78,0.78,0.78,0.03


In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

X = df.drop('Transported', axis=1)
y = df.loc[:, 'Transported']

num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
num_imputer = SimpleImputer(strategy='median')

X[num_cols] = num_imputer.fit_transform(X[num_cols])

cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')

X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

X = update_dataset(X)


categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_deck', 'cabin_side']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

for col in categorical_cols:
    X[col] = X[col].astype(str)

column_transformer = ColumnTransformer(transformers=[
    ('num', MinMaxScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', dtype=int), categorical_cols)
])

X_processed = column_transformer.fit_transform(X)

cat_feature_names = column_transformer.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(cat_feature_names)

X_train_final = pd.DataFrame(X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed,
                             columns=all_feature_names)

In [12]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

model = LGBMClassifier()
model.fit(X_train_final, y_train)
y_pred_test = model.predict(X_test_final)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1909
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


In [13]:
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

Accuracy: 0.8027602070155262
              precision    recall  f1-score   support

       False       0.81      0.79      0.80       861
        True       0.80      0.81      0.81       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



In [14]:
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.00,False,0.00,0.00,0.00,0.00,0.00,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.00,False,0.00,9.00,0.00,2823.00,0.00,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.00,False,0.00,0.00,0.00,0.00,0.00,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.00,False,0.00,6652.00,0.00,181.00,585.00,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.00,False,10.00,0.00,635.00,0.00,0.00,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.00,False,0.00,0.00,0.00,0.00,0.00,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.00,False,0.00,847.00,17.00,10.00,144.00,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.00,0.00,0.00,0.00,0.00,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.00,2680.00,0.00,0.00,523.00,Kitakan Conale


In [15]:
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [16]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

test_df = update_dataset(test_df)

categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_deck', 'cabin_side']
numerical_cols = [col for col in test_df.columns if col not in categorical_cols]

for col in categorical_cols:
    test_df[col] = X_train_cleaned[col].astype(str)

test_df_processed = column_transformer.transform(test_df)

cat_feature_names = column_transformer.named_transformers_['cat'].get_feature_names_out(categorical_cols)
all_feature_names = numerical_cols + list(cat_feature_names)


test_df_final = pd.DataFrame(test_df_processed.toarray() if hasattr(test_df_processed, 'toarray') else test_df_processed,
                            columns=all_feature_names)

In [17]:
test_df_final

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,cabin_num,passenger_gggg,passenger_pp,HomePlanet_Europa,...,cabin_deck_B,cabin_deck_C,cabin_deck_D,cabin_deck_E,cabin_deck_F,cabin_deck_G,cabin_deck_T,cabin_deck_U,cabin_side_S,cabin_side_U
0,0.34,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,0.24,0.00,0.00,0.00,0.15,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00
2,0.39,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
3,0.48,0.00,0.24,0.00,0.01,0.02,0.00,0.00,0.00,1.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00
4,0.25,0.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,1.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.43,0.00,0.00,0.00,0.00,0.00,0.79,1.00,0.14,0.00,...,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00
4273,0.53,0.00,0.03,0.00,0.00,0.01,0.00,1.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4274,0.34,0.00,0.00,0.00,0.00,0.00,0.16,1.00,0.00,0.00,...,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,0.00
4275,0.34,0.00,0.10,0.00,0.00,0.02,0.16,1.00,0.00,1.00,...,0.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [18]:
y_pred = model.predict(test_df_final)
y_pred

array([False, False,  True, ..., False,  True,  True])

In [19]:
test_df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,cabin_deck,cabin_num,cabin_side,passenger_gggg,passenger_pp
0,,,,27.00,,0.00,0.00,0.00,0.00,0.00,,3,,13,1
1,Earth,False,TRAPPIST-1e,19.00,False,0.00,9.00,0.00,2823.00,0.00,F,4,S,18,1
2,Europa,False,TRAPPIST-1e,31.00,True,0.00,0.00,0.00,0.00,0.00,A,0,S,19,1
3,Europa,False,TRAPPIST-1e,38.00,False,0.00,6652.00,0.00,181.00,585.00,A,1,S,21,1
4,Earth,False,TRAPPIST-1e,20.00,False,10.00,0.00,635.00,0.00,0.00,F,5,S,23,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,Earth,False,TRAPPIST-1e,34.00,False,0.00,0.00,0.00,0.00,0.00,E,1496,S,9266,2
4273,,,,42.00,,0.00,847.00,17.00,10.00,144.00,,-1,,9269,1
4274,Mars,False,TRAPPIST-1e,27.00,False,0.00,0.00,0.00,0.00,0.00,F,296,P,9271,1
4275,Europa,False,PSO J318.5-22,27.00,True,0.00,2680.00,0.00,0.00,523.00,C,297,P,9273,1


In [20]:
test_df = pd.read_csv('test.csv')

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported' : y_pred.astype(bool)
})
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,True


In [21]:
submission.to_csv('lightgbm-easyshit-2.csv', index=False)