In [1]:
import gender_guesser.detector as gender
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from catboost import CatBoostClassifier, Pool
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

gd = gender.Detector()

**Load the Training Data**

In [2]:
df = pd.read_csv('kaggle/input/spaceship-titanic/train.csv')
print(df.isna().sum())
print(df.columns)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')


**Feature Extraction and Manual Imputing**

In [3]:
def fe_and_mi(df):
    # split the PassengerID into Group and Member
    df[['Group', 'Member']] = df['PassengerId'].str.split('_', expand=True)

    # count how many members are in a passengers group
    df['TotalGroupSize'] = df.groupby('Group')['PassengerId'].transform('size')

    # if the passenger is in CryoSleep fill any empty values involving money to 0
    columns_to_update = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for column in columns_to_update:
        df.loc[df['CryoSleep'] & df[column].isna(), column] = 0
        
    # create a total expense per passenger
    df['TotalExpense'] = df[columns_to_update].sum(axis=1)

    # Split 'Name' into 'LastName' and 'FirstName' only if 'Name' is not empty
    df.loc[df['Name'].notna(), 'FirstName'] = df.loc[df['Name'].notna(), 'Name'].str.split(' ', expand=True)[0]
    df.loc[df['Name'].notna(), 'LastName'] = df.loc[df['Name'].notna(), 'Name'].str.split(' ', expand=True)[1]

    # Assign a gender to the passengers based on their first name
    df['Gender'] = df['Name'].str.split(' ', expand=True)[0].apply(lambda x: gd.get_gender(x))

    # split the Cabin into Deck Room and Side
    df[['Deck', 'Room', 'Side']] = df['Cabin'].str.split('/', expand=True)

    df = df.drop(['PassengerId', 'Cabin', 'Name'], axis=1)
    
    return df

df = fe_and_mi(df)

In [608]:
df.head(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,Group,Member,TotalGroupSize,TotalExpense,FirstName,LastName,Gender,Deck,Room,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,...,1,1,1,0.0,Maham,Ofracculy,unknown,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,...,2,1,1,736.0,Juanna,Vines,unknown,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,...,3,1,2,10383.0,Altark,Susent,unknown,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,...,3,2,2,5176.0,Solam,Susent,unknown,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,...,4,1,1,1091.0,Willy,Santantines,male,F,1,S


In [4]:
rows_with_missing_data = df.isna().any(axis=1).sum()
print(f"{rows_with_missing_data} of {len(df)} rows have missing data which represents {rows_with_missing_data / len(df) * 100}%")

df_cleaned = df.dropna()
df_cleaned.dtypes

1786 of 8693 rows have missing data which represents 20.545266306223397%


HomePlanet         object
CryoSleep          object
Destination        object
Age               float64
VIP                object
RoomService       float64
FoodCourt         float64
ShoppingMall      float64
Spa               float64
VRDeck            float64
Transported          bool
Group              object
Member             object
TotalGroupSize      int64
TotalExpense      float64
FirstName          object
LastName           object
Gender             object
Deck               object
Room               object
Side               object
dtype: object

**Setup the columns so that we can setup the proper types and also let catboost know which columns are categorical**

In [610]:
categorical_columns = ['HomePlanet', 'Destination', 'FirstName', 'LastName', 'Deck', 'Side', 'Member', 'Group', 'Gender']
bool_columns = ['CryoSleep', 'VIP', 'Transported']
numerical_columns = ['Age', 'TotalGroupSize', 'Room', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalExpense']

## Training

**Set up the Training Data**

In [611]:
X = df_cleaned.copy()

scaler = StandardScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

X = X[categorical_columns + numerical_columns + bool_columns]
X[categorical_columns] = X[categorical_columns].astype('category')
X[bool_columns] = X[bool_columns].astype('bool')
X[numerical_columns] = X[numerical_columns].astype(int)

y = X['Transported']
y = y.astype(int)
X = X.drop('Transported', axis=1)

X.head()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = Pool(data=X_train, label=y_train, cat_features=categorical_columns)
test_data = Pool(data=X_test, label=y_test, cat_features=categorical_columns)

**Baseline CatBoostClassifier**

In [612]:
model = CatBoostClassifier()

model.fit(train_data, eval_set=test_data)

preds_class = model.predict(test_data)
preds_proba = model.predict(test_data, prediction_type='Probability')
print(preds_class.shape)
preds_class = [x == 'True' for x in preds_class]
acc = accuracy_score(y_test, preds_class)
print(f"Accuracy: {acc}")

Learning rate set to 0.048335
0:	learn: 0.6705243	test: 0.6734824	best: 0.6734824 (0)	total: 31.4ms	remaining: 31.4s
1:	learn: 0.6453734	test: 0.6511283	best: 0.6511283 (1)	total: 41.6ms	remaining: 20.7s
2:	learn: 0.6255948	test: 0.6341257	best: 0.6341257 (2)	total: 53.4ms	remaining: 17.7s
3:	learn: 0.6153402	test: 0.6243788	best: 0.6243788 (3)	total: 59.3ms	remaining: 14.8s
4:	learn: 0.6018122	test: 0.6119434	best: 0.6119434 (4)	total: 65.3ms	remaining: 13s
5:	learn: 0.5890378	test: 0.6000435	best: 0.6000435 (5)	total: 71.7ms	remaining: 11.9s
6:	learn: 0.5789354	test: 0.5912337	best: 0.5912337 (6)	total: 77.9ms	remaining: 11.1s
7:	learn: 0.5659068	test: 0.5792169	best: 0.5792169 (7)	total: 83.5ms	remaining: 10.4s
8:	learn: 0.5576788	test: 0.5711400	best: 0.5711400 (8)	total: 89.4ms	remaining: 9.85s
9:	learn: 0.5494645	test: 0.5638017	best: 0.5638017 (9)	total: 96.1ms	remaining: 9.51s
10:	learn: 0.5417246	test: 0.5568155	best: 0.5568155 (10)	total: 102ms	remaining: 9.17s
11:	learn: 0.5

**HyperOPT**

In [613]:
def objective(params):
    params = {
        'learning_rate': params['learning_rate'],
        'depth': int(params['depth']),
        'iterations': int(params['iterations']),
        'l2_leaf_reg': params['l2_leaf_reg']
    }
    
    clf = CatBoostClassifier(cat_features=categorical_columns, **params, logging_level='Verbose')
    
    score = cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()
    return {'loss': -score, 'status': STATUS_OK}

space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'depth': hp.quniform('depth', 4, 12, 1),
    'iterations': hp.quniform('iterations', 100, 1000, 50),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 9)
}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10,
            trials=trials)

print(f"Best hyperparameters:\n {best}")

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]0:	learn: 0.6309948	total: 69.3ms	remaining: 51.9s
1:	learn: 0.5962559	total: 96.4ms	remaining: 36.1s
2:	learn: 0.5685330	total: 119ms	remaining: 29.7s
3:	learn: 0.5436451	total: 135ms	remaining: 25.2s
4:	learn: 0.5297147	total: 141ms	remaining: 20.9s
5:	learn: 0.5130007	total: 152ms	remaining: 18.8s
6:	learn: 0.5045016	total: 158ms	remaining: 16.8s
7:	learn: 0.4913318	total: 170ms	remaining: 15.8s
8:	learn: 0.4780485	total: 181ms	remaining: 14.9s
9:	learn: 0.4740430	total: 186ms	remaining: 13.8s
10:	learn: 0.4662721	total: 196ms	remaining: 13.2s
11:	learn: 0.4574618	total: 205ms	remaining: 12.6s
12:	learn: 0.4517158	total: 215ms	remaining: 12.2s
13:	learn: 0.4456755	total: 225ms	remaining: 11.8s
14:	learn: 0.4399654	total: 243ms	remaining: 11.9s
15:	learn: 0.4361739	total: 258ms	remaining: 11.8s
16:	learn: 0.4336306	total: 271ms	remaining: 11.7s
17:	learn: 0.4301314	total: 284ms	remaining: 11.6s
18:	learn: 0.4270542	total: 296ms	re

In [614]:
hyperopt_catboost_model = CatBoostClassifier(**best, cat_features=categorical_columns, verbose=2)  # verbose=0 suppresses the output during training
hyperopt_catboost_model.fit(X_train, y_train)

# Predict on test set
y_pred_hyperopt_catboost = hyperopt_catboost_model.predict(X_test)

# Calculate accuracy
as_hyperopt_catboost = accuracy_score(y_test, y_pred_hyperopt_catboost)

print(f"Accuracy Score: {as_hyperopt_catboost}")

0:	learn: 0.6729667	total: 8.7ms	remaining: 8.25s
2:	learn: 0.6391286	total: 18.8ms	remaining: 5.92s
4:	learn: 0.6145353	total: 26.4ms	remaining: 5s
6:	learn: 0.5964982	total: 32.4ms	remaining: 4.37s
8:	learn: 0.5777721	total: 38.5ms	remaining: 4.03s
10:	learn: 0.5644981	total: 46.1ms	remaining: 3.94s
12:	learn: 0.5545440	total: 53.5ms	remaining: 3.85s
14:	learn: 0.5441042	total: 61ms	remaining: 3.8s
16:	learn: 0.5353286	total: 68ms	remaining: 3.73s
18:	learn: 0.5277612	total: 75.6ms	remaining: 3.7s
20:	learn: 0.5218610	total: 83.1ms	remaining: 3.68s
22:	learn: 0.5159654	total: 92ms	remaining: 3.71s
24:	learn: 0.5111178	total: 99.8ms	remaining: 3.69s
26:	learn: 0.5071157	total: 107ms	remaining: 3.66s
28:	learn: 0.5029867	total: 114ms	remaining: 3.62s
30:	learn: 0.4996350	total: 123ms	remaining: 3.64s
32:	learn: 0.4968659	total: 129ms	remaining: 3.59s
34:	learn: 0.4935165	total: 137ms	remaining: 3.57s
36:	learn: 0.4911587	total: 143ms	remaining: 3.54s
38:	learn: 0.4885522	total: 150ms	r

**Ensemble with a logistic regressor**

Search for optimal hyper params

In [615]:
# Encode the categorical features in X_train
encoder_dict = {}
def encode(df):
    encoder = LabelEncoder()
    for col in categorical_columns:
        df[col] = encoder.fit_transform(df[col])
        encoder_dict[col] = encoder
    return df

encoded_X_train = encode(X_train.copy())
encoded_X_test = encode(X_test.copy())

# Step 1: Define the hyperparameters to tune
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 5, 10],
    'max_leaf_nodes': [None, 10, 20, 50],
    'l2_regularization': [0.0, 0.01, 0.001],
    'random_state': [42],  # Set the random state for reproducibility
}

# Step 2: Create the HistGradientBoostingClassifier
hist_gradient_boosting = HistGradientBoostingClassifier()

# Step 3: Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(hist_gradient_boosting, param_grid, cv=5, scoring='accuracy')

# Step 4: Make predictions using the hyperopt_catboost_model on X_test
cb_pred = hyperopt_catboost_model.predict_proba(X_train)[:, 1]
cb_pred_test = hyperopt_catboost_model.predict_proba(X_test)[:, 1]

# Step 5: Create the stacking_data_train with CatBoost_Predictions
stacking_data_train = pd.DataFrame({
    'CatBoost_Prediction': cb_pred,
    **encoded_X_train,  # Use the encoded_X_test as you already did in Block 1
    'Age': X_train['Age'],
    'TotalGroupSize': X_train['TotalGroupSize'],
    'RoomService': X_train['RoomService'],
    'FoodCourt': X_train['FoodCourt'],
    'ShoppingMall': X_train['ShoppingMall'],
    'Spa': X_train['Spa'],
    'VRDeck': X_train['VRDeck'],
    'TotalExpense': X_train['TotalExpense'],
    'CryoSleep': X_train['CryoSleep'],
    'VIP': X_train['VIP'],
})

stacking_data_test = pd.DataFrame({
    'CatBoost_Prediction': cb_pred_test,
    **encoded_X_test,
    'Age': X_test['Age'],
    'TotalGroupSize': X_test['TotalGroupSize'],
    'RoomService': X_test['RoomService'],
    'FoodCourt': X_test['FoodCourt'],
    'ShoppingMall': X_test['ShoppingMall'],
    'Spa': X_test['Spa'],
    'VRDeck': X_test['VRDeck'],
    'TotalExpense': X_test['TotalExpense'],
    'CryoSleep': X_test['CryoSleep'],
    'VIP': X_test['VIP'],
})

# Step 6: Fit the GridSearchCV to your training data and labels
grid_search.fit(stacking_data_train, y_train)  # Use X_train and y_train here

# Step 7: Access the best hyperparameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Step 8: Make predictions using the best model on X_test
ensemble_predictions = best_model.predict(stacking_data_test)  # Use X_test here
as_catboost_ensemble = accuracy_score(y_test, ensemble_predictions)  # Evaluate on the training data

print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy: {as_catboost_ensemble}")

Best Hyperparameters: {'l2_regularization': 0.0, 'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 200, 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'random_state': 42}
Accuracy: 0.7293777134587555


Build Ensemble Model

In [616]:
# Predict probabilities on X_test
cb_pred = hyperopt_catboost_model.predict_proba(X_test)[:, 1]
print(cb_pred)

# Encode the categorical features in X_test
encoded_X_test = X_test.copy()
encoded_X_test = encode(encoded_X_test)        

# Stack the validation data
stacking_data_val = pd.DataFrame({
    'CatBoost_Prediction': cb_pred,
    **encoded_X_test,
    'Age': X_test['Age'],
    'TotalGroupSize': X_test['TotalGroupSize'],
    'RoomService': X_test['RoomService'],
    'FoodCourt': X_test['FoodCourt'],
    'ShoppingMall': X_test['ShoppingMall'],
    'Spa': X_test['Spa'],
    'VRDeck': X_test['VRDeck'],
    'TotalExpense': X_test['TotalExpense'],
    'CryoSleep': X_test['CryoSleep'],
    'VIP': X_test['VIP'],
})

# Train the meta model on the training set
stacking_data_train = pd.DataFrame({
    'CatBoost_Prediction': hyperopt_catboost_model.predict_proba(X_train)[:, 1],
    **encoded_X_train,
    'Age': X_train['Age'],
    'TotalGroupSize': X_train['TotalGroupSize'],
    'RoomService': X_train['RoomService'],
    'FoodCourt': X_train['FoodCourt'],
    'ShoppingMall': X_train['ShoppingMall'],
    'Spa': X_train['Spa'],
    'VRDeck': X_train['VRDeck'],
    'TotalExpense': X_train['TotalExpense'],
    'CryoSleep': X_train['CryoSleep'],
    'VIP': X_train['VIP'],
})

# Train the meta model
meta_model = HistGradientBoostingClassifier(**best_params)
meta_model.fit(stacking_data_train, y_train)

# Make predictions on the test data
ensemble_predictions = meta_model.predict(stacking_data_val)
as_catboost_ensemble = accuracy_score(y_test, ensemble_predictions)

print(f"Accuracy Score: {as_catboost_ensemble}")

stacking_data_val.head()

[0.69828223 0.77059535 0.22943361 ... 0.96250093 0.22001337 0.90438896]
Accuracy Score: 0.7293777134587555


Unnamed: 0,CatBoost_Prediction,HomePlanet,Destination,FirstName,LastName,Deck,Side,Member,Group,Gender,...,TotalGroupSize,Room,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalExpense,CryoSleep,VIP
3132,0.698282,0,2,739,779,6,0,1,459,5,...,0,0,0,0,0,0,0,0,True,False
1276,0.770595,2,2,315,889,5,1,0,181,5,...,1,0,0,0,0,0,0,0,False,False
4943,0.229434,0,2,738,296,4,1,0,728,5,...,0,0,0,0,0,0,0,0,False,False
7215,0.065323,0,1,840,882,5,0,0,1044,5,...,0,1,3,0,0,0,0,0,False,False
6936,0.204379,0,0,639,588,5,0,0,999,2,...,0,1,0,0,0,0,0,0,False,False


## Prepare to run the prediction set

**Load the values that we will predict for**

In [617]:
df_predict = pd.read_csv('kaggle/input/spaceship-titanic/test.csv')
df_predict.isna().sum()

df_pid = pd.DataFrame()
df_pid['PassengerId'] = df_predict['PassengerId']

**Do an initial transformation of the data to create features and impute data where we can**

In [618]:
df_predict = fe_and_mi(df_predict)
df_predict.isna().sum()

HomePlanet         87
CryoSleep          93
Destination        92
Age                91
VIP                93
RoomService        57
FoodCourt          67
ShoppingMall       63
Spa                59
VRDeck             51
Group               0
Member              0
TotalGroupSize      0
TotalExpense        0
FirstName          94
LastName           94
Gender              0
Deck              100
Room              100
Side              100
dtype: int64

**Impute with KNNImputer**

In [619]:
def impute_data(df, categorical_columns, numerical_columns, bool_columns):
    # Initialize the imputer for numerical columns
    numerical_imputer = KNNImputer(n_neighbors=5)

    # Impute missing values in the numerical part
    df_numerical_imputed = pd.DataFrame(numerical_imputer.fit_transform(df[numerical_columns]), columns=numerical_columns)

    # Initialize the imputer for categorical columns
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    # Impute missing values in the categorical part
    df_categorical_imputed = pd.DataFrame(categorical_imputer.fit_transform(df[categorical_columns]), columns=categorical_columns)

    # Combine the imputed categorical part with the imputed numeric part
    df_imputed = pd.concat([df_categorical_imputed, df_numerical_imputed, df[bool_columns]], axis=1)

    # Convert 'CryoSleep' and 'VIP' back to boolean datatype
    for col in bool_columns:
        df_imputed[col] = df_imputed[col].astype(bool)

    return df_imputed

# ... (rest of your code) ...

# remove 'Transported' column from bool_columns as it is not in our test set
if 'Transported' in bool_columns:
    bool_columns.remove('Transported')

# Call the knnimpute_data function with the correct columns
df_predict = impute_data(df_predict, categorical_columns, numerical_columns, bool_columns)

print(df_predict.isna().sum())
df_predict.head()

HomePlanet        0
Destination       0
FirstName         0
LastName          0
Deck              0
Side              0
Member            0
Group             0
Gender            0
Age               0
TotalGroupSize    0
Room              0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
TotalExpense      0
CryoSleep         0
VIP               0
dtype: int64


Unnamed: 0,HomePlanet,Destination,FirstName,LastName,Deck,Side,Member,Group,Gender,Age,TotalGroupSize,Room,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalExpense,CryoSleep,VIP
0,Earth,TRAPPIST-1e,Nelly,Carsoning,G,S,1,13,female,27.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False
1,Earth,TRAPPIST-1e,Lerome,Peckers,F,S,1,18,unknown,19.0,1.0,4.0,0.0,9.0,0.0,2823.0,0.0,2832.0,False,False
2,Europa,55 Cancri e,Sabih,Unhearfus,C,S,1,19,unknown,31.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False
3,Europa,TRAPPIST-1e,Meratz,Caltilter,C,S,1,21,unknown,38.0,1.0,1.0,0.0,6652.0,0.0,181.0,585.0,7418.0,False,False
4,Earth,TRAPPIST-1e,Brence,Harperez,F,S,1,23,unknown,20.0,1.0,5.0,10.0,0.0,635.0,0.0,0.0,645.0,False,False


In [620]:
X_pred = df_predict.copy()

scaler = StandardScaler()
X_pred[numerical_columns] = scaler.fit_transform(X_pred[numerical_columns])

X_pred = X_pred[categorical_columns + numerical_columns + bool_columns]
X_pred[categorical_columns] = X_pred[categorical_columns].astype('category')
X_pred[bool_columns] = X_pred[bool_columns].astype('bool')
X_pred[numerical_columns] = X_pred[numerical_columns].astype(int)

X_pred.isna().sum()

HomePlanet        0
Destination       0
FirstName         0
LastName          0
Deck              0
Side              0
Member            0
Group             0
Gender            0
Age               0
TotalGroupSize    0
Room              0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
TotalExpense      0
CryoSleep         0
VIP               0
dtype: int64

**Ensemble Model**

In [621]:
cb_pred = hyperopt_catboost_model.predict_proba(X_pred)[:, 1]
print(cb_pred)

encoded_X_pred = X_pred.copy()

encoder_dict = {}
def encode(df):
    encoder = LabelEncoder()
    for col in categorical_columns:
        df[col] = encoder.fit_transform(df[col])
        encoder_dict[col] = encoder
    return df

encoded_X_pred = encode(encoded_X_pred)

print(encoded_X_pred.shape)
print(X_pred.shape)
print(encoded_X_pred.columns)

stacking_data_val = pd.DataFrame({
    'CatBoost_Prediction': cb_pred,
    **encoded_X_pred,
    'Age': X_pred['Age'],
    'TotalGroupSize': X_pred['TotalGroupSize'],
    'RoomService': X_pred['RoomService'],
    'FoodCourt': X_pred['FoodCourt'],
    'ShoppingMall': X_pred['ShoppingMall'],
    'Spa': X_pred['Spa'],
    'VRDeck': X_pred['VRDeck'],
    'TotalExpense': X_pred['TotalExpense'],
    'CryoSleep': X_pred['CryoSleep'],
    'VIP': X_pred['VIP'],
})

print(stacking_data_val.shape)

# make predictions on the test data
ensemble_predictions = meta_model.predict(stacking_data_val).astype(bool)

print(ensemble_predictions)


[0.70638639 0.01599904 0.9922669  ... 0.91970351 0.70860449 0.74633618]
(4277, 20)
(4277, 20)
Index(['HomePlanet', 'Destination', 'FirstName', 'LastName', 'Deck', 'Side',
       'Member', 'Group', 'Gender', 'Age', 'TotalGroupSize', 'Room',
       'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'TotalExpense', 'CryoSleep', 'VIP'],
      dtype='object')
(4277, 21)
[ True False  True ...  True  True  True]


In [622]:
sub_df = pd.DataFrame()
sub_df['PassengerId'] = df_pid['PassengerId']
sub_df['Transported'] = ensemble_predictions
sub_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [623]:
sub_df.to_csv('submission.csv', index=False)