In [1]:
# uncomment for the first run
# !pip install kaggle
# !kaggle competitions download -c spaceship-titanic
# !unzip space*

In [2]:
# preprocess
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# model
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# eval
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Read the labelled training set; train.csv is expected in the working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Peek at the first two rows to see what the raw columns look like.
df.head(2)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [5]:
# Print the per-column non-null counts and dtypes of the training frame;
# columns with fewer non-null entries than rows are the ones that need imputing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Boosting:

In [6]:
# Step 1: Impute missing values.
# The filled column is assigned back to `df` rather than using
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series, is
# deprecated, and leaves `df` untouched when pandas Copy-on-Write is active.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        # Numeric (and boolean) columns: impute with the column mean
        # (median would be a reasonable alternative).
        df[col] = df[col].fillna(df[col].mean())
    else:
        # Categorical / text columns: impute with the most common value (mode).
        df[col] = df[col].fillna(df[col].mode()[0])
# NOTE(review): the imputation statistics are computed on the full frame, i.e.
# before the train/dev split below, so the dev rows contribute to them.

# Step 2: Split the data into features (X) and target (y).
# The boolean target is cast to int for the gradient boosting model.
X = df.drop('Transported', axis=1)
y = df['Transported'].astype(int)

# Step 3: Encode the non-numeric feature columns as integer codes.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`, so
# each column's fitted mapping survives the loop and can be reused later.
# (LabelEncoder is used for simplicity; OneHotEncoder is usually the better
# choice for nominal categories.)
# NOTE(review): every non-numeric column is encoded, including identifier-like
# ones such as PassengerId and Name - confirm they are wanted as features.
label_encoder = LabelEncoder()
label_encoders = {}
for col in X.select_dtypes(exclude=['number', 'bool']).columns:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

# Step 4: Split the data into training and dev sets (80% train / 20% dev).
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Steps 5-6: Create a gradient boosting classifier with default hyperparameters
# (only the seed is pinned) and train it on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
gbc = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

# Step 7: Predict labels for the held-out dev split
y_pred = gbc.predict(X_dev)

# Step 8: Evaluate the gradient boosting baseline on the dev split
accuracy = accuracy_score(y_true=y_dev, y_pred=y_pred)
print("Accuracy:", accuracy)

# Further diagnostics: per-class precision/recall/F1 and the confusion matrix
conf_matrix = confusion_matrix(y_true=y_dev, y_pred=y_pred)
for heading, details in (
    ("\nClassification Report:", classification_report(y_true=y_dev, y_pred=y_pred)),
    ("\nConfusion Matrix:", conf_matrix),
):
    print(heading)
    print(details)


Accuracy: 0.7929844738355377

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.75      0.78       861
           1       0.77      0.83      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739


Confusion Matrix:
[[647 214]
 [146 732]]


In [8]:
# load the test data
X_test = pd.read_csv('test.csv')

# Step 1: Impute missing values with statistics taken from the TRAINING data
# (`df`), so the test set is filled with the same values the model saw during
# training instead of its own means/modes. The result is assigned back rather
# than using `fillna(..., inplace=True)` on a column selection, which does not
# modify the frame when pandas Copy-on-Write is active.
for col in X_test.columns:
    if not X_test[col].isna().any():
        continue  # nothing to fill; skip computing statistics
    train_col = df[col].dropna()
    if pd.api.types.is_numeric_dtype(X_test[col]):
        fill_value = train_col.mean()
    else:
        fill_value = train_col.mode()[0]
    X_test[col] = X_test[col].fillna(fill_value)

# Step 2: Encode the non-numeric columns with the TRAINING vocabulary.
# Re-fitting an encoder on the test data would hand out integer codes that do
# not match the ones the model was trained on. A LabelEncoder's codes are the
# positions in its sorted `classes_`, so `searchsorted` reproduces the training
# code for every value seen in training. A value never seen in training gets
# the position it would occupy in that sorted vocabulary, which keeps it
# aligned with the ordering the model learned instead of raising an error.
# NOTE(review): identifier-like columns (PassengerId, Name) are presumably
# almost entirely unseen here and carry little real signal - consider dropping
# them from the features for both training and inference.
for col in X_test.select_dtypes(exclude=['number', 'bool']).columns:
    train_classes = LabelEncoder().fit(df[col].dropna()).classes_
    X_test[col] = train_classes.searchsorted(X_test[col].to_numpy(dtype=object))

# Step 3: Make predictions on the unseen data using the trained baseline model;
# the submission format expects booleans.
y_pred = gbc.predict(X_test).astype(bool)

# log the prediction in the submission file
# (relies on sample_submission.csv listing passengers in the same order as test.csv)
df_submission = pd.read_csv('sample_submission.csv')
df_submission['Transported'] = y_pred
df_submission.to_csv('submission_gbbase.csv', index=False)

In [17]:
# !kaggle competitions submit -c spaceship-titanic -f submission_gbbase.csv -m "gradient boosting baseline"
# !kaggle competitions submissions -c spaceship-titanic

## Hyperparameter search

In [10]:
# Only the randomized search is run here. An exhaustive grid search over the
# same grid can be enabled by uncommenting the grid-search lines below.

# Step 1: Define the hyperparameter grid shared by Grid Search and Randomized Search
param_grid = {
    'n_estimators': [100, 200, 300],             # Number of boosting stages to be run
    'learning_rate': [0.01, 0.1, 0.2],           # Step size shrinkage used in updating weights
    'max_depth': [3, 5, 7],                      # Maximum depth of the individual trees
    'min_samples_split': [2, 5, 10],             # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],               # Minimum samples required to be at a leaf node
    # You can add more hyperparameters to the grid based on your requirement
}

# # Step 2 (optional): Perform Grid Search
# grid_search = GridSearchCV(gbc, param_grid, cv=5, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Step 3 (optional): Get the best hyperparameters and corresponding model
# best_params_grid = grid_search.best_params_
# best_gbc_grid = grid_search.best_estimator_

# Step 4: Perform Randomized Search (10 sampled candidates, 5-fold CV).
# `random_state` pins which candidates are sampled so that re-running the
# notebook selects the same hyperparameters.
randomized_search = RandomizedSearchCV(
    gbc,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
randomized_search.fit(X_train, y_train)

# Step 5: Get the best hyperparameters and the refitted best model
best_params_random = randomized_search.best_params_
best_gbc_random = randomized_search.best_estimator_

# Step 6: Evaluate the best model on the held-out dev split
# accuracy_grid = best_gbc_grid.score(X_dev, y_dev)
accuracy_random = best_gbc_random.score(X_dev, y_dev)

# print("Best Hyperparameters from Grid Search:", best_params_grid)
# print("Best Model Accuracy from Grid Search:", accuracy_grid)

print("Best Hyperparameters from Randomized Search:", best_params_random)
print("Best Model Accuracy from Randomized Search:", accuracy_random)


Best Hyperparameters from Randomized Search: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 3, 'learning_rate': 0.2}
Best Model Accuracy from Randomized Search: 0.7981598619896493


In [11]:
# Predict on the already-preprocessed test features with the tuned model;
# the submission format expects booleans.
y_pred = best_gbc_random.predict(X_test).astype(bool)

# Write the predictions into the sample submission layout and save it.
df_submission = pd.read_csv('sample_submission.csv').assign(Transported=y_pred)
df_submission.to_csv('submission_gbtuned.csv', index=False)

In [13]:
# !kaggle competitions submit -c spaceship-titanic -f submission_gbtuned.csv -m "gradient boosting tuned"
# !kaggle competitions submissions -c spaceship-titanic

# Save the model

In [15]:
import joblib

In [16]:
# Persist the TUNED estimator. The file is named 'gbc_tuned.pkl', so it must
# hold `best_gbc_random` (the randomized-search winner), not the baseline `gbc`.
joblib.dump(best_gbc_random, 'gbc_tuned.pkl')

# Load the model back from disk.
# NOTE: joblib files are pickle-based - only load files from a trusted source.
gbc_load = joblib.load('gbc_tuned.pkl')

# Use the loaded model for predictions
y_pred_reload = gbc_load.predict(X_test)

# Round-trip sanity check: the reloaded model must predict exactly like the
# in-memory tuned model.
assert (y_pred_reload == best_gbc_random.predict(X_test)).all(), \
    "reloaded model predictions differ from the in-memory tuned model"