In [1]:
# uncomment for the first run
# !pip install kaggle
# !kaggle competitions download -c spaceship-titanic
# !unzip space*

In [6]:
import numpy as np
# preprocess
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# model
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# eval
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, make_scorer

In [7]:
df = pd.read_csv('train.csv')

In [8]:
df[:2]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# XGBoost

In [10]:
# Replace 'your_dataframe.csv' with the actual file path or use your DataFrame directly if it's already loaded.
df = pd.read_csv('train.csv')

# Drop the 'PassengerId' and 'Name' columns as they are unlikely to have predictive value.
df.drop(['PassengerId', 'Name'], axis=1, inplace=True)


# Handling missing values (you can choose other strategies based on your dataset).
# df.fillna(0, inplace=True)
for col in df.columns:
    if df[col].dtype == 'object':
        # For object (categorical) columns, impute with the most common value (mode)
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        # For numeric columns, impute with the mean (you can choose other methods like median as well)
        df[col].fillna(df[col].mean(), inplace=True)


# Convert 'CryoSleep', 'VIP', and 'Transported' columns to numeric (1 for True, 0 for False).
df['CryoSleep'] = df['CryoSleep'].map({'True': 1, 'False': 0})
df['VIP'] = df['VIP'].map({'True': 1, 'False': 0})
df['Transported'] = df['Transported'].astype(int)

# Extract categorical columns and encode them using one-hot encoding.
categorical_columns = ['HomePlanet', 'Cabin', 'Destination']
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_categorical = encoder.fit_transform(df[categorical_columns])
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_columns))


# Drop the original categorical columns and concatenate the encoded ones.
df.drop(categorical_columns, axis=1, inplace=True)
df = pd.concat([df, encoded_categorical_df], axis=1)


# Split the data into features (X) and target (y).
X = df.drop('Transported', axis=1)
y = df['Transported']


# Split the data into training and testing sets.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Convert data to DMatrix format for XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train)

# Set the hyperparameters for the XGBoost model.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 3,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
    # 'tree_method': 'gpu_hist'
}

# Train the XGBoost model.
num_rounds = 150
model = xgb.train(params, dtrain, num_rounds)


In [14]:
# Predict using the test data.
dval = xgb.DMatrix(X_val)
y_pred = model.predict(dval)

# Convert the probabilities to binary predictions (0 or 1).
y_pred_binary = np.round(y_pred)

# Calculate the accuracy of the model.
accuracy = accuracy_score(y_val, y_pred_binary)
print(f'Accuracy: {accuracy:.5f}')

Accuracy: 0.77746


In [14]:
import joblib

In [17]:
# Save the learned parameters and the Random Forest estimator using joblib
joblib.dump(model, 'xgb_base.pkl')
# Load the saved Random Forest model
xgb_reload = joblib.load('xgb_base.pkl')
# Use the loaded model for predictions


y_pred_reload = xgb_reload.predict(dval)

all(y_pred_reload == y_pred)

True

# Tuning parameters

In [15]:
# Define the parameter search space.
param_space = {
    'max_depth': np.arange(3, 10),
    'learning_rate': np.logspace(-3, 0, 100),
    'subsample': np.linspace(0.6, 0.9, 4),
    'colsample_bytree': np.linspace(0.6, 0.9, 4)
}



xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', seed=42)

# Perform randomized search with cross-validation.
random_search = RandomizedSearchCV(
    xgb_classifier,
    param_distributions=param_space,
    n_iter=5,  # Number of random parameter combinations to try
    scoring=make_scorer(accuracy_score),  # Use accuracy as the evaluation metric
    cv=5,  # 5-fold cross-validation
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Perform the search on the training data.
random_search.fit(X_train, y_train)

# Get the best hyperparameters found by the search.
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model and evaluate it on the test data.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Test Accuracy: {accuracy:.2f}')


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Hyperparameters: {'subsample': 0.8, 'max_depth': 7, 'learning_rate': 0.06135907273413173, 'colsample_bytree': 0.8}
Test Accuracy: 0.78


Took forever to run 25 fits. Need speedup! 