In [1]:
# uncomment for the first run
# !pip install kaggle
# !kaggle competitions download -c spaceship-titanic
# !unzip spaceship-titanic

In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
PATH = './data/raw'

In [4]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))

In [5]:
df[:2]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Baseline:
## random forest with default parameters

In [7]:
# step 1 impute missing data
def impute(df):
    for col in df.columns:
        if df[col].dtype == 'object':
            # For object (categorical) columns, impute with the most common value (mode)
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            # For numeric columns, impute with the mean
            df[col].fillna(df[col].mean(), inplace=True)
    return df

df = impute(df)

In [8]:
# step 2: Split the data into features (X) and target (y)
X = df.drop('Transported', axis=1)
y = df['Transported']

In [9]:
# Step 3: Encoding categorical variables
# For simplicity, we'll use LabelEncoder for this example

def encode(X,encoder):
    for col in X.select_dtypes(include='object').columns:
        X[col] = label_encoder.fit_transform(X[col])
    return X

label_encoder = LabelEncoder()

X = encode(X,label_encoder)

In [10]:
# Step 4: Split the data into training and dev sets
# We will use 80% of the data for training and 20% for dev

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
# Step 5: Instantiate the Random Forest classifier
# You can set various hyperparameters of the Random Forest here
# For simplicity, we'll use default hyperparameters
random_forest = RandomForestClassifier(random_state=0)

# Step 6: Train the Random Forest classifier
random_forest.fit(X_train, y_train)

# Step 7: Make predictions on the dev data
y_pred = random_forest.predict(X_dev)

# Step 8: Evaluate the performance of the Random Forest model
accuracy = accuracy_score(y_dev, y_pred)
print("Accuracy:", accuracy)

# You can also print other metrics like classification report and confusion matrix
# Classification Report
print("\nClassification Report:")
print(classification_report(y_dev, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_dev, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Accuracy: 0.7901092581943646

Classification Report:
              precision    recall  f1-score   support

       False       0.79      0.79      0.79       863
        True       0.79      0.79      0.79       876

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739


Confusion Matrix:
[[684 179]
 [186 690]]


In [12]:
# load the test data
X_test = pd.read_csv(os.path.join(PATH,'test.csv'))

# Step 1: handle missing values the same way as we did for the training+dev set
X_test = impute(X_test)

# Step 2: Encode categorical variables, same encoder as for training dev splits
X_test = encode(X_test, label_encoder)

In [13]:

# Step 3: Make predictions on the unseen data using the trained model
y_pred = random_forest.predict(X_test)

# set path for submission data
# PATH_SUB = './data'
# log the prediction in the submission file
df_submission = pd.read_csv(os.path.join(PATH,'sample_submission.csv'))
df_submission['Transported'] = y_pred
df_submission.to_csv('./data/submission_rfbase.csv', index=False)

In [14]:
# !kaggle competitions submit -c spaceship-titanic -f sample_submission.csv -m "random forest baseline"
# !kaggle competitions submissions -c spaceship-titanic

## hpyerparameter search

In [15]:
# only done random search. can do grid search as well by commenting out the relevant lines

# # Step 1: Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Instantiate the Random Forest classifier
# random_forest = RandomForestClassifier(random_state=42)

# Step 3: Define the hyperparameter grid for Grid Search or Randomized Search
param_grid = {
    'n_estimators': [100, 200, 300],                # Number of trees in the forest
    'max_depth': [None, 10, 20],                     # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],                 # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],                   # Minimum samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2', None],  # Number of features to consider at each split
    # You can add more hyperparameters to the grid based on your requirement
}

# # Step 4: Perform Grid Search
# grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Step 5: Get the best hyperparameters and corresponding model
# best_params_grid = grid_search.best_params_
# best_rf_model_grid = grid_search.best_estimator_

# Step 6: Perform Randomized Search
randomized_search = RandomizedSearchCV(random_forest, param_distributions=param_grid, n_iter=10, cv=5, n_jobs=-1)
randomized_search.fit(X_train, y_train)

# Step 7: Get the best hyperparameters and corresponding model from Randomized Search
best_params_random = randomized_search.best_params_
best_rf_model_random = randomized_search.best_estimator_

# Step 8: Evaluate the best models on the test data
# accuracy_grid = best_rf_model_grid.score(X_dev, y_dev)
accuracy_random = best_rf_model_random.score(X_dev, y_dev)

# print("Best Hyperparameters from Grid Search:", best_params_grid)
# print("Best Model Accuracy from Grid Search:", accuracy_grid)

print("Best Hyperparameters from Randomized Search:", best_params_random)
print("Best Model Accuracy from Randomized Search:", accuracy_random)


5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/home/studio-lab-user/.conda/envs/scipy/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/studio-lab-user/.conda/envs/scipy/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/studio-lab-user/.conda/envs/scipy/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/studio-lab-user/.conda/envs/scipy/lib/python3.11/site-packages/skle

Best Hyperparameters from Randomized Search: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}
Best Model Accuracy from Randomized Search: 0.79700977573318


In [16]:
# inference on the test data (already processed earlier)
y_pred = random_forest.predict(X_test)

# log predictions in the submision file
df_submission =  pd.read_csv(os.path.join(PATH,'sample_submission.csv'))
df_submission['Transported'] = y_pred
df_submission.to_csv('./data/submission_rftuned.csv', index=False)

In [17]:
# !kaggle competitions submit -c spaceship-titanic -f './data/submission_rftuned.csv' -m "random forest parameter tuned"
# !kaggle competitions submissions -c spaceship-titanic

# Save the model

In [18]:
import joblib

In [19]:
# Save the learned parameters and the Random Forest estimator using joblib
joblib.dump(random_forest, './models/rf_tuned.pkl')
# Load the saved Random Forest model
rf_tuned = joblib.load('./models/rf_tuned.pkl')
# Use the loaded model for predictions
y_pred_reload = rf_tuned.predict(X_test)

all(y_pred_reload == y_pred)

True