# Data Processing

In [1]:
import pandas as pd
import numpy as np

# laoding the data
data = pd.read_csv("./data/cleaned/cleaned_data.csv")

In [2]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
X = data.drop(["Transported", "Name"], axis=1)
y = data["Transported"].astype(np.int8)

In [4]:
X["PassengerId"].describe()

count        8693
unique       8693
top       6494_01
freq            1
Name: PassengerId, dtype: object

In [5]:
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [6]:
passenger_id = X["PassengerId"]

X["PassengerGroup"] = passenger_id.apply(lambda x: int(x[:4]))

X = X.drop("PassengerId", axis=1)

In [7]:
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,2
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,3
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,3
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,4


In [281]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC

# identifying the categorical and numeric features
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "PassengerGroup"]
categorical_features = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]

# transformer to scale the numeric features
numeric_transformer = StandardScaler()

# encoder for the categorical features
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)

# a colum transformer to apply the transformers to the categorical and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# combining the proprocessing transformers with the classifier
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        ("classifier", GradientBoostingClassifier(max_depth=3, max_features=3)),
    ]
)


In [294]:
from sklearn.model_selection import train_test_split

# creating the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [295]:
# training the model
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                                   'Spa', 'VRDeck',
                                                   'PassengerGroup']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-999),
                                                  ['HomePlanet', 'CryoSleep',
                                                   'Cabin', 'Destination',
                                                   'VIP'])])),
                ('scaler', StandardScaler()),
                ('classifier', GradientBoostingClassifier(max_features=3))])

In [296]:
print("Score on train: ", model.score(X_train, y_train))
print("Score on test: ", model.score(X_test, y_test))

Score on train:  0.8088869715271786
Score on test:  0.7998849913743531


In [297]:
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    "classifier__max_depth": [1, 2, 3, 4],
    "classifier__max_features": [1, 2, 3, 4],
}

grid_search = GridSearchCV(model, param_grid=parameter_grid, cv=5, error_score="raise")

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'RoomService',
                                                                          'FoodCourt',
                                                                          'ShoppingMall',
                                                                          'Spa',
                                                                          'VRDeck',
                                                                          'PassengerGroup']),
                                                                        ('cat',
                                               

In [298]:
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,param_classifier__max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.14282,0.011434,0.013363,0.0015,1,1,"{'classifier__max_depth': 1, 'classifier__max_...",0.772106,0.755572,0.783609,0.76348,0.772662,0.769486,0.009446,16
1,0.155386,0.023595,0.012775,0.002214,1,2,"{'classifier__max_depth': 1, 'classifier__max_...",0.774263,0.779295,0.791517,0.781452,0.77554,0.780413,0.006119,12
2,0.15698,0.002063,0.012179,0.000731,1,3,"{'classifier__max_depth': 1, 'classifier__max_...",0.774263,0.774263,0.792236,0.772825,0.774101,0.777538,0.007369,14
3,0.185702,0.007776,0.012763,0.000399,1,4,"{'classifier__max_depth': 1, 'classifier__max_...",0.773544,0.766355,0.786485,0.767074,0.77482,0.773656,0.007247,15
4,0.174494,0.005008,0.013786,0.00039,2,1,"{'classifier__max_depth': 2, 'classifier__max_...",0.779295,0.769231,0.800863,0.781452,0.778417,0.781852,0.010385,8
5,0.21164,0.001475,0.013763,0.000398,2,2,"{'classifier__max_depth': 2, 'classifier__max_...",0.785047,0.774263,0.793674,0.782171,0.77554,0.782139,0.00703,7
6,0.263097,0.002783,0.014582,0.000474,2,3,"{'classifier__max_depth': 2, 'classifier__max_...",0.777858,0.778577,0.791517,0.779295,0.771942,0.779838,0.006396,13
7,0.319745,0.03104,0.014959,0.002603,2,4,"{'classifier__max_depth': 2, 'classifier__max_...",0.778577,0.775701,0.79583,0.774263,0.777698,0.780414,0.007854,11
8,0.204865,0.004526,0.013166,0.000398,3,1,"{'classifier__max_depth': 3, 'classifier__max_...",0.782171,0.762042,0.802301,0.781452,0.779856,0.781564,0.012763,9
9,0.319353,0.068148,0.016154,0.004157,3,2,"{'classifier__max_depth': 3, 'classifier__max_...",0.784328,0.783609,0.802301,0.792955,0.78705,0.790048,0.006953,1


In [299]:
grid_search.best_params_

{'classifier__max_depth': 3, 'classifier__max_features': 2}

In [300]:
print("Grid Score on Train: ", grid_search.score(X_train, y_train))
print("Grid Score on Test: ", grid_search.score(X_test, y_test))


Grid Score on Train:  0.8091745757837217
Grid Score on Test:  0.7998849913743531


# Saving the final model with the parameters from the grid search

In [301]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC

# identifying the categorical and numeric features
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "PassengerGroup"]
categorical_features = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]

# transformer to scale the numeric features
numeric_transformer = StandardScaler()

# encoder for the categorical features
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)

# a colum transformer to apply the transformers to the categorical and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# combining the proprocessing transformers with the classifier
final_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        ("classifier", GradientBoostingClassifier(max_depth=3, max_features=3)),
    ]
)


In [302]:
final_model.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                                   'Spa', 'VRDeck',
                                                   'PassengerGroup']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-999),
                                                  ['HomePlanet', 'CryoSleep',
                                                   'Cabin', 'Destination',
                                                   'VIP'])])),
                ('scaler', StandardScaler()),
                ('classifier', GradientBoostingClassifier(max_features=3))])

In [303]:
final_model.score(X_test, y_test)

0.8257619321449109

In [304]:
import pickle
import os

# saving the current model
os.makedirs("saved_models", exist_ok=True)

with open(os.path.join("saved_models", "model_1.pkl"), "wb") as f:
    pickle.dump(final_model, f)