# Data Processing

In [1]:
import pandas as pd
import numpy as np

# laoding the data
data = pd.read_csv("./data/cleaned/cleaned_data.csv")

In [2]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
X = data.drop(["Transported", "Name"], axis=1)
y = data["Transported"].astype(np.int8)

In [4]:
X["PassengerId"].describe()

count        8693
unique       8693
top       5540_01
freq            1
Name: PassengerId, dtype: object

In [5]:
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [6]:
passenger_id = X["PassengerId"]

X["PassengerGroup"] = passenger_id.apply(lambda x: int(x[:4]))

X = X.drop("PassengerId", axis=1)

In [7]:
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,2
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,3
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,3
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,4


In [32]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

# identifying the categorical and numeric features
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "PassengerGroup"]
categorical_features = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]

# transformer to scale the numeric features
numeric_transformer = StandardScaler()

# encoder for the categorical features
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)

# a colum transformer to apply the transformers to the categorical and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# combining the proprocessing transformers with the classifier
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        ("classifier", GradientBoostingClassifier()),
    ]
)


In [33]:
from sklearn.model_selection import train_test_split

# creating the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [36]:
# training the model
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                                   'Spa', 'VRDeck',
                                                   'PassengerGroup']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-999),
                                                  ['HomePlanet', 'CryoSleep',
                                                   'Cabin', 'Destination',
                                                   'VIP'])])),
                ('scaler', StandardScaler()),
                ('classifier', GradientBoostingClassifier())])

In [37]:
print("Score on train: ", model.score(X_train, y_train))
print("Score on test: ", model.score(X_test, y_test))

Score on train:  0.82053494391717
Score on test:  0.7866589994249569


In [42]:
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [2, 3, 4, 5, 6],
}

grid_search = GridSearchCV(model, param_grid=parameter_grid, cv=5, error_score="raise")

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'RoomService',
                                                                          'FoodCourt',
                                                                          'ShoppingMall',
                                                                          'Spa',
                                                                          'VRDeck',
                                                                          'PassengerGroup']),
                                                                        ('cat',
                                               

In [43]:
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.586099,0.007451,0.013704,0.001793,2,100,"{'classifier__max_depth': 2, 'classifier__n_es...",0.800863,0.78936,0.778577,0.759885,0.788489,0.783435,0.01373,5
1,1.119267,0.031748,0.016067,0.003732,2,200,"{'classifier__max_depth': 2, 'classifier__n_es...",0.803738,0.798706,0.78289,0.767793,0.776978,0.786021,0.013406,3
2,0.813381,0.002932,0.0145,0.000454,3,100,"{'classifier__max_depth': 3, 'classifier__n_es...",0.805176,0.794393,0.787203,0.767074,0.778417,0.786453,0.013071,1
3,1.568299,0.031197,0.014621,0.000449,3,200,"{'classifier__max_depth': 3, 'classifier__n_es...",0.806614,0.792236,0.777858,0.78289,0.769065,0.785732,0.012855,4
4,1.03264,0.007971,0.013562,0.000487,4,100,"{'classifier__max_depth': 4, 'classifier__n_es...",0.80949,0.797987,0.79583,0.774982,0.731655,0.781989,0.027519,6
5,2.048494,0.025278,0.015759,0.001162,4,200,"{'classifier__max_depth': 4, 'classifier__n_es...",0.800863,0.778577,0.784328,0.783609,0.721583,0.773792,0.027162,9
6,1.283775,0.021266,0.015171,0.000743,5,100,"{'classifier__max_depth': 5, 'classifier__n_es...",0.80949,0.794393,0.786485,0.781452,0.758993,0.786162,0.016567,2
7,2.550376,0.012599,0.016349,0.000495,5,200,"{'classifier__max_depth': 5, 'classifier__n_es...",0.793674,0.787922,0.780733,0.780014,0.74964,0.778397,0.015226,7
8,1.550276,0.011706,0.015554,0.000495,6,100,"{'classifier__max_depth': 6, 'classifier__n_es...",0.767793,0.791517,0.780014,0.768512,0.763309,0.774229,0.010254,8
9,3.117462,0.027739,0.017954,0.00089,6,200,"{'classifier__max_depth': 6, 'classifier__n_es...",0.762042,0.792955,0.759166,0.770669,0.76259,0.769484,0.012342,10


In [44]:
grid_search.best_params_

{'classifier__max_depth': 3, 'classifier__n_estimators': 100}

In [45]:
print("Grid Score on Train: ", grid_search.score(X_train, y_train))
print("Grid Score on Test: ", grid_search.score(X_test, y_test))


Grid Score on Train:  0.82053494391717
Grid Score on Test:  0.7860839562967222


# Saving the final model with the parameters from the grid search

In [46]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# identifying the categorical and numeric features
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "PassengerGroup"]
categorical_features = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]

# transformer to scale the numeric features
numeric_transformer = StandardScaler()

# encoder for the categorical features
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# a colum transformer to apply the transformers to the categorical and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# combining the proprocessing transformers with the classifier
final_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        ("classifier", SVC(kernel="linear", C=10,  probability=True)),
    ]
)


In [47]:
final_model.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                                   'Spa', 'VRDeck',
                                                   'PassengerGroup']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['HomePlanet', 'CryoSleep',
                                                   'Cabin', 'Destination',
                                                   'VIP'])])),
                ('scaler', StandardScaler()),
                ('classifier', SVC(C=10, kernel='linear', probability=True))])

In [51]:
import pickle
import os

# saving the current model
os.makedirs("saved_models", exist_ok=True)

with open(os.path.join("saved_models", "model_1.pkl"), "wb") as f:
    pickle.dump(final_model, f)