# Data Processing

In [1]:
import pandas as pd
import numpy as np

# laoding the data
data = pd.read_csv("./data/cleaned/cleaned_data.csv")

In [2]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
X = data.drop(["Transported", "Name"], axis=1)
y = data["Transported"].astype(np.int8)

In [4]:
X["PassengerId"].describe()

count        8693
unique       8693
top       2809_01
freq            1
Name: PassengerId, dtype: object

In [5]:
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [6]:
passenger_id = X["PassengerId"]

X["PassengerGroup"] = passenger_id.apply(lambda x: int(x[:4]))

X = X.drop("PassengerId", axis=1)

In [7]:
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,2
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,3
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,3
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,4


In [8]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# identifying the categorical and numeric features
numeric_features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "PassengerGroup"]
categorical_features = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]

# transformer to scale the numeric features
numeric_transformer = StandardScaler()

# encoder for the categorical features
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# a colum transformer to apply the transformers to the categorical and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# combining the proprocessing transformers with the classifier
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression())
    ]
)


In [9]:
from sklearn.model_selection import train_test_split

# creating the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
# training the model
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'RoomService',
                                                   'FoodCourt', 'ShoppingMall',
                                                   'Spa', 'VRDeck',
                                                   'PassengerGroup']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  ['HomePlanet', 'CryoSleep',
                                                   'Cabin', 'Destination',
                                                   'VIP'])])),
                ('scaler', StandardScaler()),
                ('classifier', LogisticRegression())])

In [11]:
print("Score on train: ", model.score(X_train, y_train))
print("Score on test: ", model.score(X_test, y_test))

Score on train:  0.7798389416163359
Score on test:  0.7745830937320299


In [15]:
from sklearn.model_selection import GridSearchCV

parameter_grid = {
    "classifier__C": [0.001, 0.1, 1.0, 10, 100, 120],
    "classifier__tol": [1e-3, 1e-4,1e-5],
}

grid_search = GridSearchCV(model, param_grid=parameter_grid, cv=5)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['Age',
                                                                          'RoomService',
                                                                          'FoodCourt',
                                                                          'ShoppingMall',
                                                                          'Spa',
                                                                          'VRDeck',
                                                                          'PassengerGroup']),
                                                                        ('cat',
                                                                    

In [16]:
grid_search.best_params_

{'classifier__C': 0.1, 'classifier__tol': 0.001}

In [17]:
print("Grid Score on Train: ", grid_search.score(X_train, y_train))
print("Grid Score on Test: ", grid_search.score(X_test, y_test))


Grid Score on Train:  0.7795513373597929
Grid Score on Test:  0.7745830937320299
