In [1]:
try:
    import pandas as pd
    import numpy as np
except (ImportError, ModuleNotFoundError):
    print('Error occurred while importing libraries...')

In [2]:
# Store the path of the kaggle test data
test_df = pd.read_csv('./data/test.csv', header=0)
train_df = pd.read_csv('./data/train.csv', header=0)

In [3]:
print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}\n')
print(f'train_df head:\n{train_df.head()}')
print(f'test_df head:\n{test_df.head()}')

train_df shape: (8693, 14)
test_df shape: (4277, 13)

train_df head:
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1

In [4]:
# drop Name column
train_df.drop('Name', axis=1, inplace=True)
test_df.drop('Name', axis=1, inplace=True)

In [5]:
# convert target values to ones and zeros
train_df['Transported'] = train_df['Transported'].astype(int)

In [6]:
# replace missing values for Cabin
train_df['Cabin']=train_df['Cabin'].fillna(train_df['Cabin'].mode()[0])
test_df['Cabin']=test_df['Cabin'].fillna(test_df['Cabin'].mode()[0])

# replace missing values for HomePlanet
train_df['HomePlanet']=train_df['HomePlanet'].fillna(train_df['HomePlanet'].mode()[0])
test_df['HomePlanet']=test_df['HomePlanet'].fillna(test_df['HomePlanet'].mode()[0])

# replace missing values for CryoSleep
train_df['CryoSleep']=train_df['CryoSleep'].fillna(train_df['CryoSleep'].mode()[0])
test_df['CryoSleep']=test_df['CryoSleep'].fillna(test_df['CryoSleep'].mode()[0])

# replace missing values for Destination
train_df['Destination']=train_df['Destination'].fillna(train_df['Destination'].mode()[0])
test_df['Destination']=test_df['Destination'].fillna(test_df['Destination'].mode()[0])

# replace missing values for Age
train_df['Age']=train_df['Age'].fillna(train_df['Age'].mode()[0])
test_df['Age']=test_df['Age'].fillna(test_df['Age'].mode()[0])

# replace missing values for VIP
train_df['VIP']=train_df['VIP'].fillna(train_df['VIP'].mode()[0])
test_df['VIP']=test_df['VIP'].fillna(test_df['VIP'].mode()[0])

# replace missing values for RoomService
train_df['RoomService']=train_df['RoomService'].fillna(train_df['RoomService'].median())
test_df['RoomService']=test_df['RoomService'].fillna(test_df['RoomService'].median())

# replace missing values for FoodCourt
train_df['FoodCourt']=train_df['FoodCourt'].fillna(train_df['FoodCourt'].median())
test_df['FoodCourt']=test_df['FoodCourt'].fillna(test_df['FoodCourt'].median())

# replace missing values for ShoppingMall
train_df['ShoppingMall']=train_df['ShoppingMall'].fillna(train_df['ShoppingMall'].median())
test_df['ShoppingMall']=test_df['ShoppingMall'].fillna(test_df['ShoppingMall'].median())

# replace missing values for Spa
train_df['Spa']=train_df['Spa'].fillna(train_df['Spa'].median())
test_df['Spa']=test_df['Spa'].fillna(test_df['Spa'].median())

# replace missin values for VRDeck
train_df['VRDeck']=train_df['VRDeck'].fillna(train_df['VRDeck'].median())
test_df['VRDeck']=test_df['VRDeck'].fillna(test_df['VRDeck'].median())

In [7]:
# convert PassengerId to integers
train_df['PassengerId'] = train_df['PassengerId'].apply(lambda x: int(x.replace('_','')))
test_df['PassengerId'] = test_df['PassengerId'].apply(lambda x: int(x.replace('_','')))

# convert CryoSleep to ones and zeros
train_df['CryoSleep'] = train_df['CryoSleep'].map({True:1, False:0})
test_df['CryoSleep'] = test_df['CryoSleep'].map({True:1, False:0})

# convert VIP to ones and zeros
train_df['VIP'] = train_df['VIP'].map({True:1, False:0})
test_df['VIP'] = test_df['VIP'].map({True:1, False:0})

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')

print(f'train_df head:\n{train_df.head()}')
print(f'test_df head:\n{test_df.head()}')

train_df shape: (8693, 13)
test_df shape: (4277, 12)
train_df head:
   PassengerId HomePlanet  CryoSleep  Cabin  Destination   Age  VIP  \
0          101     Europa          0  B/0/P  TRAPPIST-1e  39.0    0   
1          201      Earth          0  F/0/S  TRAPPIST-1e  24.0    0   
2          301     Europa          0  A/0/S  TRAPPIST-1e  58.0    1   
3          302     Europa          0  A/0/S  TRAPPIST-1e  33.0    0   
4          401      Earth          0  F/1/S  TRAPPIST-1e  16.0    0   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck  Transported  
0          0.0        0.0           0.0     0.0     0.0            0  
1        109.0        9.0          25.0   549.0    44.0            1  
2         43.0     3576.0           0.0  6715.0    49.0            0  
3          0.0     1283.0         371.0  3329.0   193.0            0  
4        303.0       70.0         151.0   565.0     2.0            1  
test_df head:
   PassengerId HomePlanet  CryoSleep  Cabin  Destination   Age  V

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['HomePlanet', 'Cabin', 'Destination']
numeric_features = [col for col in train_df.columns if col not in categorical_features + ['Transported']]

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X = train_df.drop(columns=['Transported'])
y = train_df['Transported']

X = preprocessor.fit_transform(X)
print(type(X))
onehot_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(input_features=categorical_features)

all_feature_names = numeric_features + list(onehot_columns)

# X was returned as a sparse matrix, so we convert it
X = X.toarray()

X = pd.DataFrame(X, columns=all_feature_names)

Y = preprocessor.transform(test_df)

# Y was also returned as a sparse matrix, so we convert it
Y = Y.toarray()

Y = pd.DataFrame(Y, columns=all_feature_names)

print(X.head())
print()
print(Y.head())

<class 'scipy.sparse._csr.csr_matrix'>
   PassengerId  CryoSleep       Age       VIP  RoomService  FoodCourt  \
0    -1.734411   -0.73277  0.715553 -0.153063    -0.333105  -0.281027   
1    -1.734036   -0.73277 -0.329408 -0.153063    -0.168073  -0.275387   
2    -1.733662   -0.73277  2.039169  6.533255    -0.268001   1.959998   
3    -1.733658   -0.73277  0.297569 -0.153063    -0.333105   0.523010   
4    -1.733287   -0.73277 -0.886720 -0.153063     0.125652  -0.237159   

   ShoppingMall       Spa    VRDeck  HomePlanet_Earth  ...  Cabin_G/999/P  \
0     -0.283579 -0.270626 -0.263003               0.0  ...            0.0   
1     -0.241771  0.217158 -0.224205               1.0  ...            0.0   
2     -0.283579  5.695623 -0.219796               0.0  ...            0.0   
3      0.336851  2.687176 -0.092818               0.0  ...            0.0   
4     -0.031059  0.231374 -0.261240               1.0  ...            0.0   

   Cabin_G/999/S  Cabin_T/0/P  Cabin_T/1/P  Cabin_T/2/P  Ca

In [9]:
# sanity check to make sure there are no missing values
print(pd.isna(train_df).sum())
print()
print(pd.isna(test_df).sum())

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64


In [10]:
from sklearn.model_selection import train_test_split
# We're creating train/test sets from the training data set provided by Kaggle
# The test set provided by Kaggle doesn't have the target features
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123, test_size = 0.2, stratify = y)

In [11]:
from sklearn.feature_selection import VarianceThreshold

# Remove features with a variance of less than 0.1
# We need to do this, because one-hot encoding created an enormous
# number of features.
sel = VarianceThreshold(threshold=(0.1))
sel.fit(X_train)
X_train_vt = sel.transform(X_train)
X_test_vt = sel.transform(X_test)
Y_vt = sel.transform(Y)

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

# grid-search with 10-fold cross-validation to find best depth and loss criterion for the Decision Tree
param_grid = {  'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
                'criterion': ['gini', 'entropy', 'log_loss']
                }

base_estimator = DecisionTreeClassifier(random_state=123)
grid_search_vt = GridSearchCV(estimator=base_estimator, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)

Best hyperparameters with variance threshold:  {'criterion': 'entropy', 'max_depth': 8}
Best score:  0.7758093525179856


In [13]:
# set the tree hyperparameters to the values found from grid-search
base_estimator.set_params(max_depth=8,criterion='entropy')

# perform grid-search to find the best number of estimators for the bagging classifier
param_grid = { 'n_estimators': [10, 15, 20, 25, 30] }

bagging_classifier = BaggingClassifier(random_state = 123, base_estimator=base_estimator)
grid_search_vt = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)



Best hyperparameters with variance threshold:  {'n_estimators': 25}
Best score:  0.7883223765814934


In [14]:
# set the bagging classifier n_estimators to best value from grid-search and train/test
# the model on the training and test sets derived from the Kaggle training data
bagging_classifier.set_params(n_estimators=25)
bagging_classifier.fit(X_train_vt, y_train)
y_pred = bagging_classifier.predict(X_test_vt)
print(f'accuracy_score: {accuracy_score(y_pred=y_pred, y_true=y_test)}')



accuracy_score: 0.780333525014376


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# perform grid-search to find best number of neighbors, weighting, and distance metric
# for knn classifier
param_grid = { 'n_neighbors': [5, 6, 7, 8, 9, 10, 15, 20],
               'weights': ['uniform', 'distance'],
               'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'],
                }

base_estimator = KNeighborsClassifier()
grid_search_vt = GridSearchCV(estimator=base_estimator, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)


Best hyperparameters with variance threshold:  {'metric': 'cityblock', 'n_neighbors': 20, 'weights': 'uniform'}
Best score:  0.7834360787232283


In [16]:
# set the knn hyperparameters to best values found from grid-search
base_estimator.set_params(metric='cityblock', n_neighbors=20, weights='uniform')

# perform grid-search to find best number of estimators for bagging classifier
param_grid = { 'n_estimators': [10, 15, 20, 25, 30] }

bagging_classifier = BaggingClassifier(random_state = 123, base_estimator=base_estimator)
grid_search_vt = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)



Best hyperparameters with variance threshold:  {'n_estimators': 30}
Best score:  0.7848743074505913


In [17]:
# set the bagging classifier n_estimators to best value from grid-search and train/test
# the model on the training and test sets derived from the Kaggle training data
bagging_classifier.set_params(n_estimators=30)
bagging_classifier.fit(X_train_vt, y_train)
y_pred = bagging_classifier.predict(X_test_vt)
print(f'accuracy_score: {accuracy_score(y_pred=y_pred, y_true=y_test)}')



accuracy_score: 0.7705577918343876


In [18]:
from sklearn.ensemble import HistGradientBoostingClassifier

# perform grid-search to find best learning rate and maximum leaf nodes for Gradient Boosting
# we don't use the values from the original gradient boosting Jupyter notebook, because
# we preprocessed the data slightly different this time by filling in missing values
param_grid = { 'learning_rate': [0.01, 0.1, 1, 10],
               'max_leaf_nodes': [10, 15, 20, 25, 30, 31, 35, 40, 45]
                }

gb_classifier_vt = HistGradientBoostingClassifier(random_state=123)
grid_search_vt = GridSearchCV(estimator=gb_classifier_vt, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)

Best hyperparameters with variance threshold:  {'learning_rate': 0.1, 'max_leaf_nodes': 10}
Best score:  0.7937941371041097


In [19]:
# Set the gradient boosting hyperparameters to the best values from grid-search
base_estimator = HistGradientBoostingClassifier(random_state=123, learning_rate=0.1, max_leaf_nodes=10)

# perform grid-search to find the best number of estimators for the bagging classifier
param_grid = { 'n_estimators': [10, 15, 20, 25, 30] }

bagging_classifier = BaggingClassifier(random_state = 123, base_estimator=base_estimator)
grid_search_vt = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)



Best hyperparameters with variance threshold:  {'n_estimators': 30}
Best score:  0.7959501364425701


In [70]:
# set the bagging classifier n_estimators to best value from grid-search and train/test
# the model on the training and test sets derived from the Kaggle training data
bagging_classifier.set_params(n_estimators=30)
bagging_classifier.fit(X_train_vt, y_train)
y_pred = bagging_classifier.predict(X_test_vt)
print(f'accuracy_score: {accuracy_score(y_pred=y_pred, y_true=y_test)}')



accuracy_score: 0.8004600345025877


In [78]:
from sklearn.svm import SVC

# instantiate the SVC base estimator with same 'C' value found in the svc Jupyter notebook.
# The data was preprocessed the same way, so we can do this
base_estimator = SVC(random_state=123, C=10)

# perform grid-search to find the best number of estimators for the bagging classifier
param_grid = { 'n_estimators': [10, 15, 20, 25, 30] }

bagging_classifier = BaggingClassifier(random_state = 123, base_estimator=base_estimator)
grid_search_vt = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)



Best hyperparameters with variance threshold:  {'n_estimators': 25}
Best score:  0.7917758207227321


In [79]:
# set the bagging classifier n_estimators to best value from grid-search and train/test
# the model on the training and test sets derived from the Kaggle training data
bagging_classifier.set_params(n_estimators=15)
bagging_classifier.fit(X_train_vt, y_train)
y_pred = bagging_classifier.predict(X_test_vt)
print(f'accuracy_score: {accuracy_score(y_pred=y_pred, y_true=y_test)}')

# This is the las model we will train for this project as we have run out of time.
# Since none of these models beat the accuracy score in the gradient_boost Jupyter notebook,
# we will not transfer any of the predictions to a csv for submission to Kaggle this time.



accuracy_score: 0.7866589994249569
