In [2]:
try:
    import pandas as pd
    import numpy as np
except (ImportError, ModuleNotFoundError):
    print('Error occurred while importing libraries...')

In [3]:
# Store the path of the kaggle test data
test_df = pd.read_csv('./data/test.csv', header=0)
train_df = pd.read_csv('./data/train.csv', header=0)

In [4]:
print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}\n')
print(f'train_df head:\n{train_df.head()}')
print(f'test_df head:\n{test_df.head()}')

train_df shape: (8693, 14)
test_df shape: (4277, 13)

train_df head:
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1

In [5]:
# drop Name column
train_df.drop('Name', axis=1, inplace=True)
test_df.drop('Name', axis=1, inplace=True)

# drop Cabin
train_df.drop('Cabin', axis=1, inplace=True)
test_df.drop('Cabin', axis=1, inplace=True)

In [6]:
# convert target values to ones and zeros
train_df['Transported'] = train_df['Transported'].astype(int)

In [7]:
# replace missing values for HomePlanet
train_df['HomePlanet']=train_df['HomePlanet'].fillna(train_df['HomePlanet'].mode()[0])
test_df['HomePlanet']=test_df['HomePlanet'].fillna(test_df['HomePlanet'].mode()[0])

# replace missing values for CryoSleep
train_df['CryoSleep']=train_df['CryoSleep'].fillna(train_df['CryoSleep'].mode()[0])
test_df['CryoSleep']=test_df['CryoSleep'].fillna(test_df['CryoSleep'].mode()[0])

# replace missing values for Destination
train_df['Destination']=train_df['Destination'].fillna(train_df['Destination'].mode()[0])
test_df['Destination']=test_df['Destination'].fillna(test_df['Destination'].mode()[0])

# replace missing values for Age
train_df['Age']=train_df['Age'].fillna(train_df['CryoSleep'].mode()[0])
test_df['Age']=test_df['Age'].fillna(test_df['CryoSleep'].mode()[0])

# replace missing values for VIP
train_df['VIP']=train_df['VIP'].fillna(train_df['VIP'].mode()[0])
test_df['VIP']=test_df['VIP'].fillna(test_df['VIP'].mode()[0])

# replace missing values for RoomService
train_df['RoomService']=train_df['RoomService'].fillna(train_df['RoomService'].mean())
test_df['RoomService']=test_df['RoomService'].fillna(test_df['RoomService'].mean())

# replace missing values for FoodCourt
train_df['FoodCourt']=train_df['FoodCourt'].fillna(train_df['FoodCourt'].mean())
test_df['FoodCourt']=test_df['FoodCourt'].fillna(test_df['FoodCourt'].mean())

# replace missing values for ShoppingMall
train_df['ShoppingMall']=train_df['ShoppingMall'].fillna(train_df['ShoppingMall'].mode()[0])
test_df['ShoppingMall']=test_df['ShoppingMall'].fillna(test_df['ShoppingMall'].mode()[0])

# replace missing values for Spa
train_df['Spa']=train_df['Spa'].fillna(train_df['Spa'].mean())
test_df['Spa']=test_df['Spa'].fillna(test_df['Spa'].mean())

# replace missin values for VRDeck
train_df['VRDeck']=train_df['VRDeck'].fillna(train_df['VRDeck'].mode()[0])
test_df['VRDeck']=test_df['VRDeck'].fillna(test_df['VRDeck'].mode()[0])

In [8]:
# drop PassengerId
train_df.drop(columns=['PassengerId'])
test_df.drop(columns=['PassengerId'])

# convert CryoSleep to ones and zeros
train_df['CryoSleep'] = train_df['CryoSleep'].map({True:1, False:0})
test_df['CryoSleep'] = test_df['CryoSleep'].map({True:1, False:0})

# convert VIP to ones and zeros
train_df['VIP'] = train_df['VIP'].map({True:1, False:0})
test_df['VIP'] = test_df['VIP'].map({True:1, False:0})

print(f'train_df shape: {train_df.shape}')
print(f'test_df shape: {test_df.shape}')

print(f'train_df head:\n{train_df.head()}')
print(f'test_df head:\n{test_df.head()}')

train_df shape: (8693, 12)
test_df shape: (4277, 11)
train_df head:
  PassengerId HomePlanet  CryoSleep  Destination   Age  VIP  RoomService  \
0     0001_01     Europa          0  TRAPPIST-1e  39.0    0          0.0   
1     0002_01      Earth          0  TRAPPIST-1e  24.0    0        109.0   
2     0003_01     Europa          0  TRAPPIST-1e  58.0    1         43.0   
3     0003_02     Europa          0  TRAPPIST-1e  33.0    0          0.0   
4     0004_01      Earth          0  TRAPPIST-1e  16.0    0        303.0   

   FoodCourt  ShoppingMall     Spa  VRDeck  Transported  
0        0.0           0.0     0.0     0.0            0  
1        9.0          25.0   549.0    44.0            1  
2     3576.0           0.0  6715.0    49.0            0  
3     1283.0         371.0  3329.0   193.0            0  
4       70.0         151.0   565.0     2.0            1  
test_df head:
  PassengerId HomePlanet  CryoSleep  Destination   Age  VIP  RoomService  \
0     0013_01      Earth          1  

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['HomePlanet', 'Destination']
numeric_features = [col for col in train_df.columns if col not in categorical_features + ['Transported']]

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

X = train_df.drop(columns=['Transported'])
y = train_df['Transported']

X = preprocessor.fit_transform(X)

onehot_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(input_features=categorical_features)

all_feature_names = numeric_features + list(onehot_columns)

#X = X.toarray()

X = pd.DataFrame(X, columns=all_feature_names)

#Y = test_df
Y = preprocessor.transform(test_df)

#Y = Y.toarray()
Y = pd.DataFrame(Y, columns=all_feature_names)
print(X.head())
print()
print(Y.head())

   PassengerId  CryoSleep       Age       VIP  RoomService  FoodCourt  \
0    -1.734411   -0.73277  0.721984 -0.153063    -0.340590  -0.287314   
1    -1.734036   -0.73277 -0.283969 -0.153063    -0.175364  -0.281669   
2    -1.733662   -0.73277  1.996191  6.533255    -0.275409   1.955616   
3    -1.733658   -0.73277  0.319603 -0.153063    -0.340590   0.517406   
4    -1.733287   -0.73277 -0.820477 -0.153063     0.118709  -0.243409   

   ShoppingMall       Spa    VRDeck  HomePlanet_Earth  HomePlanet_Europa  \
0     -0.283579 -0.276663 -0.263003               0.0                1.0   
1     -0.241771  0.211505 -0.224205               1.0                0.0   
2     -0.283579  5.694289 -0.219796               0.0                1.0   
3      0.336851  2.683471 -0.092818               0.0                1.0   
4     -0.031059  0.225732 -0.261240               1.0                0.0   

   HomePlanet_Mars  Destination_55 Cancri e  Destination_PSO J318.5-22  \
0              0.0            

In [10]:
print(pd.isna(train_df).sum())
print()
print(pd.isna(test_df).sum())

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123, test_size = 0.2, stratify = y)

In [12]:
from sklearn.feature_selection import VarianceThreshold

# Variance Threshold Section
sel = VarianceThreshold(threshold=(0.1))
sel.fit(X_train)
X_train_vt = sel.transform(X_train)
X_test_vt = sel.transform(X_test)
Y_vt = sel.transform(Y)

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {  'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
                'criterion': ['gini', 'entropy', 'log_loss']
                }

base_estimator = DecisionTreeClassifier(random_state=123)
grid_search_vt = GridSearchCV(estimator=base_estimator, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)

Best hyperparameters with variance threshold:  {'criterion': 'entropy', 'max_depth': 6}
Best score:  0.7766780368808401


In [14]:
base_estimator.set_params(max_depth=6,criterion='entropy')
param_grid = { 'n_estimators': [10, 15, 20, 25, 30] }

bagging_classifier = BaggingClassifier(random_state = 123, base_estimator=base_estimator)
grid_search_vt = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)



Best hyperparameters with variance threshold:  {'n_estimators': 25}
Best score:  0.786886628628132


In [15]:
bagging_classifier.set_params(n_estimators=25)
bagging_classifier.fit(X_train_vt, y_train)
y_pred = bagging_classifier.predict(X_test_vt)
print(f'accuracy_score: {accuracy_score(y_pred=y_pred, y_true=y_test)}')



accuracy_score: 0.7763082231167338


In [20]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = { 'n_neighbors': [5, 6, 7, 8, 9, 10, 15, 20],
               'weights': ['uniform', 'distance'],
               'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'],
                }

base_estimator = KNeighborsClassifier()
grid_search_vt = GridSearchCV(estimator=base_estimator, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)


Best hyperparameters with variance threshold:  {'metric': 'cityblock', 'n_neighbors': 20, 'weights': 'uniform'}
Best score:  0.7801296204415779


In [22]:
base_estimator.set_params(metric='cityblock', n_neighbors=20, weights='uniform')
param_grid = { 'n_estimators': [10, 15, 20, 25, 30] }

bagging_classifier = BaggingClassifier(random_state = 123, base_estimator=base_estimator)
grid_search_vt = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)

grid_search_vt.fit(X_train_vt, y_train)
print("Best hyperparameters with variance threshold: ", grid_search_vt.best_params_)
print('Best score: ', grid_search_vt.best_score_)



Best hyperparameters with variance threshold:  {'n_estimators': 30}
Best score:  0.7840118250227405
