In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Reading the data and basic checks

In [29]:
data_path = '../data/'
train_data = pd.read_csv(data_path+'train.csv')
test_data = pd.read_csv(data_path+'test.csv')


In [30]:
print(train_data.shape)
train_data.head()

(8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## EDA

In [31]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [32]:
# checking how many are transported from each planet
pd.crosstab(train_data['Transported'], train_data['HomePlanet'])

HomePlanet,Earth,Europa,Mars
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2651,727,839
True,1951,1404,920


In [33]:
# checking which continous groups tend to transport
train_data.groupby(['Transported']).agg({'Age':['mean','median'],'RoomService':['mean'],'FoodCourt':['mean'],\
                                        'ShoppingMall':['mean'],'Spa':['mean'],'VRDeck':['mean']}).reset_index()

Unnamed: 0_level_0,Transported,Age,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,mean,mean,mean,mean
0,False,29.922858,27.0,389.266066,382.61593,167.566217,564.382666,543.629822
1,True,27.748834,26.0,63.098021,532.691984,179.829972,61.675531,69.148131


In [34]:
# checking how many VIPs are transported 
pd.crosstab(train_data['Transported'], train_data['VIP'])

VIP,False,True
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4093,123
True,4198,76


In [35]:
# checking how many Cryosleep opted people are transported 
pd.crosstab(train_data['Transported'], train_data['CryoSleep'])

CryoSleep,False,True
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3650,554
True,1789,2483


In [36]:
# checking transported to debraking
pd.crosstab(train_data['Transported'], train_data['Destination'])

Destination,55 Cancri e,PSO J318.5-22,TRAPPIST-1e
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,702,395,3128
True,1098,401,2787


In [37]:
# checking transported to cabin deck
train_data = pd.concat([train_data,
train_data['Cabin'].str.split('/',expand=True).add_prefix('Cabin_').rename(columns={'Cabin_0':'Cabin_deck','Cabin_1':'Cabin_number','Cabin_2':'Cabin_side'})],axis=1)
pd.crosstab(train_data['Transported'], train_data['Cabin_deck'])

Cabin_deck,A,B,C,D,E,F,G,T
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,129,207,239,271,563,1565,1238,4
True,127,572,508,207,313,1229,1321,1


In [39]:
pd.crosstab(train_data['Transported'], train_data['Cabin_side'])

Cabin_side,P,S
Transported,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2308,1908
True,1898,2380


## Data processing and Feature engineering

In [108]:
df_train = train_data.copy()
df_train = pd.concat([df_train,df_train['PassengerId'].str.split('_',expand=True).add_prefix('Passenger_').rename(columns={'Passenger_0':'Passenger_grp','Passenger_1':'Passenger_num'})],axis=1)

In [109]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_deck,Cabin_number,Cabin_side,Passenger_grp,Passenger_num
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,2,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,3,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,3,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,4,1


In [110]:
df_train.isna().sum()

PassengerId        0
HomePlanet       201
CryoSleep        217
Cabin            199
Destination      182
Age              179
VIP              203
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
Name             200
Transported        0
Cabin_deck       199
Cabin_number     199
Cabin_side       199
Passenger_grp      0
Passenger_num      0
dtype: int64

In [111]:
cols_fillna_avg = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
cols_fillna_grp_avg = {'HomePlanet':'Passenger_grp','CryoSleep':'Passenger_grp','Cabin_deck':'Passenger_grp','Destination':'Passenger_grp','VIP':'Passenger_grp'}

In [112]:
# Filling NANs of continous variables
for i in cols_fillna_avg:
    df_train[i].fillna(df_train[i].mean(skipna=True),inplace=True)

In [113]:
def fillna_cat_avg(df,fill_col, grp_col):
    temp_df = df.copy()
    temp_df = temp_df[[grp_col,fill_col]]
    temp_df = temp_df[temp_df[fill_col].notnull()]
    temp_df['Most_{}'.format(fill_col)] = (
        temp_df.groupby(grp_col)[fill_col].transform(lambda x: x.value_counts().idxmax()))
    temp_df = temp_df[[grp_col,'Most_{}'.format(fill_col)]].drop_duplicates()
    df = pd.merge(df,temp_df,on=[grp_col],how='left')
    df[fill_col] = np.where(df[fill_col].isnull(),df['Most_{}'.format(fill_col)],df[fill_col])
    df = df[df[fill_col].notnull()]
    df.drop(columns=['Most_{}'.format(fill_col)],inplace=True)
    return df


In [114]:
for key, value in cols_fillna_grp_avg.items():
    df_train = fillna_cat_avg(df_train, key, value)


In [115]:
print(df_train.shape)
df_train.isna().sum()

(8180, 19)


PassengerId        0
HomePlanet         0
CryoSleep          0
Cabin            100
Destination        0
Age                0
VIP                0
RoomService        0
FoodCourt          0
ShoppingMall       0
Spa                0
VRDeck             0
Name             187
Transported        0
Cabin_deck         0
Cabin_number     100
Cabin_side       100
Passenger_grp      0
Passenger_num      0
dtype: int64

In [116]:
feature_list = ['HomePlanet','CryoSleep','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Cabin_deck']
y_var = ['Transported']
df_train = df_train[feature_list+y_var]

In [117]:
df_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Cabin_deck,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,False
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,True
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,False
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,A,False
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,F,True


In [118]:
categorical_cols = ['HomePlanet','Destination','Cabin_deck']
binary_cols = ['CryoSleep','VIP','Transported']

for i in categorical_cols:
    # Get one hot encoding of columns B
    one_hot = pd.get_dummies(df_train[i], drop_first=True)
    # Drop column B as it is now encoded
    df_train = df_train.drop(i,axis = 1)
    # Join the encoded df
    df_train = df_train.join(one_hot)

for i in binary_cols:
    df_train[i] = df_train[i].astype(int)

In [119]:
df_train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Europa,Mars,PSO J318.5-22,TRAPPIST-1e,B,C,D,E,F,G,T
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1,1,0,0,0,0,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,0,0,1,0,0,0,0,1,0,0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1,0,0,1,0,0,0,0,0,0,0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1,0,0,1,0,0,0,0,0,0,0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,0,0,0,1,0,0,0,0,1,0,0


## Training Models

In [121]:
df_train.columns

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'Europa', 'Mars', 'PSO J318.5-22',
       'TRAPPIST-1e', 'B', 'C', 'D', 'E', 'F', 'G', 'T'],
      dtype='object')

In [122]:
# Split data to train and test
idv = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck','Europa', 'Mars', 'PSO J318.5-22',
       'TRAPPIST-1e', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
dv = ['Transported']
X_train, X_test, y_train, y_test = train_test_split(df_train[idv], df_train[dv], test_size=0.2, random_state=0)

In [141]:
# define standard scaler
scaler = StandardScaler()
# transform data
scaler.fit(X_train)


StandardScaler()

In [139]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

X_rCV = df_train[idv].copy()

# define standard scaler
scaler = StandardScaler()
# transform data
X_rCV =  scaler.fit_transform(X_rCV)

y_rCV = df_train[dv].copy()

clf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = 4)
# Fit the random search model
rf_random.fit(X_rCV, y_rCV)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=4,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [140]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [148]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 1600,
 min_samples_split=5,
 min_samples_leaf = 1,
 max_features = 'auto',
 max_depth = 10,
 bootstrap = True, random_state=0)
clf.fit(scaler.transform(X_train), y_train)

y_test_pred = clf.predict(scaler.transform(X_test))
print(classification_report(y_test, y_test_pred))
print(pd.crosstab(y_test['Transported'], y_test_pred))

  clf.fit(scaler.transform(X_train), y_train)


              precision    recall  f1-score   support

           0       0.82      0.77      0.79       802
           1       0.79      0.83      0.81       834

    accuracy                           0.80      1636
   macro avg       0.80      0.80      0.80      1636
weighted avg       0.80      0.80      0.80      1636

col_0          0    1
Transported          
0            619  183
1            138  696


In [151]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
print(param_test)

X_rCV = df_train[idv].copy()

# define standard scaler
scaler = StandardScaler()
# transform data
X_rCV =  scaler.fit_transform(X_rCV)

y_rCV = df_train[dv].copy()

clf = lgb.LGBMClassifier()
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = param_test, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs = 4)
# Fit the random search model
rf_random.fit(X_rCV, y_rCV)

{'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f01469e5be0>, 'min_child_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0146b85580>, 'min_child_weight': [1e-05, 0.001, 0.01, 0.1, 1, 10.0, 100.0, 1000.0, 10000.0], 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f01469bd430>, 'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0146ae2bb0>, 'reg_alpha': [0, 0.1, 1, 2, 5, 7, 10, 50, 100], 'reg_lambda': [0, 0.1, 1, 5, 10, 20, 50, 100]}
Fitting 3 folds for each of 200 candidates, totalling 600 fits


  return f(*args, **kwargs)


RandomizedSearchCV(cv=3, estimator=LGBMClassifier(), n_iter=200, n_jobs=4,
                   param_distributions={'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0146ae2bb0>,
                                        'min_child_samples': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f0146b85580>,
                                        'min_child_weight': [1e-05, 0.001, 0.01,
                                                             0.1, 1, 10.0,
                                                             100.0, 1000.0,
                                                             10000.0],
                                        'num_leaves': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f01469e5be0>,
                                        'reg_alpha': [0, 0.1, 1, 2, 5, 7, 10,
                                                      50, 100],
                                        'reg_lambda': [0, 0.1, 1, 5, 10, 20, 50,
        

In [152]:
rf_random.best_params_

{'colsample_bytree': 0.74193381092279,
 'min_child_samples': 109,
 'min_child_weight': 1,
 'num_leaves': 15,
 'reg_alpha': 0,
 'reg_lambda': 20,
 'subsample': 0.8570058437376102}

In [153]:
# build the lightgbm model
import lightgbm as lgb
clf = lgb.LGBMClassifier(colsample_bytree =  0.74193381092279,
 min_child_samples = 109,
 min_child_weight = 1,
 num_leaves = 15,
 reg_alpha = 0,
 reg_lambda = 20,
 subsample = 0.8570058437376102)
clf.fit(scaler.transform(X_train), y_train)

  return f(*args, **kwargs)


LGBMClassifier(colsample_bytree=0.74193381092279, min_child_samples=109,
               min_child_weight=1, num_leaves=15, reg_alpha=0, reg_lambda=20,
               subsample=0.8570058437376102)

In [154]:

y_test_pred = clf.predict(scaler.transform(X_test))
print(classification_report(y_test, y_test_pred))
print(pd.crosstab(y_test['Transported'], y_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.76      0.79       802
           1       0.78      0.84      0.81       834

    accuracy                           0.80      1636
   macro avg       0.80      0.80      0.80      1636
weighted avg       0.80      0.80      0.80      1636

col_0          0    1
Transported          
0            609  193
1            134  700
