This notebook shows how to create an ensemble of five models: Random Forest, XGBoost, LightGBM, CatBoost and a fastai-based tabular neural network.

Separately, the Kaggle scores for each model were: Random Forest at 0.80406, XGBoost at 0.81529, LightGBM at 0.8043, CatBoost at 0.80336 and the neural network at 0.8057. Together in this ensemble, the Kaggle score was 0.80897. The ensemble therefore beat four of the five individual models but not XGBoost alone: the stronger XGBoost predictions appear to have lifted the combined result, while the weaker models held it below XGBoost's own score.

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestClassifier

import lightgbm as lgbm
import catboost as cb

from fastai.imports import *
from fastai.tabular.all import *

## 2. Loading datasets

In [2]:
# Read the three competition files from the working directory.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Stack train on top of test so feature engineering is applied to both at once;
# the test rows have no `Transported` value and can be separated again later.
train_test = pd.concat((train, test), axis=0, ignore_index=True)

In [4]:
Expenses_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

# Total spend per passenger. `sum` skips NaN, so a row whose spend columns are
# all missing gets Expenses == 0.
train_test['Expenses'] = train_test[Expenses_columns].sum(axis=1)

# Passengers with no spending and an unknown CryoSleep flag are assumed to be
# in cryo-sleep. The previous row-wise `apply` returned the *whole row* in its
# else branch and only produced the right column through pandas' label
# alignment; a boolean mask expresses the same rule directly and is vectorised.
no_spend_unknown_cryo = train_test['Expenses'].eq(0) & train_test['CryoSleep'].isna()
train_test.loc[no_spend_unknown_cryo, 'CryoSleep'] = True

# First four characters of PassengerId are used as the travel-group key.
train_test['Group'] = train_test['PassengerId'].str[:4]

# Cabin is encoded as "Deck/Number/Side".
train_test[['Deck', 'Number', 'Side']] = train_test['Cabin'].str.split('/', expand=True)

# Split the name once and reuse the pieces.
name_parts = train_test['Name'].str.split(" ", expand=True)
train_test['FirstName'] = name_parts.iloc[:, 0]
train_test['SecondName'] = name_parts.iloc[:, 1]
train_test['Name_key'] = train_test['SecondName'] + train_test['Group']

# Column groups used by the preprocessing and modelling cells below.
num_cols = ['ShoppingMall','FoodCourt','RoomService','Spa','VRDeck','Expenses','Age']
cat_cols = ['CryoSleep','Deck','Side','VIP','HomePlanet','Destination', ]
transported=['Transported']

In [5]:
# Keep only the modelling features plus the target column.
selected_cols = [*num_cols, *cat_cols, *transported]
train_test = train_test.loc[:, selected_cols].copy()

In [6]:
# Fill missing values: column mean for numeric features, most frequent value
# for categorical ones. Both imputers are fitted on train and test combined.
num_imp = SimpleImputer(strategy='mean')
cat_imp = SimpleImputer(strategy='most_frequent')
train_test[num_cols] = pd.DataFrame(
    num_imp.fit_transform(train_test[num_cols]),
    columns=num_cols,
    index=train_test.index,  # keep row labels aligned with the source frame
)
train_test[cat_cols] = pd.DataFrame(
    cat_imp.fit_transform(train_test[cat_cols]),
    columns=cat_cols,
    index=train_test.index,
)

# One-hot encode the categorical columns. The `sparse=False` keyword was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so rely on
# the default sparse result and densify it, which works on every version.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded = ohe.fit_transform(train_test[cat_cols]).toarray()
temp_train = pd.DataFrame(
    encoded,
    columns=ohe.get_feature_names_out(),
    index=train_test.index,
)

# Swap the raw categorical columns for their one-hot expansion.
train_test = train_test.drop(cat_cols,axis=1)
train_test = pd.concat([train_test,temp_train],axis=1)

In [7]:
# Inspect the combined frame after imputation and one-hot encoding.
train_test

Unnamed: 0,ShoppingMall,FoodCourt,RoomService,Spa,VRDeck,Expenses,Age,Transported,CryoSleep_False,CryoSleep_True,...,Side_P,Side_S,VIP_False,VIP_True,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0.0,0.0,0.0,0.0,0.0,0.0,39.000000,False,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,25.0,9.0,109.0,549.0,44.0,736.0,24.000000,True,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,3576.0,43.0,6715.0,49.0,10383.0,58.000000,False,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,371.0,1283.0,0.0,3329.0,193.0,5176.0,33.000000,False,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,151.0,70.0,303.0,565.0,2.0,1091.0,16.000000,True,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,0.0,0.0,0.0,0.0,0.0,0.0,34.000000,,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
12966,17.0,847.0,0.0,10.0,144.0,1018.0,42.000000,,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
12967,0.0,0.0,0.0,0.0,0.0,0.0,28.771969,,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
12968,0.0,2680.0,0.0,0.0,523.0,3203.0,28.771969,,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [8]:
# Rows with a known target form the training set; the remaining rows
# (missing `Transported`) are the ones to predict.
has_label = train_test['Transported'].notnull()

train = train_test[has_label].copy()
train['Transported'] = train['Transported'].astype('int')
test = train_test[~has_label].drop(columns='Transported')

# Features excluded from the models.
drop_list = [
    'ShoppingMall', 'Age', 'CryoSleep_True', 'HomePlanet_Earth', 'HomePlanet_Europa',
    'VIP_True', 'HomePlanet_Mars', 'Destination_PSO J318.5-22', 'VIP_False',
    'Destination_55 Cancri e', 'FoodCourt', 'Destination_TRAPPIST-1e',
]

y = train['Transported']
X = train.drop(columns=['Transported', *drop_list])
test = test.drop(columns=drop_list)

In [9]:
# Shuffle features and target together with a fixed seed, then renumber the
# rows from zero so both share a clean positional index.
X, y = shuffle(X, y, random_state=42)
X, y = (part.reset_index(drop=True) for part in (X, y))

In [10]:
def get_score(model, X, y, cv=20, scoring='accuracy'):
    """Cross-validate `model` on `(X, y)` and return the per-fold scores.

    Parameters
    ----------
    model : estimator
        Unfitted estimator passed straight to `cross_val_score`.
    X, y : array-like
        Feature matrix and target.
    cv : int or CV splitter, default 20
        Cross-validation strategy forwarded to `cross_val_score`.
    scoring : str or callable, default 'accuracy'
        Metric forwarded to `cross_val_score`.

    Returns
    -------
    The array of scores returned by `cross_val_score`, one entry per fold.
    Call `.mean()` on it for a single summary number.
    """
    return cross_val_score(model, X, y, scoring=scoring, cv=cv)

In [11]:
# Tuned XGBoost hyper-parameters, unpacked into `xgb.XGBClassifier(**...)`.
params_xgb_best = {
    # regularisation strength
    'lambda': 3.0610042624477543,
    'alpha': 4.581902571574289,
    # column / row sub-sampling
    'colsample_bytree': 0.9241969052729379,
    'subsample': 0.9527591724824661,
    # boosting schedule
    'learning_rate': 0.06672065863100594,
    'n_estimators': 730,
    # tree shape
    'max_depth': 5,
    'min_child_weight': 1,
    'num_parallel_tree': 1,
}

In [12]:
# Mean cross-validated accuracy of the tuned XGBoost model.
xgb_cv_scores = get_score(xgb.XGBClassifier(**params_xgb_best), X, y)
print(xgb_cv_scores.mean())

0.8091683881561522


In [13]:
# Random forest configuration; `random_state` fixes the seed.
rf_params = {
    'criterion': 'entropy',
    'n_estimators': 446,
    'min_samples_split': 2,
    'min_samples_leaf': 7,
    'oob_score': True,
    'max_depth': 57,
    'random_state': 1,
    'max_features': None,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)

In [14]:
# Mean cross-validated accuracy of the random forest.
rf_cv_scores = get_score(rf_model, X, y)
print(rf_cv_scores.mean())

0.7984670798241432


In [15]:
# LightGBM configuration, collected in a dict and unpacked into the classifier.
lgbm_params = {
    'learning_rate': 0.188447278,
    'n_estimators': 10000,
    'num_leaves': 2080,
    'min_data_in_leaf': 100,
    'lambda_l1': 5,
    'lambda_l2': 20,
    'max_depth': 11,
    'min_gain_to_split': 0.50009162978,
    'bagging_fraction': 0.829100733,
    'bagging_freq': 1,
    'feature_fraction': 0.6252699,
}
lgbm_model = lgbm.LGBMClassifier(**lgbm_params)

In [16]:
# Mean cross-validated accuracy of the LightGBM model.
lgbm_cv_scores = get_score(lgbm_model, X, y)
print(lgbm_cv_scores.mean())





0.7999613326977064


In [17]:
# CatBoost configuration; 'Silent' logging keeps per-iteration output out of
# the notebook.
catboost_params = {
    'objective': 'CrossEntropy',
    'colsample_bylevel': 0.0999516686,
    'depth': 12,
    'boosting_type': 'Plain',
    'bootstrap_type': 'MVS',
    'logging_level': 'Silent',
}
catboost_model = cb.CatBoostClassifier(**catboost_params)

In [18]:
# Mean cross-validated accuracy of the CatBoost model.
catboost_cv_scores = get_score(catboost_model, X, y)
print(catboost_cv_scores.mean())

0.8015760898352667


For the neural network, the categorical and continuous columns need to be formally separated again, and the data then split into training and validation sets. Unlike the other four models, the neural network did not work without a validation set. Also, the score was higher for a 20% split than for either a 5% or a 25% split.

In [19]:
# Hold out 20% of the row positions (fixed seed) as the validation set.
splitter = RandomSplitter(valid_pct=0.2, seed=42)
splits = splitter(range_of(X))

# Partition the column names into continuous and categorical groups.
X_cont, X_cat = cont_cat_split(X)
test_cont, test_cat = cont_cat_split(test)

Next, set up the data for the neural network using fastai's TabularPandas, choose the batch size, create the learner, and then train the model. The highest score came from a simple model created without specifying any custom layers.

In [20]:
# Wrap the labelled rows for fastai. Only the columns named in
# `cat_names` / `cont_names` are used as inputs; continuous inputs are
# normalised and `Transported` is treated as a class label.
tabular_kwargs = dict(
    procs=[Normalize],
    cat_names=X_cat,
    cont_names=X_cont,
    y_names='Transported',
    y_block=CategoryBlock,
    splits=splits,
)
to_train = TabularPandas(train, **tabular_kwargs)

In [21]:
# Build the batched data loaders and a tabular learner that reports accuracy.
batch_size = 64
dls = to_train.dataloaders(bs=batch_size)
learn = tabular_learner(
    dls,
    metrics=accuracy,
)

In [22]:
# Train with the one-cycle schedule: epoch count and learning rate as chosen
# for this notebook.
n_epochs, max_lr = 25, 1e-3
learn.fit_one_cycle(n_epochs, max_lr)

epoch,train_loss,valid_loss,accuracy,time
0,0.549433,0.515349,0.734753,00:00
1,0.475018,0.470921,0.766974,00:00
2,0.449795,0.483567,0.788838,00:00
3,0.452946,0.468793,0.757768,00:00
4,0.44507,0.43945,0.785961,00:00
5,0.446907,0.456395,0.780207,00:00
6,0.450642,0.448991,0.790564,00:00
7,0.434087,0.441823,0.774453,00:00
8,0.436903,0.437133,0.793441,00:00
9,0.442483,0.445055,0.775604,00:00


Now use the model to predict the results for the test dataset. Since the neural network produces a two-column tensor of predictions, labels are derived from it so that only the final prediction is kept. These labels are later placed into the submission dataframe.

In [23]:
# Run the trained network on the unlabelled rows.
test_dl = learn.dls.test_dl(test)
preds, _, decoded = learn.get_preds(
    dl=test_dl,
    with_decoded=True,
)

# Collapse the two prediction columns into one 0/1 label per row by taking
# the index of the larger value.
nn_labels = np.argmax(preds, axis=1)
nn_labels

tensor([1, 0, 1,  ..., 1, 1, 1])

In [24]:
# Refit each tree-based model on the full training data, then predict the
# test rows.
xgb_model = xgb.XGBClassifier(**params_xgb_best)
xgb_model.fit(X, y)
pred_xgb_model = xgb_model.predict(test)

rf_model.fit(X, y)
pred_rf_model = rf_model.predict(test)

lgbm_model.fit(X, y)
pred_lgbm_model = lgbm_model.predict(test)

catboost_model.fit(X, y)
pred_catboost_model = catboost_model.predict(test)

# Neural-network labels from the predictions computed in the previous cell.
nn_labels = np.argmax(preds, axis=1)



Combine the five models by averaging their 0/1 predictions; once thresholded at 0.5 this is a majority vote.

In [25]:
# Average the five models' 0/1 predictions element-wise; the result is the
# fraction of models voting 1 for each row.
member_preds = [
    pred_xgb_model,
    pred_rf_model,
    pred_lgbm_model,
    pred_catboost_model,
    to_np(nn_labels.squeeze()),
]
final_pred = sum(member_preds) / len(member_preds)

In [26]:
# Turn the averaged votes into True/False: a row is True when more than half
# of the models predicted 1.
sample['Transported'] = final_pred > 0.5

# Write the submission file without the index column.
sample.to_csv('submit_model_ensemble.csv', index=False)

The Kaggle score for using the average of the models is 0.80897.