### Imports and setting up the variables

In [1]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import xgboost as xgb

In [2]:
# The project root is taken to be the parent of the current working directory.
project_dir = str(Path().resolve().parent)
dotenv_path = os.path.join(project_dir, '.env')
# Load the variables declared in the project's .env file into os.environ.
env_var = dotenv.load_dotenv(dotenv_path)
processed_data_path = os.environ.get("PROCESSED_DATA_PATH")
# Fail here with a clear message instead of letting a None value surface later
# as an obscure TypeError inside os.path.join when the data files are read.
if processed_data_path is None:
    raise RuntimeError(
        "PROCESSED_DATA_PATH is not set; define it in the environment or in "
        + dotenv_path
    )

In [3]:
# Folder holding the processed CSV files, resolved relative to the project root.
processed_dir = os.path.join(project_dir, processed_data_path)

# Training set: "Transported" is the target, every other column is a feature.
train = pd.read_csv(os.path.join(processed_dir, "train.csv"), index_col="PassengerId")
y_data = train["Transported"].astype("bool")
x_data = train.drop(columns=["Transported"])
# DMatrix view of the training data (not referenced by the cells visible here).
data_dmatrix = xgb.DMatrix(data=x_data, label=y_data)

# Test set: drop the "Transported" column so only the feature columns remain.
test = pd.read_csv(
    os.path.join(processed_dir, "test.csv"), index_col="PassengerId"
).drop(columns=["Transported"])



### Train the model with the best parameters found by GridSearchCV in the notebook "2_Modeling_XGBoost"

In [4]:
# Binary classifier configured with fixed hyper-parameters.
# `reg_alpha` is the scikit-learn wrapper's named parameter for the L1 penalty;
# the previous `alpha` spelling only reached the booster through **kwargs and
# was inconsistent with `reg_lambda` below.
xg_reg = xgb.XGBClassifier(
    objective='binary:logistic',
    colsample_bytree=0.55,
    learning_rate=0.1,
    max_depth=15,
    reg_alpha=15,
    reg_lambda=0.5,
    n_estimators=200,
    # GPU histogram tree method on device 0.
    tree_method='gpu_hist',
    gpu_id=0,
)

# Fit on the full training set; the fitted estimator is the cell's output.
xg_reg.fit(x_data, y_data)


XGBClassifier(alpha=15, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.55,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=15, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=15, ...)

### Convert the results to the required submission format

In [6]:
# Predict on the feature columns only. This cell adds "Transported" to `test`
# below, so on a re-run the column is dropped again before predicting;
# otherwise the target would be passed to the model as an extra feature.
features = test.drop(columns=["Transported"], errors="ignore")
# Convert the binary output to boolean
results = [bool(label) for label in xg_reg.predict(features)]
# Add the converted predictions to the test/submission dataframe
test['Transported'] = results


### Save the test/submission dataframe for submission

In [7]:
# Write the predicted "Transported" column (indexed by the frame's index)
# to the submission CSV inside the processed-data folder.
submission_path = os.path.join(project_dir, processed_data_path, 'submission_#3.csv')
submission = test['Transported']
submission.to_csv(submission_path)