### Imports and setting up variables

In [None]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Locate the project root (the parent of the current working directory) and
# load the settings stored in its `.env` file into the process environment.
project_dir = str(Path().resolve().parents[0])
dotenv_path = os.path.join(project_dir, '.env')
# Keep the return value of load_dotenv so the outcome of the load can be inspected.
env_var = dotenv.load_dotenv(dotenv_path)
processed_data_path = os.environ.get("PROCESSED_DATA_PATH")
if processed_data_path is None:
    # Fail here with an actionable message instead of letting a later
    # os.path.join() call raise an opaque TypeError on None.
    raise RuntimeError(
        f"PROCESSED_DATA_PATH is not set; define it in {dotenv_path} "
        "or in the environment before running this notebook."
    )

In [None]:
# Read the processed training set, using the passenger id column as the index.
train_csv_path = os.path.join(project_dir, processed_data_path, "train.csv")
train = pd.read_csv(train_csv_path, index_col="PassengerId")

# Separate the target (cast to bool) from the feature columns.
y_data = train["Transported"].astype("bool")
x_data = train.drop(columns=["Transported"])

# DMatrix view of the full dataset, consumed by XGBoost's native API (xgb.cv).
data_dmatrix = xgb.DMatrix(data=x_data, label=y_data)

### Try a first XGBoost model with standard parameters

In [None]:
# Hold out 20% of the data. The split is seeded so the reported accuracy is
# reproducible between runs, like the seeded KFold and xgb.cv evaluations.
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=7
)

# With the binary:logistic objective the regressor's predictions are
# probabilities, which are thresholded into class labels below.
# NOTE(review): tree_method='gpu_hist' and gpu_id are deprecated in
# XGBoost >= 2.0 (replaced by device='cuda') -- confirm against the
# installed version before changing them.
xg_reg = xgb.XGBRegressor(objective='binary:logistic',
                          colsample_bytree=0.3,
                          learning_rate=0.1,
                          max_depth=10,
                          alpha=10,
                          n_estimators=10,
                          tree_method='gpu_hist',
                          gpu_id=0)

xg_reg.fit(X_train, y_train)
# Threshold the predicted probabilities at 0.5; the boolean result matches
# the dtype of the target column.
preds = xg_reg.predict(X_test) > 0.5
# Last expression of the cell: the notebook displays the hold-out accuracy.
accuracy_score(y_test, preds)


### Split the dataset into folds and run a 50-fold cross-validation to evaluate the previous model

In [None]:
# 50-fold cross-validation of the model configured above, with a seeded shuffle.
kfold = KFold(n_splits=50, shuffle=True, random_state=7)

# Convert to NumPy once: the arrays are identical for every fold, so there is
# no need to rebuild them on each iteration.
x_array = np.array(x_data)
y_array = np.array(y_data)

results = []
for train_ix, test_ix in kfold.split(x_array, y_array):
    train_X, test_X = x_array[train_ix], x_array[test_ix]
    train_y, test_y = y_array[train_ix], y_array[test_ix]

    # Refit the shared estimator on this fold's training part; after the loop
    # xg_reg holds the model fitted on the last fold.
    xg_reg.fit(train_X, train_y)
    # Threshold the predicted probabilities at 0.5 to obtain class labels.
    preds = xg_reg.predict(test_X) > 0.5
    accuracy = accuracy_score(test_y, preds)
    results.append(accuracy)
    print(f"For this run, acurracy is {accuracy*100:.2f}%")

# Both the mean and the standard deviation are scaled to percent so the
# printed "%" units are correct for each of them.
print(f'Average accuracy is {np.mean(results)*100:.2f}% ({np.std(results)*100:.2f}%)')

### Try a cross-validation similar to the previous cell, but using XGBoost's own functions

In [None]:
# Booster settings passed to XGBoost's native cross-validation helper.
params = {
    "objective": "binary:logistic",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}

# Seeded 3-fold CV on the full DMatrix: at most 50 boosting rounds, tracking
# the "error" metric, with early stopping configured at 10 rounds.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="error",
    as_pandas=True,
    seed=123,
)

# Last expression of the cell: show the first rows of the per-round results.
cv_results.head()

### Start a grid search to find the best parameters for submission

In [None]:
# Base classifier for the search; the parameters set here stay fixed while
# the grid below varies the others.
grid_xg_reg = xgb.XGBClassifier(objective='binary:logistic',
                                colsample_bytree=0.55,
                                learning_rate=0.1,
                                tree_method='gpu_hist',
                                gpu_id=0)

# 3 * 3 * 4 * 3 = 108 parameter combinations, each evaluated with 5-fold CV.
params = [{
          'n_estimators': [10, 100, 200],
          'max_depth': [10, 15, 20],
          'alpha': [0, 5, 10, 15],
          'reg_lambda': [0.5, 1, 1.5]
          }]

xg_grid = GridSearchCV(estimator=grid_xg_reg, scoring='accuracy', param_grid=params, cv=5, verbose=2)

# Search only on the training split so the hold-out split stays unseen.
xg_grid.fit(X_train, y_train)

print(xg_grid.best_params_)

# Accuracy on the data the search was fitted on is optimistic, so it is
# reported alongside the cross-validated score and the hold-out accuracy,
# which are the numbers that estimate generalisation.
print(f"Best cross-validated accuracy: {xg_grid.best_score_:.4f}")
print(f"Training accuracy: {xg_grid.score(X_train, y_train):.4f}")
print(f"Held-out test accuracy: {xg_grid.score(X_test, y_test):.4f}")
