In [1]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
project_dir = str(Path().resolve().parents[0])
dotenv_path = os.path.join(project_dir, '.env')
env_var = dotenv.load_dotenv(dotenv_path)
processed_data_path = os.environ.get("PROCESSED_DATA_PATH")

In [3]:
train = pd.read_csv(os.path.join(project_dir, processed_data_path, "train.csv"), index_col="PassengerId")
x_data = train.drop(columns=["Transported"])
y_data = train["Transported"].astype("bool")
data_dmatrix = xgb.DMatrix(data=x_data,label=y_data)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 10, tree_method='gpu_hist', gpu_id=0)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0
accuracy_score(y_test, preds)


0.78205865439908

In [5]:
kfold = KFold(n_splits=50, shuffle=True, random_state=7)

Accuracy: 0.29%,(2.96%)


In [6]:
results = []
for train_ix, test_ix in  kfold.split(np.array(x_data), np.array(y_data)):
    train_X, test_X = np.array(x_data)[train_ix], np.array(x_data)[test_ix]
    train_y, test_y = np.array(y_data)[train_ix], np.array(y_data)[test_ix]

    
    xg_reg.fit(train_X,train_y)
    preds = xg_reg.predict(test_X)
    preds[preds > 0.5] = 1
    preds[preds <= 0.5] = 0
    accuracy = accuracy_score(test_y, preds)
    results.append(accuracy)
    print(f"For this run, acurracy is {accuracy*100:.2f}%")
    
print(f'Average accuracy is {np.mean(results)*100:.2f}% ({np.std(results):.2f}%)')

For this run, acurracy is 81.03%
For this run, acurracy is 79.31%
For this run, acurracy is 79.31%
For this run, acurracy is 79.89%
For this run, acurracy is 78.16%
For this run, acurracy is 72.41%
For this run, acurracy is 82.76%
For this run, acurracy is 75.86%
For this run, acurracy is 81.03%
For this run, acurracy is 75.86%
For this run, acurracy is 72.99%
For this run, acurracy is 81.61%
For this run, acurracy is 75.29%
For this run, acurracy is 84.48%
For this run, acurracy is 78.16%
For this run, acurracy is 75.86%
For this run, acurracy is 77.59%
For this run, acurracy is 78.16%
For this run, acurracy is 78.16%
For this run, acurracy is 74.14%
For this run, acurracy is 73.56%
For this run, acurracy is 81.03%
For this run, acurracy is 78.74%
For this run, acurracy is 79.89%
For this run, acurracy is 77.01%
For this run, acurracy is 79.31%
For this run, acurracy is 74.71%
For this run, acurracy is 79.31%
For this run, acurracy is 77.01%
For this run, acurracy is 77.59%
For this r

In [7]:
params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="error", as_pandas=True, seed=123)

cv_results.head()

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.411883,0.005277,0.419188,0.002602
1,0.271022,0.011679,0.278387,0.01455
2,0.2456,0.005467,0.253653,0.011905
3,0.243587,0.008838,0.249626,0.020733
4,0.230013,0.004907,0.234327,0.014859


In [14]:
grid_xg_reg = xgb.XGBClassifier(objective ='binary:logistic', alpha=10, tree_method='gpu_hist', gpu_id=0)

params = [{
          'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5], 
          'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001],
          'max_depth': [10,15,20],
          # 'alpha': 10,
          'n_estimators': [10,20]
         }]

xg_grid = GridSearchCV(estimator=grid_xg_reg, scoring='accuracy', param_grid=params, cv=10, verbose=2)

xg_grid.fit(X_train, y_train)

print(xg_grid.best_params_)

print(xg_grid.score(X_train, y_train))


Fitting 10 folds for each of 150 candidates, totalling 1500 fits
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, learning_rate=0.1, max_depth=10, n_estimators=10; total time=   0.2s
[CV] END colsample_bytree=0.1, lear