In [1]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

filename = "datasets/original_dataset.xlsx"
df = pd.read_excel(filename, engine='openpyxl')

df

Unnamed: 0,UniqueID,submission_year,target,TrainVal,Long_1,Long_2,Long_3,Long_4,Long_5,Long_6,...,Short_1,Short_2,Short_3,Short_4,Short_5,Short_6,Short_7,Short_8,Short_9,Short_10
0,984TAH,2015,0,Train_60,1800.0,6.0,0.0,221.0,0.0,15.0,...,,,,,,,,,,
1,410VKN,2015,0,Val_40,5700.0,8.0,0.0,221.0,12.0,15.0,...,,,,,,,,,,
2,394ETK,2015,1,Train_60,700.0,1.0,0.0,147.0,17.0,10.0,...,,,,,,,,,,
3,036KQK,2015,0,Train_60,1700.0,2.0,0.0,461.0,187.0,6.0,...,,,,,,,,,,
4,996RNP,2015,0,Train_60,600.0,3.0,0.0,96.0,30.0,11.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4594,757VJZ,2017,0,Train_60,3000.0,14.0,0.0,414.0,37.0,10.5,...,0.138413,0.496467,71.118364,8192.698333,83.009946,184541.42500,131154.80,48.671647,0.061366,0.666667
4595,538JZF,2017,1,Train_60,1600.0,0.0,,338.0,,1.0,...,0.108609,0.630414,50.562971,41071.880000,73.326164,88518.18143,47621.34,58.136132,0.416919,0.000000
4596,648WHI,2017,1,Val_40,100.0,3.0,6741.0,281.0,198.0,3.0,...,1.341142,0.985568,41.803426,1587.646667,34.919227,35035.94000,32704.26,11.122347,0.064327,0.666667
4597,899YZB,2017,1,Val_40,300.0,0.0,9411.0,173.0,44.0,1.0,...,1.419549,1.317842,48.831847,15061.066670,42.615211,186000.48000,169718.98,39.008325,0.053133,0.666667


In [2]:
# Separate data into training and validation sets
train_df = df[df['TrainVal'] == 'Train_60']
val_df = df[df['TrainVal'] == 'Val_40']

# Separate features and target variable
X_train = train_df.drop(columns=['UniqueID', 'submission_year', 'target', 'TrainVal'])
y_train = train_df['target']

X_val = val_df.drop(columns=['UniqueID','submission_year', 'target', 'TrainVal'])
y_val = val_df['target']

In [3]:
# Train the XGBoost model with hyper parameter tuning

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [50, 100, 150],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1],
}
clf = xgb.XGBClassifier()
grid_search = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=3, verbose=1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
print(grid_search.best_params_)

best_clf = grid_search.best_estimator_

# Predict using the best model
# y_pred = best_clf.predict(X_val_new)



Fitting 3 folds for each of 324 candidates, totalling 972 fits
{'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}


In [12]:
# Predicting class probabilities
# Each row represents two scores: [probability_of_class_0, probability_of_class_1]
train_scores = best_clf.predict_proba(X_train)[:, 1]
val_scores = best_clf.predict_proba(X_val)[:, 1]
print(X_val.shape)
# If you're interested in probabilities of class 1 (usually the "positive" class)
# Display scores
train_df['scores'] = train_scores
val_df['scores'] = val_scores
stage_2_df = pd.concat([train_df, val_df], axis=0).sort_values(by='UniqueID')

stage_2_df

(1831, 20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['scores'] = train_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['scores'] = val_scores


Unnamed: 0,UniqueID,submission_year,target,TrainVal,Long_1,Long_2,Long_3,Long_4,Long_5,Long_6,...,Short_2,Short_3,Short_4,Short_5,Short_6,Short_7,Short_8,Short_9,Short_10,scores
287,000JLK,2015,0,Train_60,2600.0,5.0,0.0,126.0,0.0,5.0,...,,,,,,,,,,0.227833
4560,000MYV,2017,0,Val_40,,7.0,,270.0,,0.0,...,0.850654,32.824971,12452.67,54.237313,52134.66667,38000.00,16.432770,0.207399,1.000000,0.168442
2531,000NGA,2016,1,Train_60,2400.0,0.0,0.0,320.0,35.0,0.0,...,,,,,,,,,,0.179593
1770,001JAD,2015,0,Val_40,5900.0,2.0,0.0,593.0,12.0,6.0,...,,,,,,,,,,0.229519
3098,001QJB,2016,1,Val_40,1400.0,4.0,10000.0,430.0,58.0,21.5,...,0.605139,37.436860,7062.76,43.288374,31243.19667,24115.29,51.712998,0.185573,0.000000,0.327295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2691,999IDD,2016,0,Val_40,1800.0,0.0,0.0,181.0,13.0,0.0,...,,,,,,,,,,0.217233
2127,999MZJ,2016,0,Train_60,1400.0,3.0,,459.0,,7.0,...,,,,,,,,,,0.217025
4154,999PTP,2017,0,Train_60,6000.0,3.0,6417.0,180.0,23.0,2.0,...,1.834167,47.445984,33636.93,62.468607,133695.48430,58880.89,38.087828,0.179593,0.428571,0.203536
2596,999SMW,2016,1,Val_40,1700.0,7.0,,319.0,,2.5,...,,,,,,,,,,0.280871
