# XGBoost model for classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier


## Loading data

In [2]:
# Read both input tables from CSV files in the working directory:
# df1 <- "selected_data.csv", df2 <- "selected_data_w_intuition.csv".
df1, df2 = (
    pd.read_csv("selected_data.csv"),
    pd.read_csv("selected_data_w_intuition.csv"),
)

In [3]:
# For each table, everything except the 'salary' column becomes the feature
# matrix (X) and the 'salary' column itself becomes the target (y).
X1, y1 = df1.drop(columns='salary'), df1['salary']
X2, y2 = df2.drop(columns='salary'), df2['salary']

In [4]:
# Hold out 20% of each dataset as a test split. Both calls use the same
# random_state (12) so the splits are repeatable from run to run.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=12,
)

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=12,
)

## Defining the model and the grid search

In [5]:
# Base estimator created with no explicit hyperparameters; it is the
# `estimator` handed to GridSearchCV further down, which supplies the
# values from the search grid.
xgb = XGBClassifier()
# Print the estimator so its parameter settings are recorded in the cell output.
print(xgb)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)


In [6]:
# Hyperparameter grid for the search: 2 x 2 x 1 x 2 = 8 combinations.
params = dict(
    n_estimators=[300, 400],   # Number of trees
    max_depth=[4, 5],          # Maximum depth of each tree
    learning_rate=[0.1],       # Learning rate (single fixed value)
    subsample=[0.8, 0.9],      # Fraction of samples used for fitting the trees
)

In [7]:
# Wrap f1_score (with average='weighted') as a scorer object so it can be
# passed as the `scoring` argument of GridSearchCV.
f1_scorer = make_scorer(f1_score, average='weighted')

In [8]:
# Search over every combination in `params` using 5-fold cross-validation,
# ranking candidates with the weighted-F1 scorer. n_jobs=-1 requests all
# available processors.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
)

## Training the model on dataset 1

In [9]:
# Run the cross-validated grid search on the dataset 1 training split only;
# the test split is kept aside for the final evaluation.
grid_search.fit(X1_train, y1_train)

In [10]:
# Hyperparameter combination selected by the grid search on dataset 1.
best_params = grid_search.best_params_
print(best_params)

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.8}


In [12]:
# Evaluate the estimator selected by the grid search on the held-out
# dataset 1 test split.
best_model = grid_search.best_estimator_
y1_pred = best_model.predict(X1_test)

# Weighted F1 is the same metric the search optimised; accuracy is reported
# alongside it. accuracy_score is imported in the notebook's import cell.
f1 = f1_score(y1_test, y1_pred, average='weighted')
accuracy = accuracy_score(y1_test, y1_pred)

print(f1)
print(accuracy)

0.8638306122219156
0.8681640625


## Training the model on dataset 2

In [13]:
# Build a new search object for dataset 2 with the same grid, scorer and
# 5-fold cross-validation. This rebinds `grid_search`, so the dataset 1
# search results are no longer reachable through that name.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=params,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
)

In [14]:
# Run the cross-validated grid search on the dataset 2 training split only;
# the test split is kept aside for the final evaluation.
grid_search.fit(X2_train, y2_train)

In [15]:
# Hyperparameter combination selected by the grid search on dataset 2
# (overwrites the dataset 1 value held in `best_params`).
best_params = grid_search.best_params_
print(best_params)

{'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400, 'subsample': 0.9}


In [16]:
# Evaluate the estimator selected by the grid search on the held-out
# dataset 2 test split.
best_model = grid_search.best_estimator_
y2_pred = best_model.predict(X2_test)

# Weighted F1 is the same metric the search optimised; accuracy is reported
# alongside it. accuracy_score is imported in the notebook's import cell.
f1 = f1_score(y2_test, y2_pred, average='weighted')
accuracy = accuracy_score(y2_test, y2_pred)

print(f1)
print(accuracy)

0.863941927584499
0.8683268229166666
