In [80]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns
import xgboost as xgb

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from ucimlrepo import fetch_ucirepo 

In [56]:
# Dataset for student outcomes

# Fetch dataset 
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
  
# Data (as pandas dataframes) 
X = predict_students_dropout_and_academic_success.data.features 
y = predict_students_dropout_and_academic_success.data.targets 
  
# Metadata; commenting out because it's a lot!
# This code calling up .metadata is specific to this dataframe,
# would not work in most pandas dataframes
# print(predict_students_dropout_and_academic_success.metadata) 
  
# Variable information
# NOTE: This code works with this dataframe, but typically when
# working in Pandas, one would pull up information on the 
# columns/variables using data_frame_name.info()
print(predict_students_dropout_and_academic_success.variables) 

                                              name     role         type  \
0                                   Marital Status  Feature      Integer   
1                                 Application mode  Feature      Integer   
2                                Application order  Feature      Integer   
3                                           Course  Feature      Integer   
4                       Daytime/evening attendance  Feature      Integer   
5                           Previous qualification  Feature      Integer   
6                   Previous qualification (grade)  Feature   Continuous   
7                                      Nacionality  Feature      Integer   
8                           Mother's qualification  Feature      Integer   
9                           Father's qualification  Feature      Integer   
10                             Mother's occupation  Feature      Integer   
11                             Father's occupation  Feature      Integer   
12          

In [57]:
# Have 3 categories, but for this demonstration, want just 2
# Recoding Target into 2 labeled and 2 numeric categories
# For numeric categories, 1 = 'Late grad or drop-out'
y_recode = y.copy()
y_recode['TargetLabel'] = np.where(y_recode['Target'] == 'Graduate', 'On-time grad', 'Late grad or drop-out')
y_recode['TargetNumeric'] = np.where(y_recode['Target'] == 'Graduate', 0, 1)

In [58]:
# Set aside 20% of the data as final testing dataset
X_modeling, X_test, y_modeling, y_test = train_test_split(
    X, y_recode['TargetNumeric'], test_size=0.20, random_state=55)

In [59]:
# Get training and validation data
X_train, X_validate, y_train, y_validate = train_test_split(
    X_modeling, y_modeling, test_size=0.20, random_state=55)

In [87]:
# Fit a random forest classifier
rand_for = RandomForestClassifier(random_state=55)
rand_for.fit(X_train, y_train)

# Get predictions
rand_for_preds = rand_for.predict(X_validate)

# Print f1 score, classification report
print('Rand Forest f1 score: ', f1_score(rand_for_preds, y_validate))
print('Rand Forest classification_report: \n', classification_report(rand_for_preds, y_validate))

Rand Forest f1 score:  0.847887323943662
Rand Forest classification_report: 
               precision    recall  f1-score   support

           0       0.90      0.80      0.85       372
           1       0.80      0.90      0.85       336

    accuracy                           0.85       708
   macro avg       0.85      0.85      0.85       708
weighted avg       0.85      0.85      0.85       708



In [85]:
# Fit a gradient boosting classifier
grad_boost = GradientBoostingClassifier(random_state=55)
grad_boost.fit(X_train, y_train)

# Get predictions
grad_boost_preds = grad_boost.predict(X_validate)

# Print f1 score, classification report
print('Grad boost f1 score: ', f1_score(grad_boost_preds, y_validate))
print('Grad boost classification report: \n', classification_report(grad_boost_preds, y_validate))

GradBoostClassifier f1 score:  0.847124824684432
GradBoostClassifier classification report: 
               precision    recall  f1-score   support

           0       0.89      0.80      0.84       369
           1       0.81      0.89      0.85       339

    accuracy                           0.85       708
   macro avg       0.85      0.85      0.85       708
weighted avg       0.85      0.85      0.85       708



In [84]:
# NOTE: May exclude this part
# Source: https://www.datacamp.com/tutorial/xgboost-in-python
# Train a model using the scikit-learn API
xgb_classifier = xgb.XGBClassifier(
    n_estimators=100,
    objective='binary:logistic',
    tree_method='hist',
    eta=0.1,
    max_depth=3,
    enable_categorical=True,
    random_state=55
)
xgb_classifier.fit(X_train, y_train)

# Convert the model to a native API model
model = xgb_classifier.get_booster()

# Get predictions
xgb_preds = xgb_classifier.predict(X_validate)

# Print f1 score, classification report
print('XGBoost f1 score: ', f1_score(xgb_preds, y_validate))
print('XGBoost classsification report: \n', classification_report(xgb_preds, y_validate))

XGBoost f1 score:  0.844632768361582
XGBoost classsification report: 
               precision    recall  f1-score   support

           0       0.90      0.80      0.84       374
           1       0.80      0.90      0.84       334

    accuracy                           0.84       708
   macro avg       0.85      0.85      0.84       708
weighted avg       0.85      0.84      0.84       708



In [89]:
# Hyperparameter tuning
# You can go all out hyperparameter tuning and tune for everything.
# For the sake of demonstrating hyperparameter tuning without making
# you wait too long for the code to run, below I just select a few
# hyperparameters with a couple of values to show how to set it up.
# I am using GridSearch in scikit learn, which tries out all combinations
# of hyperparameters and selects the best combination according to the
# selected metric you want to optimize for. A "quicker" version of GridSearch
# is RandomizedSearchCV, which randomly selects from among combinations to
# do a good but not exhaustive job of testing out combinations with greater
# speed. An even "smarter" way to hyperparameter tune is to use a tool like
# Optuna, which searches for hyperparameter values that do a good job with the
# metric you are optimizing for and then tries out many more values close to
# those values to more efficiently find the best combination of values...but
# Optuna is beyond the scope for these worked examples!

# Source for code example: https://www.geeksforgeeks.org/how-to-tune-hyperparameters-in-gradient-boosting-algorithm/

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.005, 0.01, 0.1, 0.2],
    'max_depth': [2, 3, 4],
}

# Initialize GridSearchCV
grid_search_grad_boost = GridSearchCV(estimator=grad_boost, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)

# Fit the model to the training data using GridSearchCV
grid_search_grad_boost.fit(X_train, y_train)

# Get the best parameters and best model
best_params_grad_boost = grid_search_grad_boost.best_params_
best_model_grad_boost = grid_search_grad_boost.best_estimator_

# Make predictions on the test set using the best model
y_pred_best_grad_boost = best_model_grad_boost.predict(X_validate)

# Evaluate the best model
f1_best_grad_boost = f1_score(y_validate, y_pred_best)
class_report_best_grad_boost = classification_report(y_validate, y_pred_best)

# Print the results
print("Grad boost best parameters: ", best_params)
print(f"Grad boost best model f1 score:  {f1_best_grad_boost}")
print(f"Grad boost best model classification report: \n{class_report_best_grad_boost}")


Best Parameters:  {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
Best Model Accuracy:  0.849083215796897
Best Model Classification Report: 
              precision    recall  f1-score   support

           0       0.80      0.90      0.85       334
           1       0.90      0.80      0.85       374

    accuracy                           0.85       708
   macro avg       0.85      0.85      0.85       708
weighted avg       0.85      0.85      0.85       708



In [None]:
# Hyperparameter tuning for random-forest
# Source for example code: https://www.geeksforgeeks.org/random-forest-hyperparameter-tuning-in-python/