In [38]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns
import xgboost as xgb

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from ucimlrepo import fetch_ucirepo 

In [54]:
# Dataset for student outcomes

# Fetch dataset 
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
  
# Data (as pandas dataframes) 
X = predict_students_dropout_and_academic_success.data.features 
y = predict_students_dropout_and_academic_success.data.targets 
  
# Metadata; commenting out because it's a lot!
# This code calling up .metadata is specific to this dataframe,
# would not work in most pandas dataframes
# print(predict_students_dropout_and_academic_success.metadata) 
  
# Variable information
# NOTE: This code works with this dataframe, but typically when
# working in Pandas, one would pull up information on the 
# columns/variables using data_frame_name.info()
print(predict_students_dropout_and_academic_success.variables) 

                                              name     role         type  \
0                                   Marital Status  Feature      Integer   
1                                 Application mode  Feature      Integer   
2                                Application order  Feature      Integer   
3                                           Course  Feature      Integer   
4                       Daytime/evening attendance  Feature      Integer   
5                           Previous qualification  Feature      Integer   
6                   Previous qualification (grade)  Feature   Continuous   
7                                      Nacionality  Feature      Integer   
8                           Mother's qualification  Feature      Integer   
9                           Father's qualification  Feature      Integer   
10                             Mother's occupation  Feature      Integer   
11                             Father's occupation  Feature      Integer   
12          

In [11]:
# Have 3 categories, but for this demonstration, want just 2
# Recoding Target into 2 labeled and 2 numeric categories
# For numeric categories, 1 = 'Late grad or drop-out'
y_recode = y.copy()
y_recode['TargetLabel'] = np.where(y_recode['Target'] == 'Graduate', 'On-time grad', 'Late grad or drop-out')
y_recode['TargetNumeric'] = np.where(y_recode['Target'] == 'Graduate', 0, 1)

In [44]:
# Set aside 20% of the data as final testing dataset
X_modeling, X_test, y_modeling, y_test = train_test_split(
    X, y_recode['TargetNumeric'], test_size=0.20, random_state=55)

In [45]:
# Get training and validation data
X_train, X_validate, y_train, y_validate = train_test_split(
    X_modeling, y_modeling, test_size=0.20, random_state=55)

In [46]:
# Fit a random forest classifier
rand_for = RandomForestClassifier(random_state=55)
rand_for.fit(X_train, y_train)

# Get predictions
rand_for_preds = rand_for.predict(X_validate)

# Print classification report
print(classification_report(rand_for_preds, y_validate))

              precision    recall  f1-score   support

           0       0.90      0.80      0.85       372
           1       0.80      0.90      0.85       336

    accuracy                           0.85       708
   macro avg       0.85      0.85      0.85       708
weighted avg       0.85      0.85      0.85       708



In [52]:
# Fit a gradient boosting classifier
grad_boost = GradientBoostingClassifier(random_state=55)
grad_boost.fit(X_train, y_train)

# Get predictions
grad_boost_preds = grad_boost.predict(X_validate)

# Print classification report
print(classification_report(grad_boost_preds, y_validate))

# NOTE: Original model I ran had these parameters:
# (n_estimators=100, learning_rate=0.1, max_depth=3, random_state=55)

              precision    recall  f1-score   support

           0       0.89      0.80      0.84       369
           1       0.81      0.89      0.85       339

    accuracy                           0.85       708
   macro avg       0.85      0.85      0.85       708
weighted avg       0.85      0.85      0.85       708



In [53]:
# NOTE: May exclude this part
# Source: https://www.datacamp.com/tutorial/xgboost-in-python
# Train a model using the scikit-learn API
xgb_classifier = xgb.XGBClassifier(n_estimators=100, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=3, enable_categorical=True)
xgb_classifier.fit(X_train, y_train)

# Convert the model to a native API model
model = xgb_classifier.get_booster()

# Get predictions
xgb_preds = xgb_classifier.predict(X_validate)

# Print classification report
print(classification_report(xgb_preds, y_validate))

              precision    recall  f1-score   support

           0       0.90      0.80      0.84       374
           1       0.80      0.90      0.84       334

    accuracy                           0.84       708
   macro avg       0.85      0.85      0.84       708
weighted avg       0.85      0.84      0.84       708

