# Will I be able to graduate??

With recent advancements in generative AI, and state-of-the-art models constantly outdoing one another, it can be tough being a data science student trying to keep up so that what you have learned does not become outdated too quickly.

Will I be able to graduate or will I drop out?? Well, let's build a model to find out!

We will be using the UCI Machine Learning Repository to fetch dataset id 697, which contains the profiles of university students (such as gender, admission grade, or even their mother's occupation), with a target variable indicating whether they are enrolled, graduated or dropped out.

We will be using the supervised classification method XGBoost, which is going to also give us explainability on which features had higher impact/importance, and in the end, I will be entering my own data into the model to see if I can graduate this course or not!

In [None]:
# Import Packages

from ucimlrepo import fetch_ucirepo
import sklearn
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from matplotlib import pyplot as plt

# Show up to 500 rows and 500 columns so wide/long frames are not truncated
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# Extract Data
# Fetch dataset 697 from the UCI repository and separate features from targets

base = fetch_ucirepo(id=697)
dataset = base.data
X, y = dataset.features, dataset.targets

In [None]:
# Keep only the 'Graduate' and 'Dropout' outcomes, then encode the label as an int
# (Graduate -> 1, Dropout -> 0)

print(len(X),len(y))
enrolled_idx = y.index[y['Target'] == 'Enrolled']  # rows still studying: no final outcome yet
X = X.drop(index=enrolled_idx).copy()
y = y.drop(index=enrolled_idx).copy()
print(len(X),len(y))
y = y.replace(to_replace=['Graduate', 'Dropout'], value=[1, 0]).astype('int')

In [None]:
# Check for missing values
# Print the name of every column that contains at least one NaN

for col in X.columns:
    if X[col].isna().any():
        print(col)


### Data Preprocessing

In [None]:
# First identify the discrete (categorical) variables, one-hot encode them,
# and standardise the continuous variables

discrete_var = ['Marital Status','Application mode','Course','Daytime/evening attendance',
   'Previous qualification','Nacionality',"Mother's qualification", "Father's qualification",
   "Mother's occupation","Father's occupation",'Displaced','Educational special needs',
'Debtor','Tuition fees up to date','Gender','Scholarship holder','International']

# Cast the category codes to str so get_dummies treats them as labels, not numbers
X[discrete_var] = X[discrete_var].astype('str')

# Everything that is not discrete is continuous. Keep the original column order:
# a set difference would give an arbitrary order that can change between runs,
# which makes the feature matrix layout non-reproducible.
continuous_var = [col for col in X.columns if col not in discrete_var]

# NOTE: the scaler is fitted on all rows, i.e. before the train/test split, so the
# test rows contribute to the mean/std used for scaling.
scale = StandardScaler()
X[continuous_var] = scale.fit_transform(X[continuous_var])

# Final feature matrix: scaled continuous columns + one-hot encoded discrete columns
# (dummy columns are named '<column>_<value>')
features = X[continuous_var].join(pd.get_dummies(X[discrete_var]))

In [None]:
# Train Test Split
# Hold out 20% of the rows for testing; fixed seed for a repeatable split

X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=123,
)

### Training XGB Model

In [None]:
# Gradient-boosted classifier on the full feature set:
# 1000 boosting rounds, learning rate (eta) 0.1, trees of depth 3
xgb_classifier = xgb.XGBClassifier(
    n_estimators=1000,
    eta=0.1,
    max_depth=3,
)
xgb_classifier.fit(X_train, y_train)

In [None]:
# Hard class predictions (0 = Dropout, 1 = Graduate) for both splits
pred_train, pred_test = (
    xgb_classifier.predict(split) for split in (X_train, X_test)
)

### Evaluation

In [None]:
def _report_split(title, y_true, y_pred):
    """Print accuracy/precision/recall/f1 for one split and show its confusion matrix."""
    # Import the submodule explicitly: a bare `import sklearn` does not guarantee
    # that `sklearn.metrics` has been loaded.
    from sklearn import metrics

    print(title)
    confusion_matrix = metrics.confusion_matrix(y_true, y_pred)
    cm_display = metrics.ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix, display_labels=[False, True]
    )
    # Builtin round() works whether the metric comes back as a NumPy scalar or a
    # plain Python float (which has no .round() method).
    print('Accuracy:', round(metrics.accuracy_score(y_true, y_pred), 2))
    print('Precision:', round(metrics.precision_score(y_true, y_pred), 2))
    print('Recall:', round(metrics.recall_score(y_true, y_pred), 2))
    print('f1:', round(metrics.f1_score(y_true, y_pred), 2))
    cm_display.plot()
    plt.show()


def xgb_eval(y_train, pred_train,y_test, pred_test):
    """Print and plot classification metrics for the train and test splits.

    For each split this prints accuracy, precision, recall and f1 and shows the
    confusion matrix; it then prints the ROC AUC of both splits and draws both
    ROC curves on one figure.

    Parameters:
        y_train: true binary labels of the training split.
        pred_train: predictions for the training split.
        y_test: true binary labels of the test split.
        pred_test: predictions for the test split.

    Returns:
        None. All output is printed or plotted.

    Note:
        The AUC and ROC curve are computed from whatever is passed as
        predictions. If hard 0/1 labels are passed, the curve has only one
        operating point; pass positive-class probabilities for a full curve.
    """
    from sklearn import metrics

    _report_split('Train Evaluation', y_train, pred_train)
    _report_split('Test Evaluation', y_test, pred_test)

    print('AUC Train:', round(metrics.roc_auc_score(y_train, pred_train), 2))
    print('AUC Test:', round(metrics.roc_auc_score(y_test, pred_test), 2))
    fpr1, tpr1, _ = metrics.roc_curve(y_train, pred_train)
    fpr2, tpr2, _ = metrics.roc_curve(y_test, pred_test)
    # Label the two curves so train and test can be told apart
    plt.plot(fpr1, tpr1, label='Train')
    plt.plot(fpr2, tpr2, label='Test')
    plt.legend()
    plt.show()

In [None]:
# Evaluate the full-feature model on the train and test splits
xgb_eval(y_train, pred_train,y_test, pred_test)

### Simplify the model

In [None]:
# Horizontal bar chart of the 10 most important features (ascending importance)
importances = xgb_classifier.feature_importances_
sorted_idx = importances.argsort()
top_idx = sorted_idx[-10:]
plt.barh(X_train.columns[top_idx], importances[top_idx])
plt.title('XGB Feature Importance')

In [None]:
# Hand-picked subset of features for the simplified model
newvar = [
    'Curricular units 2nd sem (approved)',
    'Tuition fees up to date_0',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 1st sem (approved)',
    'Scholarship holder_0',
    'Debtor_0',
    'Curricular units 1st sem (enrolled)',
    'Gender_0',
    'Course_9130',
]

# Same hyper-parameters as the full model, trained on the reduced feature set
xgb_classifier2 = xgb.XGBClassifier(
    n_estimators=1000,
    eta=0.1,
    max_depth=3,
)
xgb_classifier2.fit(X_train[newvar], y_train)

pred_train2, pred_test2 = (
    xgb_classifier2.predict(split[newvar]) for split in (X_train, X_test)
)

In [None]:
# Evaluate the simplified (reduced-feature) model on the train and test splits
xgb_eval(y_train, pred_train2,y_test, pred_test2)

### Moment of truth: will I graduate?

In [None]:
# My own profile, expressed in the same columns the simplified model was trained on.
# Continuous values are entered raw (unscaled) here and standardised below.
my_profile = {
    'Curricular units 2nd sem (approved)': 3,
    'Tuition fees up to date_0': False,
    'Curricular units 2nd sem (enrolled)': 3,
    'Curricular units 1st sem (approved)': 3,
    'Scholarship holder_0': True,
    'Debtor_0': True,
    'Curricular units 1st sem (enrolled)': 3,
    'Gender_0': False,
    'Course_9130': True,
}
# columns=newvar keeps the column order identical to the training frame
mydata = pd.DataFrame([my_profile], columns=newvar)

cont_val = ['Curricular units 2nd sem (approved)','Curricular units 2nd sem (enrolled)','Curricular units 1st sem (approved)','Curricular units 1st sem (enrolled)']

# Standardise with the statistics learned from the training data. Do NOT call
# fit_transform here: refitting on a single row turns every value into 0 and
# throws away the fitted scaler. The scaler was fitted on all continuous
# columns, so look up the mean/std of just the needed columns by name.
scaler_stats = pd.DataFrame(
    {'mean': scale.mean_, 'std': scale.scale_},
    index=scale.feature_names_in_,
)
mydata[cont_val] = (
    (mydata[cont_val] - scaler_stats.loc[cont_val, 'mean'])
    / scaler_stats.loc[cont_val, 'std']
)

# 1 = Graduate, 0 = Dropout
xgb_classifier2.predict(mydata)


Yes!