## Prediction student's dropout and academic success 

We aim to create a machine learning algorithm that can anticipate a student's academic progress after the first semester, with a particular emphasis on forecasting whether they will successfully complete the course or withdraw.

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
pd.options.display.max_columns = 100
import warnings 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

from tqdm.notebook import tqdm
import pickle

In [2]:
# Loading the Data
df = pd.read_csv("./dataset.csv", delimiter=';')

In [3]:
# The "Enrolled" values will not be used in the model
df = df[df.Target != "Enrolled"]

In [4]:
# Rename columns
df = df.rename(columns= {'Marital status': 'Marital_Status',
                    'Application mode': 'Application_Mode',
                    'Application order': 'Application_Order',
                    'Daytime/evening attendance': 'Day_Ev_Attend',
                    'Previous qualification': 'Previous_Qualification',
                    'Nacionality': 'Nationality',
                    "Mother's qualification": 'Mother_Qualification',
                    "Father's qualification": 'Father_Qualification',
                    "Mother's occupation": 'Mother_Occupation',
                    "Father's occupation": 'Father_Occupation',
                    'Educational special needs': 'Special_Needs',
                    'Tuition fees up to date': 'Tuition_Fees',
                    'Scholarship holder': 'Scholarship',
                    'Age at enrollment': 'Enrollment_Age',
                    'Curricular units 1st sem (credited)': 'CU_1st_Credited',
                    'Curricular units 1st sem (enrolled)': 'CU_1st_Enrolled',
                    'Curricular units 1st sem (evaluations)': 'CU_1st_Evaluations',
                    'Curricular units 1st sem (approved)': 'CU_1st_Approved',
                    'Curricular units 1st sem (grade)': 'CU_1st_Grade',
                    'Curricular units 1st sem (without evaluations)': 'CU_1st_NoEvaluations',
                    'Curricular units 2nd sem (credited)': 'CU_2nd_Credited',
                    'Curricular units 2nd sem (enrolled)': 'CU_2nd_Enrolled',
                    'Curricular units 2nd sem (evaluations)': 'CU_2nd_Evaluations',
                    'Curricular units 2nd sem (approved)': 'CU_2nd_Approved',
                    'Curricular units 2nd sem (grade)': 'CU_2nd_Grade',
                    'Curricular units 2nd sem (without evaluations)': 'CU_2nd_NoEvaluations',
                    'Unemployment rate': 'Unemploymen_Rate',
                    'Inflation rate': 'Inflation_Rate',
                   })

In [5]:
df['Target'] = df['Target'].map({'Dropout':0, 'Graduate':1})

In [6]:
#Looking at the corelation, we need to select the required columns for prediction.
df.corr()['Target']

Marital_Status           -0.100479
Application_Mode         -0.233888
Application_Order         0.094355
Course                    0.006814
Day_Ev_Attend             0.084496
Previous_Qualification   -0.102795
Nationality              -0.003823
Mother_Qualification     -0.048459
Father_Qualification     -0.003850
Mother_Occupation         0.064195
Father_Occupation         0.073238
Displaced                 0.126113
Special_Needs            -0.007254
Debtor                   -0.267207
Tuition_Fees              0.442138
Gender                   -0.251955
Scholarship               0.313018
Enrollment_Age           -0.267229
International             0.006181
CU_1st_Credited           0.046900
CU_1st_Enrolled           0.161074
CU_1st_Evaluations        0.059786
CU_1st_Approved           0.554881
CU_1st_Grade              0.519927
CU_1st_NoEvaluations     -0.074642
CU_2nd_Credited           0.052402
CU_2nd_Enrolled           0.182897
CU_2nd_Evaluations        0.119239
CU_2nd_Approved     

In [7]:
df.columns

Index(['Marital_Status', 'Application_Mode', 'Application_Order', 'Course',
       'Day_Ev_Attend', 'Previous_Qualification', 'Nationality',
       'Mother_Qualification', 'Father_Qualification', 'Mother_Occupation',
       'Father_Occupation', 'Displaced', 'Special_Needs', 'Debtor',
       'Tuition_Fees', 'Gender', 'Scholarship', 'Enrollment_Age',
       'International', 'CU_1st_Credited', 'CU_1st_Enrolled',
       'CU_1st_Evaluations', 'CU_1st_Approved', 'CU_1st_Grade',
       'CU_1st_NoEvaluations', 'CU_2nd_Credited', 'CU_2nd_Enrolled',
       'CU_2nd_Evaluations', 'CU_2nd_Approved', 'CU_2nd_Grade',
       'CU_2nd_NoEvaluations', 'Unemploymen_Rate', 'Inflation_Rate', 'GDP',
       'Target'],
      dtype='object')

In [8]:
# Selection of data for machine learning, taking into account multicorrelation from EDA and the goal of the project
df = df[[#'Marital_Status', 
         'Application_Mode', 
         #'Application_Order', 
         #'Course', 
         #'Day_Ev_Attend', 
         'Previous_Qualification', 
         #'Nationality',
         #'Mother_Qualification', 
         #'Father_Qualification', 
         #'Mother_Occupation', 'Father_Occupation', 'Displaced', 
         #'Special_Needs', 
         'Debtor',
         'Tuition_Fees', 
         'Gender', 
         'Scholarship', 
         'Enrollment_Age',
         #'International', 
         #'CU_1st_Credited', 
         #'CU_1st_Enrolled', 
         #'CU_1st_Evaluations', 
         'CU_1st_Approved', 
         #'CU_1st_Grade',
         #'CU_1st_NoEvaluations', 
         #'CU_2nd_Credited', 'CU_2nd_Enrolled',
         #'CU_2nd_Evaluations', 'CU_2nd_Approved', 'CU_2nd_Grade',
         #'CU_2nd_NoEvaluations', 
         #'Unemploymen_Rate', 'Inflation_Rate', 'GDP',
        'Target']].copy()

In [9]:
df.shape

(3630, 9)

In [10]:
# Selecting independent and dependent variables
X = df.drop("Target", axis=1)
y = df.Target

In [11]:
# Prepare the cross-validation procedure
cv = KFold(n_splits=5, random_state=1, shuffle=True)

In [12]:
def model_tester(model_dict, X, y, cv=5):    
    scoring_param = ["accuracy", "precision", "recall", "f1_micro", "f1_macro"]
    cv_score = {} # To append Scores of each Model
    warnings.filterwarnings("ignore")
    np.random.seed(42)
    for name, model in tqdm(model_dict.items()):
        # enumerating through All Models in Dictionery
        cv_score[name] = {}
        for param in scoring_param:
            # Calculating Mean values for cross validation with each Parameter
            score = np.mean(cross_val_score(model, X, y, scoring=param, cv=cv))
            cv_score[name][param] = score 
    cv = pd.DataFrame(data=cv_score)
    return cv

In [13]:
model_dict = {'Logistc Regression' : LogisticRegression(),
              'Random Forest' : RandomForestClassifier(),
              'Balanced Random Forest' : BalancedRandomForestClassifier(),
              'KNN' : KNeighborsClassifier(),
              'SVM': SVC(),
              'XGBOOST': XGBClassifier()}

model_tester(model_dict, X, y, cv=cv)

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,Logistc Regression,Random Forest,Balanced Random Forest,KNN,SVM,XGBOOST
accuracy,0.860606,0.846281,0.841873,0.83168,0.841598,0.85124
precision,0.858474,0.854389,0.873773,0.831427,0.824737,0.8585
recall,0.923704,0.910246,0.868328,0.907213,0.939163,0.904835
f1_micro,0.860606,0.848485,0.841047,0.83168,0.841598,0.85124
f1_macro,0.84977,0.838368,0.832926,0.817776,0.825529,0.841179


Logistic regression is the most optimal model. The next step is to tune this model by changing the model's hyper parameters.

In [14]:
# Setting grid for logistic regression parameters 
lr_grid = {"solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
           "penalty" : ["l1", "l2", "elasticnet"],
           "C" : np.logspace(-4, 4, 30),
           "class_weight" : ["balanced", "None"]}

gs_log_reg_model = GridSearchCV(LogisticRegression(),
                                param_grid=lr_grid,
                                cv=cv,
                                verbose=1)
gs_log_reg_model.fit(X, y)

Fitting 5 folds for each of 900 candidates, totalling 4500 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': array([1.00000000e-04, 1.88739182e-04, 3.56224789e-04, 6.72335754e-04,
       1.26896100e-03, 2.39502662e-03, 4.52035366e-03, 8.53167852e-03,
       1.61026203e-02, 3.03919538e-02, 5.73615251e-02, 1.08263673e-01,
       2.04335972e-01, 3.85662042e-01, 7.27895384e-01, 1.3738238...,
       2.59294380e+00, 4.89390092e+00, 9.23670857e+00, 1.74332882e+01,
       3.29034456e+01, 6.21016942e+01, 1.17210230e+02, 2.21221629e+02,
       4.17531894e+02, 7.88046282e+02, 1.48735211e+03, 2.80721620e+03,
       5.29831691e+03, 1.00000000e+04]),
                         'class_weight': ['balanced', 'None'],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             verbose=1)

In [15]:
print("Tuned hyperparameters for logistic regression:", gs_log_reg_model.best_params_) 

Tuned hyperparameters for logistic regression: {'C': 4.893900918477489, 'class_weight': 'None', 'penalty': 'l2', 'solver': 'newton-cg'}


In [19]:
# Run the logistic regression model with tuned hyperparameters
tuned_model_dict = {"Logistic Regression" : LogisticRegression(penalty='l2',
                                                              solver='newton-cg',
                                                              C=4.893900918477489)}
model_tester(tuned_model_dict, X, y, cv=cv)

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Logistic Regression
accuracy,0.860882
f1_macro,0.85011
f1_micro,0.860882
precision,0.858805
recall,0.923704


The model, in the form of a python object, was converted into a character stream using pickling. The purpose of this was to include all the information required to reconstruct the object in a different python script.

In [20]:
pickle.dump(gs_log_reg_model, open('model.p','wb'))
model = pickle.load(open('model.p','rb'))

In [21]:
print(model.predict([[8, 1, 1, 1, 0, 0, 25, 6]]))

[1]


We can use this file in our production application to estimate the success rate of a student by providing information about the student.