# Imports

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

# Data loading

In [49]:
students_dropout_and_academic_success = pd.read_csv("../../data/predict-students-dropout-and-academic-success/data.csv", delimiter=";")
students_dropout_and_academic_success

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


# Definition of domain knowledge

# Analysis and visualizations

In [50]:
pd.DataFrame({
    "Column Name": students_dropout_and_academic_success.columns,
    "Min value": students_dropout_and_academic_success.min(),
    "Max value": students_dropout_and_academic_success.max()
})

Unnamed: 0,Column Name,Min value,Max value
Marital status,Marital status,1,6
Application mode,Application mode,1,57
Application order,Application order,0,9
Course,Course,33,9991
Daytime/evening attendance,Daytime/evening attendance,0,1
Previous qualification,Previous qualification,1,43
Previous qualification (grade),Previous qualification (grade),95.0,190.0
Nacionality,Nacionality,1,109
Mother's qualification,Mother's qualification,1,44
Father's qualification,Father's qualification,1,44


# Data preprocessing

In [51]:
missing_values = students_dropout_and_academic_success.isnull()
missing_values = missing_values.sum()
print(missing_values)

Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship holder                                0
Age at enrol

## Remove missing values

## Map all booleans to their data type

In [52]:
print(students_dropout_and_academic_success.dtypes)

Marital status                                      int64
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance                          int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                         int64
Mother's qualification                              int64
Father's qualification                              int64
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Displaced                                           int64
Educational special needs                           int64
Debtor                                              int64
Tuition fees up to date                             int64
Gender        

In [53]:
boolean_columns = ["Daytime/evening attendance", "Previous qualification"]
students_dropout_and_academic_success[boolean_columns] = students_dropout_and_academic_success[boolean_columns].astype(bool)

## Map categorical values to string representations

## Show preprocessed data

In [54]:
students_dropout_and_academic_success

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,True,True,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,True,True,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,True,True,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,True,True,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,False,True,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,True,True,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,True,True,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,True,True,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,True,True,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


# Experiments

## Global parameters

In [55]:
seed = 1183743
test_size = 0.2
shuffle_train_test = True
drop_features = ["Target"]
scaler = StandardScaler()
scale_features = ["Curricular units 1st sem (credited)", "Curricular units 2nd sem (credited)","Unemployment rate", "Inflation rate", "GDP"]
category_encoder = OneHotEncoder()
#category_features = ["Marital status","Application mode","Application order","Course","Daytime/evening attendance","Previous qualification","Previous qualification (grade)","Nacionality","Mother's qualification","Father's qualification","Mother's occupation","Father's occupation","Displaced","Educational special needs","Debtor","Tuition fees up to date","Gender","Scholarship holder","Age at enrollment","International","Curricular units 1st sem (credited)","Curricular units 1st sem (enrolled)","Curricular units 1st sem (evaluations)","Curricular units 1st sem (approved)","Curricular units 1st sem (grade)","Curricular units 1st sem (without evaluations)","Curricular units 2nd sem (enrolled)","Curricular units 2nd sem (evaluations)","Curricular units 2nd sem (approved)","Curricular units 2nd sem (grade)","Curricular units 2nd sem (without evaluations)"
#]
category_features = []

In [56]:
# ensure that dropped features are not encoded
scale_features = list(filter(lambda feature: feature not in drop_features, scale_features))
category_features = list(filter(lambda feature: feature not in drop_features, category_features))
scale_features

['Curricular units 1st sem (credited)',
 'Curricular units 2nd sem (credited)',
 'Unemployment rate',
 'Inflation rate',
 'GDP']

## Metrics function

In [57]:
def calculate_performance_metrics(y_test, y_pred, class_names=None):        
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    
    return pd.Series({
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })
    
def calculate_per_class_accuracy(y_test, y_pred, class_names=None):
    if class_names is None:
        class_names = y_test.unique()
        
    present_classes = list(filter(lambda name: y_test.str.contains(name).any(), class_names))
    
    cm = confusion_matrix(y_test, y_pred)
    accuracies = np.divide(cm.diagonal(), cm.sum(axis=1))
    
    return pd.Series(accuracies[~np.isnan(accuracies)], index=present_classes)

## Data

### Split features and target value

In [58]:
X = students_dropout_and_academic_success.drop(drop_features, axis=1)
y = students_dropout_and_academic_success["Target"]

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed, shuffle=shuffle_train_test)

### Additional preprocessing for training and evaluation

In [60]:
column_transformer = make_column_transformer(
    (scaler, scale_features),
    (category_encoder, category_features),
    remainder="passthrough"
)

X_train = pd.DataFrame(column_transformer.fit_transform(X_train), columns=column_transformer.get_feature_names_out())
X_test = pd.DataFrame(column_transformer.transform(X_test), columns=column_transformer.get_feature_names_out())

## Classifier 1

### Parameters

In [61]:
n_trees = 90

### Training

In [62]:
rf_classifier = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
rf_classifier.fit(X_train, y_train)

### Prediction

In [63]:
y_pred = rf_classifier.predict(X_test)

### Evaluation

In [64]:
calculate_performance_metrics(y_test, y_pred)

Accuracy     0.768535
Precision    0.749933
Recall       0.768535
F1 Score     0.750671
dtype: float64

In [65]:
Target_mapping = {
    1: "dropout",
    2: "Graduate"
}

In [66]:
calculate_per_class_accuracy(y_test, y_pred)

Enrolled    0.761494
Graduate    0.296296
Dropout     0.929701
dtype: float64

## Classifier 2

### Parameters

In [67]:
n_neighbors = 4

### Training

In [68]:
sgd_classifier = SGDClassifier()
sgd_classifier.fit(X_train, y_train)

### Prediction

In [69]:
y_pred = sgd_classifier.predict(X_test)
y_pred

array(['Graduate', 'Graduate', 'Graduate', ..., 'Graduate', 'Graduate',
       'Graduate'], dtype='<U8')

### Evaluation

In [70]:
calculate_performance_metrics(y_test, y_pred)

Accuracy     0.521700
Precision    0.498813
Recall       0.521700
F1 Score     0.377824
dtype: float64

In [71]:
calculate_per_class_accuracy(y_test, y_pred)

Enrolled    0.043103
Graduate    0.005291
Dropout     0.985940
dtype: float64

## Classifier 3

### Parameters

### Training

In [75]:
mlp_classifier = MLPClassifier()
mlp_classifier.fit(X_train, y_train)

### Prediction

In [76]:
y_pred = mlp_classifier.predict(X_test)
y_pred

array(['Enrolled', 'Enrolled', 'Graduate', ..., 'Enrolled', 'Enrolled',
       'Enrolled'], dtype='<U8')

### Evaluation

In [77]:
calculate_performance_metrics(y_test, y_pred)

Accuracy     0.572333
Precision    0.693209
Recall       0.572333
F1 Score     0.598548
dtype: float64