# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

# Data loading

In [None]:
# Read the raw dataset; the CSV file uses ";" as its field separator.
students_dropout_and_academic_success = pd.read_csv(
    "../../data/predict-students-dropout-and-academic-success/data.csv",
    sep=";",
)
# Bare expression: in a notebook the frame is rendered as the cell output.
students_dropout_and_academic_success

# Definition of domain knowledge

# Analysis and visualizations

In [None]:
# Overview of the value range of every column: one row per column holding its
# name together with the smallest and the largest value found in it.
column_minimums = students_dropout_and_academic_success.min()
column_maximums = students_dropout_and_academic_success.max()
pd.DataFrame(
    {
        "Column Name": students_dropout_and_academic_success.columns,
        "Min value": column_minimums,
        "Max value": column_maximums,
    }
)

# Data preprocessing

## Remove missing values

## Map all booleans to their data type

## Map categorical values to string representations

## Show preprocessed data

In [None]:
# Display the data frame as it looks after the preprocessing steps of this section.
students_dropout_and_academic_success

# Experiments

## Global parameters

In [None]:
# Seed passed as `random_state` to the train/test split and to the classifiers.
# Placeholder: replace it with an integer (e.g. your student id) — scikit-learn
# does not accept a string as random_state.
seed = "your-studenid-or-any-other-seed"
# Intended fraction of the samples that is held out as test set.
test_size = 0.2
# Whether the samples are shuffled before they are split.
shuffle_train_test = True
# Placeholder: columns that are removed from the feature matrix.
drop_features = ["some-feature-that-should-not-be-used-for-training"]
# Transformer applied to the numeric columns listed in `scale_features`.
scaler = StandardScaler()
# Placeholder: columns that are standardised by `scaler`.
scale_features = ["some-feature-that-should-be-scaled"]
# Transformer applied to the columns listed in `category_features`.
# handle_unknown="ignore": a category that only occurs in the test split is
# encoded as all zeros instead of raising an error during `transform`.
category_encoder = OneHotEncoder(handle_unknown="ignore")
# Placeholder: columns that are one-hot encoded by `category_encoder`.
category_features = ["some-categorical-feature"]

In [None]:
# ensure that dropped features are not encoded
# Columns listed in drop_features are removed from the feature matrix, so they
# must not be handed to the scaler or the encoder either.
scale_features = [feature for feature in scale_features if feature not in drop_features]
category_features = [feature for feature in category_features if feature not in drop_features]

## Metrics function

In [None]:
def calculate_performance_metrics(y_test, y_pred, class_names=None):
    """Summarise the overall quality of a set of predictions.

    Args:
        y_test: True labels of the evaluated samples.
        y_pred: Predicted labels, in the same order as ``y_test``.
        class_names: Not used by this function.

    Returns:
        A ``pd.Series`` with the entries "Accuracy", "Precision", "Recall"
        and "F1 Score". Precision, recall and F1 are computed with
        ``average="weighted"``.
    """
    # The three scorers below share the same call signature and averaging mode.
    weighted_scorers = {
        "Precision": precision_score,
        "Recall": recall_score,
        "F1 Score": f1_score,
    }

    metrics = {"Accuracy": accuracy_score(y_test, y_pred)}
    for metric_name, scorer in weighted_scorers.items():
        metrics[metric_name] = scorer(y_test, y_pred, average="weighted")

    return pd.Series(metrics)
    
def calculate_per_class_accuracy(y_test, y_pred, class_names=None):
    """Compute, for every class, the share of its samples that was predicted correctly.

    This is the diagonal of the confusion matrix divided by its row sums
    (i.e. the recall of each class).

    Args:
        y_test: True labels of the evaluated samples.
        y_pred: Predicted labels, in the same order as ``y_test``.
        class_names: Classes to report, in the desired output order. Defaults
            to the sorted distinct values of ``y_test``.

    Returns:
        A ``pd.Series`` of floats indexed by class name. Classes without any
        sample in ``y_test`` are left out because their score is undefined.
    """
    # Work on plain arrays so that the (shuffled) index of y_test cannot cause
    # a misalignment with the positional predictions.
    true_labels = np.asarray(y_test)
    predicted_labels = np.asarray(y_pred)

    if class_names is None:
        class_names = np.unique(true_labels)

    accuracies = {}
    for name in class_names:
        # Exact equality: a class name that is merely a substring of another
        # one (e.g. "Graduate" / "Undergraduate") must not count as present.
        is_class = true_labels == name
        support = np.count_nonzero(is_class)
        if support == 0:
            continue
        hits = np.count_nonzero(predicted_labels[is_class] == name)
        accuracies[name] = hits / support

    return pd.Series(accuracies, dtype=float)

## Data

### Split features and target value

In [None]:
# Placeholder: name of the column that holds the label to predict.
target_feature = "your-target-feature"

# Features: every column except the explicitly dropped ones and the target
# itself, so the label can never leak into the training data.
# errors="ignore" keeps this working when the target is already part of
# drop_features.
X = (
    students_dropout_and_academic_success
    .drop(columns=drop_features)
    .drop(columns=target_feature, errors="ignore")
)
y = students_dropout_and_academic_success[target_feature]

In [None]:
# Split into a training and a held-out test part. `test_size` has to be passed
# explicitly, otherwise train_test_split falls back to its default of 0.25 and
# the globally configured value is silently ignored.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
    shuffle=shuffle_train_test,
)

### Additional preprocessing for training and evaluation

In [None]:
# Scale the numeric columns, encode the categorical ones and pass every other
# column through unchanged.
# sparse_threshold=0 forces a dense result: the encoder may produce a sparse
# matrix, which cannot be wrapped into a DataFrame with named columns below.
column_transformer = make_column_transformer(
    (scaler, scale_features),
    (category_encoder, category_features),
    remainder="passthrough",
    sparse_threshold=0,
)

# Fit on the training split only and apply the fitted transformation to the
# test split. The original row index is kept so that the features stay aligned
# with y_train / y_test.
X_train = pd.DataFrame(
    column_transformer.fit_transform(X_train),
    index=X_train.index,
    columns=column_transformer.get_feature_names_out(),
)
X_test = pd.DataFrame(
    column_transformer.transform(X_test),
    index=X_test.index,
    columns=column_transformer.get_feature_names_out(),
)

## Classifier 1

### Parameters

### Training

In [None]:
# Template cell: `YourClassifier`, `model_specific_parameter` and `your_parameter`
# are placeholders that are not defined in the visible source — substitute a
# concrete estimator and its settings. The shared `seed` is passed as random_state.
your_classifier = YourClassifier(model_specific_parameter=your_parameter, random_state=seed)
# Fit the classifier on the transformed training split.
your_classifier.fit(X_train, y_train)

### Prediction

In [None]:
# Predict the labels of the transformed test split with the classifier fitted above.
y_pred = your_classifier.predict(X_test)

### Evaluation

In [None]:
# Accuracy plus weighted precision, recall and F1 score of the predictions.
calculate_performance_metrics(y_test, y_pred)

In [None]:
# Accuracy per target class. `a_list_of_target_classes` is a placeholder that is
# not defined in the visible source — replace it with the class names to report.
calculate_per_class_accuracy(y_test, y_pred, class_names=a_list_of_target_classes)

## Classifier 2

### Parameters

### Training

In [None]:
# Template cell: `YourClassifier`, `model_specific_parameter` and `your_parameter`
# are placeholders that are not defined in the visible source — substitute a
# concrete estimator and its settings. The shared `seed` is passed as random_state.
# NOTE: this rebinds `your_classifier`, replacing the classifier of the previous section.
your_classifier = YourClassifier(model_specific_parameter=your_parameter, random_state=seed)
# Fit the classifier on the transformed training split.
your_classifier.fit(X_train, y_train)

### Prediction

In [None]:
# Predict the labels of the transformed test split; overwrites the `y_pred` of
# the previous classifier.
y_pred = your_classifier.predict(X_test)

### Evaluation

In [None]:
# Accuracy plus weighted precision, recall and F1 score of the predictions.
calculate_performance_metrics(y_test, y_pred)

In [None]:
# Accuracy per target class. `a_list_of_target_classes` is a placeholder that is
# not defined in the visible source — replace it with the class names to report.
calculate_per_class_accuracy(y_test, y_pred, class_names=a_list_of_target_classes)

## Classifier 3

### Parameters

### Training

In [None]:
# Template cell: `YourClassifier`, `model_specific_parameter` and `your_parameter`
# are placeholders that are not defined in the visible source — substitute a
# concrete estimator and its settings. The shared `seed` is passed as random_state.
# NOTE: this rebinds `your_classifier`, replacing the classifier of the previous section.
your_classifier = YourClassifier(model_specific_parameter=your_parameter, random_state=seed)
# Fit the classifier on the transformed training split.
your_classifier.fit(X_train, y_train)

### Prediction

In [None]:
# Predict the labels of the transformed test split; overwrites the `y_pred` of
# the previous classifier.
y_pred = your_classifier.predict(X_test)

### Evaluation

In [None]:
# Accuracy plus weighted precision, recall and F1 score of the predictions.
calculate_performance_metrics(y_test, y_pred)

In [None]:
# Accuracy per target class. `a_list_of_target_classes` is a placeholder that is
# not defined in the visible source — replace it with the class names to report.
calculate_per_class_accuracy(y_test, y_pred, class_names=a_list_of_target_classes)