In [None]:
################################################################################
# Author 1:      Jakob Marktl
# MatNr 1:       12335939
# Author 2:      Christoph Nagy
# MatNr 2:       12331569
# Author 3:      Maria Mikic
# MatNr 3:       12234490
# File:          notebook.ipynb
# Description:   Trains several classifiers and simple baseline strategies on the preprocessed dataset and compares their evaluation metrics.
# Comments:    ... comments for the tutors ...
#              ... can be multiline ...
################################################################################


In [None]:
# Install the packages listed in the assignment's requirements file.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r "./assignment2/requirements.txt"

In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [None]:
import numpy as np
from pandas import DataFrame
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from assignment2.datasetClassifier import (
    DatasetHandler,
    DecisionTreeClassifier,
    GaussianNBClassifier,
    KNNClassifier,
    LogisticRegressionClassifier,
    RandomForestClassifierModel,
    SVMClassifier,
)
from assignment2.datasetPreProcessor import DatasetPreprocessor
from assignment2.graphing import Graphing
from assignment2.simpleBaselineClassifier import SimpleBaselineClassifier


In [None]:
# Run the preprocessor on the raw archive and hand its result to `to_csv` with
# the target path; a later cell passes the same path to DatasetHandler.
dataset_path = "cleaned_dataset.csv"

preprocessor = DatasetPreprocessor("student+performance.zip")
preprocessor.to_csv(dataset_path)

df: DataFrame = preprocessor.data

# A notebook cell only renders the value of its final expression, so the
# intermediate summaries have to be displayed explicitly or they are silently
# discarded. `display` is provided by the IPython kernel without an import.
display(df.describe())
display(df.value_counts())
# info() prints its report itself, so it needs no display() wrapper.
df.info()

In [None]:

# Models under comparison, keyed by the label that the later cells use for
# result lookup and plot titles. Insertion order matters: the score lists in
# `metrics` are filled in this same order.
classifiers = {
    # learned models
    "GaussianNB": GaussianNBClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNNClassifier(k=5),
    "RandomForest": RandomForestClassifierModel(n_estimators=100, random_state=0),
    "SVMClassifier": SVMClassifier(kernel="rbf", c=1.0),
    "LogisticRegression": LogisticRegressionClassifier(max_iter=5000, random_state=0),
    # simple baseline strategies
    "SBC_most_frequent": SimpleBaselineClassifier("most_frequent"),
    "SBC_uniform": SimpleBaselineClassifier("uniform", random_state=3),
    "SBC_constant": SimpleBaselineClassifier("constant", constant=3),
}

# The handler is built from the CSV path; it exposes the x_train / y_train /
# x_test / y_test attributes that the following cells read.
dataset_handler = DatasetHandler(dataset_path)
graphing = Graphing(dataset_handler)

# Predictions per classifier label, filled in by the training cell.
y_preds = dict()

# One score list per metric; entry i belongs to the i-th classifier above.
metrics = {
    metric_name: []
    for metric_name in ("Accuracy", "Precision", "Recall", "F1 Score")
}

In [None]:

# Empty the score lists first: they are created in another cell, so without
# this reset every re-run of this cell would append a second set of scores and
# misalign the lists with the classifier names.
for scores in metrics.values():
    scores.clear()

# Fit every classifier on the training split, cache its test-set predictions
# for the later plotting cells, and record macro-averaged scores.
for name, clf in classifiers.items():
    clf.fit(dataset_handler.x_train, dataset_handler.y_train)
    y_pred = clf.predict(dataset_handler.x_test)
    y_preds[name] = y_pred

    metrics["Accuracy"].append(accuracy_score(dataset_handler.y_test, y_pred))
    # zero_division=0 is passed to all three averaged metrics: classes that a
    # classifier never predicts (e.g. the baselines) would otherwise trigger
    # UndefinedMetricWarning. The resulting score is 0 either way.
    metrics["Precision"].append(
        precision_score(dataset_handler.y_test, y_pred, average="macro", zero_division=0)
    )
    metrics["Recall"].append(
        recall_score(dataset_handler.y_test, y_pred, average="macro", zero_division=0)
    )
    metrics["F1 Score"].append(
        f1_score(dataset_handler.y_test, y_pred, average="macro", zero_division=0)
    )

In [None]:
# Take the importances exposed by the fitted random-forest wrapper and sort
# them in ascending order before printing and plotting.
# NOTE(review): np.sort returns only the sorted values, so any positional link
# between an importance and its feature is lost here — confirm that the
# Graphing helpers do not pair these values with feature names by index.
forest_importances = classifiers["RandomForest"].feature_importances
importances = np.sort(forest_importances)

graphing.print_feature_importances(importances)
graphing.plot_feature_importances(importances)


In [None]:
# Columns handed to the correspondence plot.
selected_features = ["G1", "G2", "absences", "Walc", "age"]
graphing.plot_feature_correspondence(selected_features)

In [None]:
# One confusion matrix per classifier, drawn from the predictions cached in
# `y_preds` by the training cell (no refitting happens here).
for name in classifiers:
    graphing.plot_confusion_matrix(dataset_handler.y_test, y_preds[name], name)

In [None]:
# Plot the macro-averaged scores; the label list follows the insertion order of
# `classifiers`, which is the order in which the score lists were filled.
classifier_names = list(classifiers)
graphing.plot_evaluation_metrics(
    classifier_names,
    metrics,
    title="Evaluation Metrics by Classifier with average=macro",
)

In [None]:
# Recompute the scores with micro averaging from the cached predictions.
# NOTE(review): for single-label multiclass targets, micro-averaged precision,
# recall and F1 all coincide with accuracy, so the four series in this plot
# would be identical — confirm that this comparison is intended.
metrics_micro = {
    metric_name: []
    for metric_name in ("Accuracy", "Precision", "Recall", "F1 Score")
}
# Scorers that take an `average` argument, keyed by their metrics_micro entry.
micro_scorers = {
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

for name in classifiers:
    y_pred = y_preds[name]

    metrics_micro["Accuracy"].append(accuracy_score(dataset_handler.y_test, y_pred))
    for label, scorer in micro_scorers.items():
        metrics_micro[label].append(
            scorer(dataset_handler.y_test, y_pred, average="micro")
        )

graphing.plot_evaluation_metrics(
    list(classifiers),
    metrics_micro,
    title="Evaluation Metrics by Classifier with average=micro",
)

In [None]:
# Recompute the scores with support-weighted averaging from the cached
# predictions. The dict is rebuilt on every run, so this cell is safe to re-run.
metrics_weighted = {
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": []
}
for name in classifiers:
    y_pred = y_preds[name]

    metrics_weighted["Accuracy"].append(accuracy_score(dataset_handler.y_test, y_pred))
    # zero_division=0 is applied to all three averaged metrics, not just
    # precision: a class that is never predicted (or never present) otherwise
    # raises UndefinedMetricWarning. The resulting score is 0 either way.
    metrics_weighted["Precision"].append(
        precision_score(dataset_handler.y_test, y_pred, average="weighted", zero_division=0)
    )
    metrics_weighted["Recall"].append(
        recall_score(dataset_handler.y_test, y_pred, average="weighted", zero_division=0)
    )
    metrics_weighted["F1 Score"].append(
        f1_score(dataset_handler.y_test, y_pred, average="weighted", zero_division=0)
    )
graphing.plot_evaluation_metrics(list(classifiers.keys()), metrics_weighted, title="Evaluation Metrics by Classifier with average=weighted")