In [1]:
################################################################################
# Author 1:      Jakob Marktl
# MatNr 1:       12335939
# Author 2:      Christoph Nagy
# MatNr 2:       12331569
# Author 3:      Maria Mikic
# MatNr 3:       12234490
# File:          notebook.ipynb
# Description:   A simple baseline classifier that makes predictions based on a specified strategy.
# Comments:    ... comments for the tutors ...
#              ... can be multiline ...
################################################################################


In [9]:
%pip install -r "./assignment2/requirements.txt"

Note: you may need to restart the kernel to use updated packages.


In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [4]:
import numpy as np
from pandas import DataFrame
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from assignment2.datasetClassifier import (
    DatasetHandler,
    DecisionTreeClassifier,
    GaussianNBClassifier,
    KNNClassifier,
    LogisticRegressionClassifier,
    RandomForestClassifierModel,
    SVMClassifier,
)
from assignment2.datasetPreProcessor import DatasetPreprocessor
from assignment2.graphing import Graphing
from assignment2.simpleBaselineClassifier import SimpleBaselineClassifier


In [5]:
dataset_path = "cleaned_dataset.csv"

preprocessor = DatasetPreprocessor("student+performance.zip")
preprocessor.to_csv(dataset_path)

df: DataFrame = preprocessor.data

df.describe()
df.value_counts()
df.info()

Saved cleaned CSV to cleaned_dataset.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   school      1044 non-null   int64
 1   sex         1044 non-null   int64
 2   age         1044 non-null   int64
 3   address     1044 non-null   int64
 4   famsize     1044 non-null   int64
 5   Pstatus     1044 non-null   int64
 6   Medu        1044 non-null   int64
 7   Fedu        1044 non-null   int64
 8   Mjob        1044 non-null   int64
 9   Fjob        1044 non-null   int64
 10  reason      1044 non-null   int64
 11  guardian    1044 non-null   int64
 12  traveltime  1044 non-null   int64
 13  studytime   1044 non-null   int64
 14  failures    1044 non-null   int64
 15  schoolsup   1044 non-null   int64
 16  famsup      1044 non-null   int64
 17  paid        1044 non-null   int64
 18  activities  1044 non-null   int64
 19  nursery     1044 non-null   

  df[col] = df[col].replace(mapping)
  df.replace({"yes": 1, "no": 0}, inplace=True)


In [6]:

classifiers = {
    "GaussianNB": GaussianNBClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNNClassifier(k=5),
    "RandomForest": RandomForestClassifierModel(n_estimators=100, random_state=0),
    "SVMClassifier": SVMClassifier(kernel="rbf", c=1.0),
    "LogisticRegression": LogisticRegressionClassifier(max_iter=5000, random_state=0),
    "SBC_most_frequent": SimpleBaselineClassifier("most_frequent"),
    "SBC_uniform": SimpleBaselineClassifier("uniform", random_state=3),
    "SBC_constant": SimpleBaselineClassifier("constant", constant=3)
}

dataset_handler = DatasetHandler(dataset_path)
graphing = Graphing(dataset_handler)

y_preds = {}
metrics = {
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": []
}

In [7]:

for name, clf in classifiers.items():
    clf.fit(dataset_handler.x_train, dataset_handler.y_train)
    y_pred = clf.predict(dataset_handler.x_test)
    y_preds[name] = y_pred

    metrics["Accuracy"].append(accuracy_score(dataset_handler.y_test, y_pred))
    metrics["Precision"].append(precision_score(dataset_handler.y_test, y_pred, average='macro', zero_division=0))
    metrics["Recall"].append(recall_score(dataset_handler.y_test, y_pred, average='macro'))
    metrics["F1 Score"].append(f1_score(dataset_handler.y_test, y_pred, average='macro'))

In [8]:
importances = np.sort(classifiers['RandomForest'].feature_importances)
graphing.print_feature_importances(importances)
graphing.plot_feature_importances(importances)


Feature ranking:
1. Feature G2 (0.3044)
2. Feature G1 (0.1714)
3. Feature absences (0.0460)
4. Feature health (0.0288)
5. Feature Walc (0.0270)
6. Feature Dalc (0.0260)
7. Feature goout (0.0247)
8. Feature freetime (0.0244)
9. Feature famrel (0.0236)
10. Feature romantic (0.0236)
11. Feature internet (0.0232)
12. Feature higher (0.0225)
13. Feature nursery (0.0222)
14. Feature activities (0.0218)
15. Feature paid (0.0212)
16. Feature famsup (0.0212)
17. Feature schoolsup (0.0171)
18. Feature failures (0.0157)
19. Feature studytime (0.0136)
20. Feature traveltime (0.0120)
21. Feature guardian (0.0111)
22. Feature reason (0.0111)
23. Feature Fjob (0.0109)
24. Feature Mjob (0.0101)
25. Feature Fedu (0.0099)
26. Feature Medu (0.0097)
27. Feature Pstatus (0.0095)
28. Feature famsize (0.0092)
29. Feature address (0.0076)
30. Feature age (0.0075)
31. Feature sex (0.0068)
32. Feature school (0.0062)


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
graphing.plot_feature_correspondence(["G1", "G2", "absences", "Walc", "age"])

In [None]:
for name in classifiers.keys():
    y_pred = y_preds[name]
    graphing.plot_confusion_matrix(dataset_handler.y_test, y_pred, name)

In [None]:
graphing.plot_evaluation_metrics(list(classifiers.keys()), metrics, title="Evaluation Metrics by Classifier with average=macro")

In [None]:
metrics_micro = {
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": []
}
for name in classifiers:
    y_pred = y_preds[name]

    metrics_micro["Accuracy"].append(accuracy_score(dataset_handler.y_test, y_pred))
    metrics_micro["Precision"].append(precision_score(dataset_handler.y_test, y_pred, average="micro"))
    metrics_micro["Recall"].append(recall_score(dataset_handler.y_test, y_pred, average="micro"))
    metrics_micro["F1 Score"].append(f1_score(dataset_handler.y_test, y_pred, average="micro"))
graphing.plot_evaluation_metrics(list(classifiers.keys()), metrics_micro, title="Evaluation Metrics by Classifier with average=micro")

In [None]:
metrics_weighted = {
    "Accuracy": [],
    "Precision": [],
    "Recall": [],
    "F1 Score": []
}
for name in classifiers:
    y_pred = y_preds[name]

    metrics_weighted["Accuracy"].append(accuracy_score(dataset_handler.y_test, y_pred))
    metrics_weighted["Precision"].append(precision_score(dataset_handler.y_test, y_pred, average="weighted", zero_division=0))
    metrics_weighted["Recall"].append(recall_score(dataset_handler.y_test, y_pred, average="weighted"))
    metrics_weighted["F1 Score"].append(f1_score(dataset_handler.y_test, y_pred, average="weighted"))
graphing.plot_evaluation_metrics(list(classifiers.keys()), metrics_weighted, title="Evaluation Metrics by Classifier with average=weighted")