In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

# Read the raw table from disk; the CSV uses ';' as its field delimiter.
df = pd.read_csv("data/data.csv", sep=";")

# Feature matrix: every column except the label, used as-is
# (no preprocessing or feature engineering is applied here).
X = df.drop("Target", axis="columns")

# Label vector: map the "Target" values to integer codes and keep the
# encoder's class ordering so reports can show the original label names.
le = LabelEncoder()
y = le.fit_transform(df["Target"])
class_names = [*le.classes_]

# Hold out 20% of the rows for testing; stratify on the label so both
# partitions keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
def run_and_report(name, model, show_per_class=True):
    """Fit ``model`` on the training split and print test-set metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``class_names`` defined in the data-preparation cell.

    Parameters
    ----------
    name : str
        Label printed in the section header for this model.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    show_per_class : bool, default True
        When True, print the full per-class classification report.
        When False, print only the macro and weighted average rows.

    Returns
    -------
    float
        Accuracy of the fitted model on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.4f}")
    if show_per_class:
        print(classification_report(y_test, y_pred, target_names=class_names, zero_division=0))
    else:
        # Summary only: previously this branch still printed every class row,
        # so the flag had no real effect. Pull the aggregate rows out of the
        # dict form of the report and print just those.
        report = classification_report(
            y_test, y_pred, target_names=class_names, zero_division=0, output_dict=True
        )
        print(f"{'':>12} {'precision':>10} {'recall':>10} {'f1-score':>10} {'support':>10}")
        for avg_name in ("macro avg", "weighted avg"):
            stats = report[avg_name]
            print(
                f"{avg_name:>12} {stats['precision']:>10.4f} {stats['recall']:>10.4f} "
                f"{stats['f1-score']:>10.4f} {int(stats['support']):>10d}"
            )
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    return acc

In [11]:
# Baseline 1: majority-class predictor. It ignores the features entirely,
# so its accuracy is the floor any real model has to beat.
acc_dummy = run_and_report(
    name="Dummy (most_frequent)",
    model=DummyClassifier(strategy="most_frequent", random_state=42),
)

# Baseline 2: random forest with 200 unrestricted-depth trees, seeded and
# trained on all available cores.
acc_rf = run_and_report(
    name="RandomForest (n_estimators=200, max_depth=None)",
    model=RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1),
)

# Baseline 3: logistic regression with the lbfgs solver, fitted on the
# unscaled features with a raised iteration cap.
# NOTE(review): presumably max_iter=2000 is there to let lbfgs converge without
# scaling -- confirm no ConvergenceWarning is emitted on this data.
acc_logreg = run_and_report(
    name="LogisticRegression (multinomial, lbfgs)",
    model=LogisticRegression(solver="lbfgs", max_iter=2000, n_jobs=-1),
)

# Side-by-side test accuracies of the three baselines.
print("\nSummary accuracies:")
print(dict(Dummy=acc_dummy, RandomForest=acc_rf, LogReg=acc_logreg))



=== Dummy (most_frequent) ===
Accuracy: 0.4994
              precision    recall  f1-score   support

     Dropout       0.00      0.00      0.00       284
    Enrolled       0.00      0.00      0.00       159
    Graduate       0.50      1.00      0.67       442

    accuracy                           0.50       885
   macro avg       0.17      0.33      0.22       885
weighted avg       0.25      0.50      0.33       885

Confusion matrix:
 [[  0   0 284]
 [  0   0 159]
 [  0   0 442]]

=== RandomForest (n_estimators=200, max_depth=None) ===
Accuracy: 0.7650
              precision    recall  f1-score   support

     Dropout       0.80      0.75      0.77       284
    Enrolled       0.55      0.34      0.42       159
    Graduate       0.79      0.93      0.85       442

    accuracy                           0.76       885
   macro avg       0.71      0.67      0.68       885
weighted avg       0.75      0.76      0.75       885

Confusion matrix:
 [[213  24  47]
 [ 42  54  63]
 [