In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [2]:
student_performance = pd.read_csv('./datasets/student-por.csv', sep=';')

X = student_performance.drop(['higher'], axis=1)
y = student_performance['higher']
# .map({'Yes': 1, 'No': 0})
y.value_counts()

higher
yes    580
no      69
Name: count, dtype: int64

In [3]:
label_encoder = LabelEncoder()

columns_to_encode = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup', 'paid', 
    'activities', 'nursery', 'internet', 'romantic', 'Mjob', 'Fjob', 'reason', 'guardian']
for column in columns_to_encode:
    X[column] = label_encoder.fit_transform(X[column])

In [4]:
X = pd.get_dummies(X, columns=['Mjob', 'Fjob', 'reason', 'guardian'], dtype='int') 

In [5]:
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

def precision(y_true, y_pred):
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    return tp / (tp + fp) if (tp + fp) > 0 else 0

def recall(y_true, y_pred):
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    return tp / (tp + fn) if (tp + fn) > 0 else 0

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * (p * r) / (p + r) if (p + r) > 0 else 0

def specificity(y_true, y_pred):
    tn = np.sum((y_pred == 0) & (y_true == 0))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    return tn / (tn + fp) if (tn + fp) > 0 else 0

def roc_auc(y_true, y_prob):
    sorted_indices = np.argsort(y_prob)[::-1]
    y_true = np.array(y_true)[sorted_indices]
    
    tpr = np.cumsum(y_true) / np.sum(y_true)
    fpr = np.cumsum(1 - y_true) / np.sum(1 - y_true)
    
    auc = np.trapz(tpr, fpr)
    return auc

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
y_train = y_train.map({'yes': 1, 'no': 0})
y_test = y_test.map({'yes': 1, 'no': 0})

In [8]:
random = RandomForestClassifier(n_estimators=100, random_state=42)
random.fit(X_train, y_train)
y_pred = random.predict(X_test)

acc = accuracy(y_test, y_pred)
prec = precision(y_test, y_pred)
rec = recall(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
spec = specificity(y_test, y_pred)
# roc = roc_auc(y_test, y_pred)
print(f"Accuracy: {acc:.4f} \n Precision: {prec:.4f} \n Recall: {rec:.4f} \n F1: {f1:.4f} \n Specificity: {spec:.4f}")

Accuracy: 0.8974 
 Precision: 0.9062 
 Recall: 0.9886 
 F1: 0.9457 
 Specificity: 0.0526


In [9]:
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut

def k_fold_cv(model, X, y, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    metrics = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        acc = accuracy(y_val, y_pred)
    
    return acc

def stratified_k_fold_cv(model, X, y, k=5):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    metrics = []
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        acc = accuracy(y_val, y_pred)
    
    return acc

def loo_cv(model, X, y):
    loo = LeaveOneOut()
    metrics = []
    for train_index, val_index in loo.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        acc = accuracy(y_val, y_pred)
    
    return acc

In [10]:
print("Logistic Regression")
model = LogisticRegression(max_iter=10000)

print("\nK-Fold Cross Validation:")
a = k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nStratified K-Fold Cross Validation:")
stratified_metrics = stratified_k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nLeave-One-Out Cross Validation:")
loo_metrics = loo_cv(model, X, y)
print(f"Average Accuracy: {a:.4f}")

Logistic Regression

K-Fold Cross Validation:
Average Accuracy: 0.9380

Stratified K-Fold Cross Validation:
Average Accuracy: 0.9380

Leave-One-Out Cross Validation:
Average Accuracy: 0.9380


In [11]:
print("Decision Tree")
model = DecisionTreeClassifier()

print("\nK-Fold Cross Validation:")
a = k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nStratified K-Fold Cross Validation:")
stratified_metrics = stratified_k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nLeave-One-Out Cross Validation:")
loo_metrics = loo_cv(model, X, y)
print(f"Average Accuracy: {a:.4f}")

Decision Tree

K-Fold Cross Validation:
Average Accuracy: 0.8527

Stratified K-Fold Cross Validation:
Average Accuracy: 0.8527

Leave-One-Out Cross Validation:
Average Accuracy: 0.8527


In [12]:
print("SVM")
model = SVC(probability=True)

print("\nK-Fold Cross Validation:")
a = k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nStratified K-Fold Cross Validation:")
stratified_metrics = stratified_k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nLeave-One-Out Cross Validation:")
loo_metrics = loo_cv(model, X, y)
print(f"Average Accuracy: {a:.4f}")

SVM

K-Fold Cross Validation:
Average Accuracy: 0.9302

Stratified K-Fold Cross Validation:
Average Accuracy: 0.9302

Leave-One-Out Cross Validation:
Average Accuracy: 0.9302


In [13]:
print("Random Forest")
model = RandomForestClassifier()

print("\nK-Fold Cross Validation:")
a = k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nStratified K-Fold Cross Validation:")
stratified_metrics = stratified_k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nLeave-One-Out Cross Validation:")
loo_metrics = loo_cv(model, X, y)
print(f"Average Accuracy: {a:.4f}")

Random Forest

K-Fold Cross Validation:
Average Accuracy: 0.9380

Stratified K-Fold Cross Validation:
Average Accuracy: 0.9380

Leave-One-Out Cross Validation:
Average Accuracy: 0.9380


In [14]:
print("XGBoost")
model = XGBClassifier(eval_metric="logloss")

y = y.map({'yes': 1, 'no': 0})

print("\nK-Fold Cross Validation:")
a = k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nStratified K-Fold Cross Validation:")
stratified_metrics = stratified_k_fold_cv(model, X, y, k=5)
print(f"Average Accuracy: {a:.4f}")

print("\nLeave-One-Out Cross Validation:")
loo_metrics = loo_cv(model, X, y)
print(f"Average Accuracy: {a:.4f}")

XGBoost

K-Fold Cross Validation:
Average Accuracy: 0.9380

Stratified K-Fold Cross Validation:
Average Accuracy: 0.9380

Leave-One-Out Cross Validation:
Average Accuracy: 0.9380
