# 5-Fold Stratified CV

In [14]:
import numpy as np
import pandas as pd

In [15]:
# — Start of dummy-data generation & preprocessing block —
# Build a synthetic attendance table: n rows, each with an enrollment count
# split into present / absent / released-early head-counts.
np.random.seed(42)
n = 100

# Draw order matters for reproducibility: enrollment first, then one uniform
# draw per row for the present share, then one per row for the absent share.
enrolled_counts = np.random.randint(300, 1200, n)
present_counts = [int(total * np.random.uniform(0.7, 0.98)) for total in enrolled_counts]
absent_drawn = [int(total * np.random.uniform(0.01, 0.2)) for total in enrolled_counts]

# Cap absences so that present + absent never exceeds the enrollment.
absent_counts = [
    min(absent, total - present)
    for total, present, absent in zip(enrolled_counts, present_counts, absent_drawn)
]

# Whatever is left after present and absent counts as released early.
released_counts = [
    total - present - absent
    for total, present, absent in zip(enrolled_counts, present_counts, absent_counts)
]

data = {
    'Enrolled': enrolled_counts,
    'Present': present_counts,
    'Absent': absent_counts,
    'Released_Early': released_counts,
}
df = pd.DataFrame(data)


In [16]:
# Feature engineering & labeling
# Express each head-count as a percentage of the row's enrollment.
enrolled_total = df['Enrolled']
df['Attendance_Rate'] = df['Present'].div(enrolled_total).mul(100)
df['Absence_Rate'] = df['Absent'].div(enrolled_total).mul(100)
df['Early_Release_Rate'] = df['Released_Early'].div(enrolled_total).mul(100)

# Binary label: 'High' when the attendance rate is at least 90, otherwise 'Low'.
is_high_attendance = df['Attendance_Rate'].ge(90)
df['Attendance_Label'] = is_high_attendance.map({True: 'High', False: 'Low'})
# — End of block —

In [17]:
# Everything the cross-validation cells below need from scikit-learn.
# (Previously only StratifiedKFold was imported and the rest was elided, so
# the CV loop raised NameError on a fresh kernel.)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Feature matrix: the three percentage columns derived from the head-counts.
# NOTE(review): Attendance_Label is computed directly from Attendance_Rate
# (>= 90 -> 'High'), and Attendance_Rate is also a feature here, so the target
# is a deterministic function of the inputs. Scores from this setup measure how
# well the model recovers that threshold, not generalisation — confirm this is
# intended.
X = df[['Attendance_Rate', 'Absence_Rate', 'Early_Release_Rate']]

# Binary target: 1 for 'High' attendance, 0 for 'Low'.
y = df['Attendance_Label'].map({'High':1,'Low':0})

In [22]:
# Stratified 5-fold splitter; shuffling with a fixed seed keeps the fold
# assignment reproducible between runs.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# One accumulator per metric; the CV loop appends one score per fold to each.
cv_accuracy, cv_precision, cv_recall, cv_f1 = [], [], [], []

In [19]:
# Run the stratified 5-fold cross-validation: for every fold, scale the
# features, fit a Gaussian Naive Bayes model on the training part, score it on
# the held-out part, and record/print the four metrics.

# Reset the per-fold score lists in this cell so that re-running it starts from
# a clean slate instead of appending a second set of scores to the old ones
# (which would silently skew the averages computed afterwards).
cv_accuracy = []
cv_precision = []
cv_recall = []
cv_f1 = []

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    # Split the data into this fold's training and held-out rows
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Standardise: fit the scaler on the training rows only, then apply the
    # same transform to the held-out rows so no test statistics leak in
    scaler = StandardScaler()
    X_train_fold_scaled = scaler.fit_transform(X_train_fold)
    X_test_fold_scaled = scaler.transform(X_test_fold)

    # Train
    model = GaussianNB()
    model.fit(X_train_fold_scaled, y_train_fold)

    # Predict on the held-out rows
    y_pred_fold = model.predict(X_test_fold_scaled)

    # Evaluate
    acc = accuracy_score(y_test_fold, y_pred_fold)
    prec = precision_score(y_test_fold, y_pred_fold)
    rec = recall_score(y_test_fold, y_pred_fold)
    f1 = f1_score(y_test_fold, y_pred_fold)

    # Store this fold's scores
    cv_accuracy.append(acc)
    cv_precision.append(prec)
    cv_recall.append(rec)
    cv_f1.append(f1)

    print(f"Fold {fold}:")
    print(f"  Accuracy : {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall   : {rec:.4f}")
    # A real newline escape: leaves a blank line between folds (the previous
    # doubled backslash printed the two characters "\n" literally)
    print(f"  F1-Score : {f1:.4f}\n")


Fold 1:
  Accuracy : 0.9500
  Precision: 0.8333
  Recall   : 1.0000
  F1-Score : 0.9091\n
Fold 2:
  Accuracy : 1.0000
  Precision: 1.0000
  Recall   : 1.0000
  F1-Score : 1.0000\n
Fold 3:
  Accuracy : 1.0000
  Precision: 1.0000
  Recall   : 1.0000
  F1-Score : 1.0000\n
Fold 4:
  Accuracy : 0.9000
  Precision: 0.8333
  Recall   : 0.8333
  F1-Score : 0.8333\n
Fold 5:
  Accuracy : 1.0000
  Precision: 1.0000
  Recall   : 1.0000
  F1-Score : 1.0000\n


In [21]:
# Average each metric over the folds recorded by the CV loop, then report them.
mean_accuracy = np.mean(cv_accuracy)
mean_precision = np.mean(cv_precision)
mean_recall = np.mean(cv_recall)
mean_f1 = np.mean(cv_f1)

print("Rata-Rata 5-Fold CV:")
print(f"Accuracy : {mean_accuracy:.4f}")
print(f"Precision: {mean_precision:.4f}")
print(f"Recall   : {mean_recall:.4f}")
print(f"F1-Score : {mean_f1:.4f}")

Rata-Rata 5-Fold CV:
Accuracy : 0.9700
Precision: 0.9333
Recall   : 0.9667
F1-Score : 0.9485


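> Hasil 5-Fold Cross Validation menunjukkan bahwa model Gaussian Naïve Bayes memiliki performa yang **stabil** dan **konsisten** di seluruh fold (akurasi per fold berkisar 0.90–1.00, dengan rata-rata 0.97).
> Ini memberikan keyakinan bahwa model tidak overfit terhadap subset tertentu. Namun perlu dicatat bahwa `Attendance_Label` diturunkan langsung dari `Attendance_Rate`, yang juga dipakai sebagai fitur, sehingga skor yang tinggi ini sudah dapat diperkirakan dan belum tentu mencerminkan kemampuan generalisasi pada data nyata.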