In [29]:
# Install the third-party packages imported below (pandas, scipy, scikit-learn) via the %pip magic.
# NOTE(review): versions are unpinned, so results may differ between environments — consider pinning them.
%pip install -q pandas scipy scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [30]:
import json
import pandas as pd
from scipy.stats import ks_2samp, chi2_contingency
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from steps.clean import Cleaner
from steps.predict import Predictor

In [31]:
# Name of the label column; the evaluation cell indexes the current frame with it.
target_col = "Rain"

# Read the reference (train) and current (test) CSV files, reference first.
ref_raw, cur_raw = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

# A single Cleaner instance processes both frames, in the same order they were loaded.
cleaner = Cleaner()
ref, cur = (cleaner.clean_data(raw_frame) for raw_frame in (ref_raw, cur_raw))

# Every cleaned column except the last one.
# NOTE(review): presumably the last column is the target — confirm against Cleaner's output.
feature_cols = ref.columns[:-1]

In [33]:
# Ground truth and model output columns of the current (test) frame.
# NOTE(review): no cell visible here creates cur["prediction"]; presumably a cell using
# Predictor adds it — confirm this still works after Restart Kernel -> Run All.
y_true = cur[target_col]
y_pred = cur["prediction"]

acc = accuracy_score(y_true, y_pred)
# NOTE(review): ROC AUC is computed from the same "prediction" column as accuracy. If that
# column holds hard class labels rather than scores, the value reduces to balanced accuracy
# (the recorded output equals the macro-average recall) — confirm and prefer probabilities.
roc = roc_auc_score(y_true, y_pred)
report = classification_report(y_true, y_pred, output_dict=True)

print(f"Accuracy: {acc:.3f}, ROC AUC: {roc:.3f}")
# One row per report entry (classes, accuracy, averages); show the first five rows.
report_table = pd.DataFrame(report).T
print(report_table.head())

Accuracy: 0.701, ROC AUC: 0.689
              precision    recall  f1-score    support
0              0.708333  0.566667  0.629630  30.000000
1              0.697674  0.810811  0.750000  37.000000
accuracy       0.701493  0.701493  0.701493   0.701493
macro avg      0.703004  0.688739  0.689815  67.000000
weighted avg   0.702447  0.701493  0.696103  67.000000


In [35]:
# Columns compared between the reference frame (ref) and the current frame (cur).
numeric_cols = ["TN","TX","TAVG","RH_AVG","SS","FF_X","DDD_X","FF_AVG","Month","Day"]
cat_cols = ["DDD_CAR", "prediction"]  # include the prediction to monitor the output distribution

# Maps column name -> {"test": test name, "stat": test statistic, "pvalue": p-value}.
drift_results = {}

# Numeric columns: two-sample Kolmogorov-Smirnov test on the column values.
for col in numeric_cols:
    stat, p = ks_2samp(ref[col], cur[col])
    # Cast to built-in float so the summary holds plain Python numbers: the displayed dict
    # stays readable and the JSON dump does not rely on numpy scalar types.
    drift_results[col] = {"test": "ks_2samp", "stat": float(stat), "pvalue": float(p)}

# Categorical columns: chi-square test on a 2 x K table of category counts
# (row 0 = reference, row 1 = current).
for col in cat_cols:
    ref_counts = ref[col].value_counts()
    cur_counts = cur[col].value_counts()
    # Union of categories so both rows share the same column order; a category
    # missing from one frame contributes a count of 0 for that frame.
    all_cats = sorted(set(ref_counts.index) | set(cur_counts.index))
    ref_freq = [ref_counts.get(c, 0) for c in all_cats]
    cur_freq = [cur_counts.get(c, 0) for c in all_cats]
    chi2, p, _, _ = chi2_contingency([ref_freq, cur_freq])
    drift_results[col] = {"test": "chi2", "stat": float(chi2), "pvalue": float(p)}

# Bundle the model metrics computed earlier with the per-column drift tests.
summary = {
    "accuracy": acc,
    "roc_auc": roc,
    "classification_report": report,
    "drift": drift_results,
}

# Persist the summary next to the notebook.
with open("production_drift.json", "w") as f:
    json.dump(summary, f, indent=2)

summary

{'accuracy': 0.7014925373134329,
 'roc_auc': 0.6887387387387387,
 'classification_report': {'0': {'precision': 0.7083333333333334,
   'recall': 0.5666666666666667,
   'f1-score': 0.6296296296296297,
   'support': 30.0},
  '1': {'precision': 0.6976744186046512,
   'recall': 0.8108108108108109,
   'f1-score': 0.75,
   'support': 37.0},
  'accuracy': 0.7014925373134329,
  'macro avg': {'precision': 0.7030038759689923,
   'recall': 0.6887387387387387,
   'f1-score': 0.6898148148148149,
   'support': 67.0},
  'weighted avg': {'precision': 0.7024470669906282,
   'recall': 0.7014925373134329,
   'f1-score': 0.6961028192371476,
   'support': 67.0}},
 'drift': {'TN': {'test': 'ks_2samp',
   'stat': np.float64(0.1466264184694505),
   'pvalue': np.float64(0.18124852787041437)},
  'TX': {'test': 'ks_2samp',
   'stat': np.float64(0.08558331935826485),
   'pvalue': np.float64(0.7865809397571607)},
  'TAVG': {'test': 'ks_2samp',
   'stat': np.float64(0.12817932807870758),
   'pvalue': np.float64(0.31