# Classical Model Benchmark – Flood Classification
*Generated 2025-08-07 00:08*

Compares 10 scikit‑learn classifiers on:
1. **Combined dataset** (all three stations merged)
2. **Per‑station datasets** (D08A071, D08A084, D08A115)

Splits are chronological 80 / 20 to avoid information leakage.
Metrics reported: Accuracy, Precision, Recall, PR‑AUC, Confusion Matrix.

In [1]:
# Auto‑install packages used in the benchmark
import importlib, subprocess, sys
for pkg in ['xgboost', 'imbalanced-learn']:
    if importlib.util.find_spec(pkg) is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])


In [2]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, average_precision_score, precision_recall_curve)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier


In [3]:
# ── CONFIG ────────────────────────────────────────────────────────────
PROJECT_ROOT = Path.cwd() if (Path.cwd()/'data').exists() else Path.cwd().parent
DATA_RAW = PROJECT_ROOT/'data'/'raw'
STATIONS = ['D08A071','D08A084','D08A115']
ROLL_WINDOWS = [3,6,12]
LAG_HRS = range(1,13)
API_WINDOW = 24*7
PERCENTILE = 0.92
TEST_FRAC = 0.2


In [4]:
def find_csv(code):
    p = DATA_RAW / f'station_{code}'
    files = list(p.glob('*.csv'))
    assert len(files)==1, f'{code}: CSV not found'
    return files[0]

def load_station(code):
    df = pd.read_csv(find_csv(code))
    df['datetime'] = pd.to_datetime(df['saatlik'])
    df = (df.rename(columns={'yagis_toplam':'rain_mm','qdeger':'discharge_cms'})
            .set_index('datetime').sort_index()
            .resample('h').agg({'rain_mm':'sum','discharge_cms':'mean'}))
    return df

def build_features(df, percentile=PERCENTILE):
    st = df.copy()
    # rainfall sums
    for w in ROLL_WINDOWS:
        st[f'rain_sum_{w}h'] = st['rain_mm'].rolling(w, min_periods=1).sum()
    # discharge lags
    for l in LAG_HRS:
        st[f'dis_lag_{l}h'] = st['discharge_cms'].shift(l)
    # rate of change
    st['dis_rate_1h'] = st['discharge_cms'].diff(1)
    st['dis_rate_3h'] = st['discharge_cms'].diff(3)
    # API7
    st['API7'] = st['rain_mm'].rolling(API_WINDOW, min_periods=1).sum()
    st.dropna(inplace=True)
    thr = st['discharge_cms'].quantile(percentile)
    st['flood'] = (st['discharge_cms'] > thr).astype(int)
    return st


In [5]:
frames = []
for code in STATIONS:
    df = build_features(load_station(code))
    df['station'] = code
    frames.append(df)
combined = pd.concat(frames)
feature_cols = [c for c in combined.columns if c.startswith(('rain_sum','dis_lag','dis_rate','API'))]
X_comb = combined[feature_cols]
y_comb = combined['flood']
print('Combined set:', combined.shape, '  Positives:', y_comb.sum())


Combined set: (79459, 22)   Positives: 6075


In [6]:
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42, class_weight='balanced'),
    "Naive Bayes": GaussianNB(),
    "LDA": LinearDiscriminantAnalysis(),
    "MLP": MLPClassifier(max_iter=800, random_state=42)
}


In [7]:
def evaluate_models(X_train, X_test, y_train, y_test, clf_dict):
    rows = []
    for name, clf in clf_dict.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        try:
            y_prob = clf.predict_proba(X_test)[:,1]
        except AttributeError:
            y_prob = clf.decision_function(X_test)
        rows.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'PR-AUC': average_precision_score(y_test, y_prob),
            'ConfMatrix': confusion_matrix(y_test, y_pred)
        })
    return pd.DataFrame(rows).set_index('Model').sort_values('PR-AUC', ascending=False)


## Combined dataset – chronological 80/20 split

In [8]:
split_idx = int(len(X_comb)*(1-TEST_FRAC))
X_train_c, X_test_c = X_comb.iloc[:split_idx], X_comb.iloc[split_idx:]
y_train_c, y_test_c = y_comb.iloc[:split_idx], y_comb.iloc[split_idx:]
combined_metrics = evaluate_models(X_train_c, X_test_c, y_train_c, y_test_c, classifiers)
combined_metrics


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MLP,0.916499,0.562126,0.994709,0.982505,"[[12873, 1318], [9, 1692]]"
SVM,0.879373,0.470149,1.0,0.979745,"[[12274, 1917], [0, 1701]]"
AdaBoost,0.9574,0.71567,0.998824,0.963371,"[[13516, 675], [2, 1699]]"
Logistic Regression,0.867166,0.446222,1.0,0.893999,"[[12080, 2111], [0, 1701]]"
Gradient Boosting,0.909955,0.543103,1.0,0.867944,"[[12760, 1431], [0, 1701]]"
Naive Bayes,0.943619,0.654987,1.0,0.747692,"[[13295, 896], [0, 1701]]"
LDA,0.925057,0.663462,0.608466,0.730677,"[[13666, 525], [666, 1035]]"
Random Forest,0.875787,0.462796,0.998236,0.699461,"[[12220, 1971], [3, 1698]]"
KNN,0.886987,0.485558,0.938859,0.521173,"[[12499, 1692], [104, 1597]]"
Decision Tree,0.870186,0.451656,0.994121,0.44963,"[[12138, 2053], [10, 1691]]"


In [9]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from collections import Counter

# 1️⃣ Class-imbalance
counts = Counter(y_comb)
flood_dist = {k: f"{v}  ({v/len(y_comb):.2%})" for k, v in counts.items()}
print("Flood distribution (combined):", flood_dist)

# 2️⃣ Chronological CV on the Random-Forest baseline
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
tscv = TimeSeriesSplit(n_splits=5)

cv_scores = cross_val_score(rf, X_comb, y_comb, cv=tscv, scoring='accuracy')
print("RF TimeSeries CV accuracy  mean±std:", f"{cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

Flood distribution (combined): {0: '73384  (92.35%)', 1: '6075  (7.65%)'}
RF TimeSeries CV accuracy  mean±std: 0.949 ± 0.050


## Per‑station benchmarks

In [10]:
station_tables = {}
for code in STATIONS:
    df = combined[combined['station']==code]
    X = df[feature_cols]; y = df['flood']
    split = int(len(X)*(1-TEST_FRAC))
    X_tr, X_te = X.iloc[:split], X.iloc[split:]
    y_tr, y_te = y.iloc[:split], y.iloc[split:]
    station_tables[code] = evaluate_models(X_tr, X_te, y_tr, y_te, classifiers)

for code, tbl in station_tables.items():
    print(f"\n### Station {code}")
    display(tbl)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



### Station D08A071


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.89865,0.489655,1.0,1.0,"[[4096, 518], [0, 497]]"
AdaBoost,0.999413,0.994,1.0,0.999988,"[[4611, 3], [0, 497]]"
Gradient Boosting,0.999804,0.997992,1.0,0.999854,"[[4613, 1], [0, 497]]"
Random Forest,0.999413,0.994,1.0,0.999837,"[[4611, 3], [0, 497]]"
Decision Tree,0.999804,0.997992,1.0,0.997992,"[[4613, 1], [0, 497]]"
SVM,0.895128,0.480712,0.977867,0.978181,"[[4089, 525], [11, 486]]"
MLP,0.960673,0.72561,0.957746,0.976876,"[[4434, 180], [21, 476]]"
Naive Bayes,0.993348,0.947776,0.985915,0.917759,"[[4587, 27], [7, 490]]"
KNN,0.986304,0.922772,0.937626,0.902135,"[[4575, 39], [31, 466]]"
LDA,0.902759,0.0,0.0,0.583537,"[[4614, 0], [497, 0]]"



### Station D08A084


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gradient Boosting,1.0,1.0,1.0,1.0,"[[5870, 0], [0, 524]]"
AdaBoost,0.999844,0.998095,1.0,1.0,"[[5869, 1], [0, 524]]"
Random Forest,0.999531,0.99619,0.998092,0.999993,"[[5868, 2], [1, 523]]"
MLP,0.998123,0.994208,0.982824,0.999522,"[[5867, 3], [9, 515]]"
Decision Tree,0.999844,0.998095,1.0,0.998095,"[[5869, 1], [0, 524]]"
Logistic Regression,0.987332,0.866116,1.0,0.977157,"[[5789, 81], [0, 524]]"
Naive Bayes,0.989678,0.888136,1.0,0.903971,"[[5804, 66], [0, 524]]"
LDA,0.944479,0.710723,0.543893,0.775768,"[[5754, 116], [239, 285]]"
SVM,0.904911,0.453744,0.78626,0.561419,"[[5374, 496], [112, 412]]"
KNN,0.91711,0.495208,0.591603,0.449297,"[[5554, 316], [214, 310]]"



### Station D08A115


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.999772,1.0,0.941176,1.0,"[[4372, 0], [1, 16]]"
MLP,1.0,1.0,1.0,1.0,"[[4372, 0], [0, 17]]"
Random Forest,0.999544,0.941176,0.941176,0.996732,"[[4371, 1], [1, 16]]"
Gradient Boosting,0.999772,1.0,0.941176,0.982843,"[[4372, 0], [1, 16]]"
Logistic Regression,0.999544,1.0,0.882353,0.980095,"[[4372, 0], [2, 15]]"
Decision Tree,0.999772,1.0,0.941176,0.941404,"[[4372, 0], [1, 16]]"
LDA,0.999089,1.0,0.764706,0.9392,"[[4372, 0], [4, 13]]"
SVM,0.997038,0.576923,0.882353,0.923529,"[[4361, 11], [2, 15]]"
KNN,0.997038,0.590909,0.764706,0.571269,"[[4363, 9], [4, 13]]"
Naive Bayes,0.987013,0.22973,1.0,0.303571,"[[4315, 57], [0, 17]]"
