# Classical Model Benchmark – Flood Classification

Compares 10 scikit‑learn classifiers on:
1. **Combined dataset** (all three stations merged)
2. **Per‑station datasets** (D08A071, D08A084, D08A115)

Splits are chronological 80 / 20 to avoid information leakage.
Metrics reported: Accuracy, Precision, Recall, PR‑AUC, Confusion Matrix.

In [1]:
# Auto‑install packages used in the benchmark
import importlib, subprocess, sys
for pkg in ['xgboost', 'imbalanced-learn']:
    if importlib.util.find_spec(pkg) is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])


In [2]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, average_precision_score, precision_recall_curve)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier


In [3]:
# ── CONFIG ────────────────────────────────────────────────────────────
PROJECT_ROOT = Path.cwd() if (Path.cwd()/'data').exists() else Path.cwd().parent
DATA_RAW = PROJECT_ROOT/'data'/'raw'
STATIONS = ['D08A071','D08A084','D08A115']
ROLL_WINDOWS = [3,6,12]
LAG_HRS = range(1,13)
API_WINDOW = 24*7
PERCENTILE = 0.82
TEST_FRAC = 0.2


In [4]:
def find_csv(code):
    p = DATA_RAW / f'station_{code}'
    files = list(p.glob('*.csv'))
    assert len(files)==1, f'{code}: CSV not found'
    return files[0]

def load_station(code):
    df = pd.read_csv(find_csv(code))
    df['datetime'] = pd.to_datetime(df['saatlik'])
    df = (df.rename(columns={'yagis_toplam':'rain_mm','qdeger':'discharge_cms'})
            .set_index('datetime').sort_index()
            .resample('h').agg({'rain_mm':'sum','discharge_cms':'mean'}))
    return df

def build_features(df, percentile=PERCENTILE):
    st = df.copy()
    # rainfall sums
    for w in ROLL_WINDOWS:
        st[f'rain_sum_{w}h'] = st['rain_mm'].rolling(w, min_periods=1).sum()
    # discharge lags
    for l in LAG_HRS:
        st[f'dis_lag_{l}h'] = st['discharge_cms'].shift(l)
    # rate of change
    st['dis_rate_1h'] = st['discharge_cms'].diff(1)
    st['dis_rate_3h'] = st['discharge_cms'].diff(3)
    # API7
    st['API7'] = st['rain_mm'].rolling(API_WINDOW, min_periods=1).sum()
    st.dropna(inplace=True)
    thr = st['discharge_cms'].quantile(percentile)
    st['flood'] = (st['discharge_cms'] > thr).astype(int)
    return st


In [5]:
frames = []
for code in STATIONS:
    df = build_features(load_station(code))
    df['station'] = code
    frames.append(df)
combined = pd.concat(frames)
feature_cols = [c for c in combined.columns if c.startswith(('rain_sum','dis_lag','dis_rate','API'))]
X_comb = combined[feature_cols]
y_comb = combined['flood']
print('Combined set:', combined.shape, '  Positives:', y_comb.sum())


Combined set: (79459, 22)   Positives: 11479


In [6]:
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42, class_weight='balanced'),
    "Naive Bayes": GaussianNB(),
    "LDA": LinearDiscriminantAnalysis(),
    "MLP": MLPClassifier(max_iter=800, random_state=42)
}


In [7]:
def evaluate_models(X_train, X_test, y_train, y_test, clf_dict):
    rows = []
    for name, clf in clf_dict.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        try:
            y_prob = clf.predict_proba(X_test)[:,1]
        except AttributeError:
            y_prob = clf.decision_function(X_test)
        if y_pred.sum() == 0:
            prec = 0.0
            rec = 0.0
            note ="NoPoS"
        else:
            prec = precision_score(y_test, y_pred, zero_division=0)
            rec  = recall_score(y_test, y_pred, zero_division=0)
            note = ""
        rows.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': prec,
            'Recall': rec,
            'PR-AUC': average_precision_score(y_test, y_prob),
            'ConfMatrix': confusion_matrix(y_test, y_pred),
            'Note': note
        })
    return pd.DataFrame(rows).set_index('Model').sort_values('PR-AUC', ascending=False)


## Combined dataset – chronological 80/20 split

In [8]:
split_idx = int(len(X_comb)*(1-TEST_FRAC))
X_train_c, X_test_c = X_comb.iloc[:split_idx], X_comb.iloc[split_idx:]
y_train_c, y_test_c = y_comb.iloc[:split_idx], y_comb.iloc[split_idx:]
combined_metrics = evaluate_models(X_train_c, X_test_c, y_train_c, y_test_c, classifiers)
combined_metrics


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix,Note
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AdaBoost,0.972942,0.898697,0.958652,0.981192,"[[12703, 311], [119, 2759]]",
Gradient Boosting,0.904921,0.665858,0.95344,0.976371,"[[11637, 1377], [134, 2744]]",
Logistic Regression,0.919645,0.703018,0.963169,0.969166,"[[11843, 1171], [106, 2772]]",
Naive Bayes,0.980745,0.953136,0.939889,0.964582,"[[12881, 133], [173, 2705]]",
MLP,0.895734,0.650704,0.915914,0.948404,"[[11599, 1415], [242, 2636]]",
SVM,0.940159,0.766528,0.962821,0.941781,"[[12170, 844], [107, 2771]]",
LDA,0.892147,0.836028,0.503127,0.843169,"[[12730, 284], [1430, 1448]]",
Random Forest,0.897621,0.647698,0.953092,0.675944,"[[11522, 1492], [135, 2743]]",
KNN,0.894349,0.651122,0.897498,0.670817,"[[11630, 1384], [295, 2583]]",
Decision Tree,0.890951,0.631518,0.955177,0.611329,"[[11410, 1604], [129, 2749]]",


In [9]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from collections import Counter

# 1️⃣ Class-imbalance
counts = Counter(y_comb)
flood_dist = {k: f"{v}  ({v/len(y_comb):.2%})" for k, v in counts.items()}
print("Flood distribution (combined):", flood_dist)

# 2️⃣ Chronological CV on the Random-Forest baseline
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
tscv = TimeSeriesSplit(n_splits=5)

cv_scores = cross_val_score(rf, X_comb, y_comb, cv=tscv, scoring='accuracy')
print("RF TimeSeries CV accuracy  mean±std:", f"{cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

Flood distribution (combined): {0: '67980  (85.55%)', 1: '11479  (14.45%)'}
RF TimeSeries CV accuracy  mean±std: 0.894 ± 0.071


## Per‑station benchmarks

In [10]:
station_tables = {}
for code in STATIONS:
    df = combined[combined['station']==code]
    X = df[feature_cols]; y = df['flood']
    split = int(len(X)*(1-TEST_FRAC))
    X_tr, X_te = X.iloc[:split], X.iloc[split:]
    y_tr, y_te = y.iloc[:split], y.iloc[split:]
    station_tables[code] = evaluate_models(X_tr, X_te, y_tr, y_te, classifiers)

for code, tbl in station_tables.items():
    print(f"\n### Station {code}")
    display(tbl)



### Station D08A071


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix,Note
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Random Forest,0.999804,0.999469,1.0,1.0,"[[3227, 1], [0, 1883]]",
Logistic Regression,1.0,1.0,1.0,1.0,"[[3228, 0], [0, 1883]]",
MLP,1.0,1.0,1.0,1.0,"[[3228, 0], [0, 1883]]",
AdaBoost,0.999804,0.999469,1.0,0.999998,"[[3227, 1], [0, 1883]]",
Gradient Boosting,0.999804,0.999469,1.0,0.999987,"[[3227, 1], [0, 1883]]",
SVM,0.999217,0.99788,1.0,0.999793,"[[3224, 4], [0, 1883]]",
Decision Tree,0.999804,0.999469,1.0,0.999469,"[[3227, 1], [0, 1883]]",
KNN,0.998043,0.998403,0.996283,0.998386,"[[3225, 3], [7, 1876]]",
LDA,0.631579,0.0,0.0,0.990743,"[[3228, 0], [1883, 0]]",NoPoS
Naive Bayes,0.84054,0.99262,0.571429,0.990326,"[[3220, 8], [807, 1076]]",



### Station D08A084


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix,Note
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Gradient Boosting,1.0,1.0,1.0,1.0,"[[5870, 0], [0, 524]]",
Random Forest,1.0,1.0,1.0,1.0,"[[5870, 0], [0, 524]]",
AdaBoost,1.0,1.0,1.0,1.0,"[[5870, 0], [0, 524]]",
Decision Tree,0.999844,1.0,0.998092,0.998248,"[[5870, 0], [1, 523]]",
MLP,0.981232,0.814642,0.998092,0.997301,"[[5751, 119], [1, 523]]",
Logistic Regression,0.098686,0.083347,1.0,0.9816,"[[107, 5763], [0, 524]]",
Naive Bayes,0.985455,0.849271,1.0,0.895726,"[[5777, 93], [0, 524]]",
LDA,0.943854,0.70073,0.549618,0.774826,"[[5747, 123], [236, 288]]",
SVM,0.906162,0.457871,0.788168,0.621649,"[[5381, 489], [111, 413]]",
KNN,0.893025,0.39899,0.603053,0.355722,"[[5394, 476], [208, 316]]",



### Station D08A115


Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix,Note
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Random Forest,0.998861,0.971963,0.981132,0.998948,"[[4280, 3], [2, 104]]",
Gradient Boosting,0.999316,0.972477,1.0,0.998576,"[[4280, 3], [0, 106]]",
AdaBoost,0.999316,0.972477,1.0,0.998116,"[[4280, 3], [0, 106]]",
MLP,0.998633,0.962963,0.981132,0.996566,"[[4279, 4], [2, 104]]",
Logistic Regression,0.998861,0.971963,0.981132,0.973176,"[[4280, 3], [2, 104]]",
Decision Tree,0.999089,0.972222,0.990566,0.963278,"[[4280, 3], [1, 105]]",
SVM,0.987013,0.660131,0.95283,0.93634,"[[4231, 52], [5, 101]]",
LDA,0.982684,0.916667,0.311321,0.81383,"[[4280, 3], [73, 33]]",
Naive Bayes,0.982684,0.582418,1.0,0.679487,"[[4207, 76], [0, 106]]",
KNN,0.97995,0.576271,0.641509,0.587206,"[[4233, 50], [38, 68]]",
