In [30]:
import pandas as pd, numpy as np
import time
from pathlib import Path
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, average_precision_score, precision_recall_curve)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [31]:
# ── CONFIG ─────────────────────────────────────────────────────
PROJECT_ROOT = '../'
DATA_RAW = PROJECT_ROOT+'data/raw/'
STATIONS = ['D08A071','D08A084','D08A115']
ROLL_WINDOWS = [3,6,12]
LAG_HRS = range(1,13)
API_WINDOW = 24*7
PERCENTILE = 0.92
SEED = 42
TEST_FRAC = 0.30   # 70/30 split
MY_THR = {'D08A071':0.5, 'D08A084':0.5, 'D08A115':0.95}


In [32]:
def load_station(code):
    csv = DATA_RAW  + code + '.csv'
    df = pd.read_csv(csv)
    df['datetime'] = pd.to_datetime(df['saatlik'])
    df = (df.rename(columns={'yagis_toplam':'rain_mm','qdeger':'discharge_cms'})
            .set_index('datetime').sort_index()
            .resample('h').agg({'rain_mm':'sum','discharge_cms':'mean'}))
    return df

def build_features(df):
    st = df.copy()
    for w in ROLL_WINDOWS:
        st[f'rain_sum_{w}h'] = st['rain_mm'].rolling(w,1).sum()
    for l in LAG_HRS:
        st[f'dis_lag_{l}h'] = st['discharge_cms'].shift(l)
    st['dis_rate_1h'] = st['discharge_cms'].diff(1)
    st['dis_rate_3h'] = st['discharge_cms'].diff(3)
    st['API7'] = st['rain_mm'].rolling(API_WINDOW,1).sum()
    st.dropna(inplace=True)
    thr = st['discharge_cms'].quantile(PERCENTILE)
    st['flood'] = (st['discharge_cms'] > thr).astype(int)
    return st


In [33]:
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=SEED, class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
    "Logistic Reg": LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED, class_weight='balanced'),
    "KNN": KNeighborsClassifier(),
    # "SVM": SVC(probability=True, random_state=SEED, class_weight='balanced'),
    "Naive Bayes": GaussianNB(),
    "LDA": LinearDiscriminantAnalysis(),
    "MLP": MLPClassifier(max_iter=800, random_state=SEED),
    "LSTM": Sequential([
        LSTM(100, input_shape=(None, 18)),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ]),
    "LSTM_ES": Sequential([
        LSTM(100, input_shape=(None, 18)),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ]),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED, scale_pos_weight=1),
    "XGB_gpu": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED, scale_pos_weight=1, device='gpu', predictor='gpu_predictor'),
    "LightGBM": LGBMClassifier(random_state=SEED, class_weight='balanced', device='gpu'),
    "LightGBM RF": LGBMClassifier(
        random_state=SEED, 
        class_weight='balanced', 
        device='gpu', 
        boosting_type='rf', 
        n_estimators=1000, 
        num_leaves=31, 
        subsample=0.8, 
        colsample_bytree=0.8, 
        reg_alpha=0.1, 
        reg_lambda=0.1, 
        bagging_fraction=0.8, 
        bagging_freq=5, 
        min_child_samples=20, 
        n_jobs=-1
    )
}


  super().__init__(**kwargs)


In [34]:
def is_fitted(model):
    try:
        check_is_fitted(model)
        return True
    except Exception:
        return False

In [35]:
# Combined dataset – random stratified 70 / 30 split + confusion matrix
frames = [build_features(load_station(c)) for c in STATIONS]
combined = pd.concat(frames)
feat_cols = [c for c in combined.columns if c.startswith(('rain_sum','dis_lag','dis_rate','API'))]
X_comb, y_comb = combined[feat_cols], combined['flood']

X_tr, X_te, y_tr, y_te = train_test_split(
    X_comb, y_comb, test_size=TEST_FRAC, random_state=SEED, stratify=y_comb)


In [None]:
rows = []
print(f"Training {len(classifiers)} classifiers...")
for name, clf in classifiers.items():
   
    start_time = time.time()
    print(f"Training {name}...")
    if not is_fitted(clf):
        if name in ["LSTM", "LSTM_ES"]:
            # Reshape for LSTM input
            clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            X_tr_lstm = X_tr.values.reshape((X_tr.shape[0], 1, X_tr.shape[1]))
            X_te_lstm = X_te.values.reshape((X_te.shape[0], 1, X_te.shape[1]))
            if name == "LSTM_ES":
                # Use EarlyStopping for LSTM with validation split and early stopping
                early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
                clf.fit(X_tr_lstm, y_tr, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stop])
            else:
                # Fit LSTM without validation split or early stopping
                clf.fit(X_tr_lstm, y_tr, epochs=18, batch_size=16)
            prob = clf.predict(X_te_lstm).flatten()
        else:
            if name in ["XGBoost", "XGB_gpu"]:
                # Fit XGBoost and LightGBM classifiers
                clf.fit(X_tr, y_tr, eval_set=[(X_te, y_te)])
                prob = clf.predict_proba(X_te)[:, 1]
            else:
                if name in ["LightGBM", "LightGBM RF"]:
                    # Fit LightGBM classifiers
                    clf.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], eval_metric='binary_logloss')
                    prob = clf.predict_proba(X_te)[:, 1]
                else:
                    # Fit other classifiers    
                    clf.fit(X_tr, y_tr)
                    prob  = clf.predict_proba(X_te)[:, 1] if hasattr(clf, "predict_proba") else clf.decision_function(X_te)
    else:
        print(f"{name} is already fitted, skipping training.")
        if name in ["LSTM", "LSTM_ES"]:
            X_te_lstm = X_te.values.reshape((X_te.shape[0], 1, X_te.shape[1]))
            prob = clf.predict(X_te_lstm).flatten()
        else:
            prob = clf.predict_proba(X_te)[:, 1] if hasattr(clf, "predict_proba") else clf.decision_function(X_te)
    pred  = (prob > 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_te, pred, labels=[0, 1]).ravel()

    rows.append({
        'Model': name,
        'Accuracy': round(accuracy_score(y_te, pred), 3),
        'Precision': round(precision_score(y_te, pred), 3),
        'Recall': round(recall_score(y_te, pred), 3),
        'PR-AUC': round(average_precision_score(y_te, prob), 3),
        'ConfMatrix': f"[[{tn} {fp}] [{fn} {tp}]]"
    })
    elapsed_time = time.time() - start_time
    print(f"{name} - Time: {elapsed_time:.2f}s, Accuracy: {rows[-1]['Accuracy']}, ")

(pd.DataFrame(rows)
   .set_index('Model')
   .sort_values('PR-AUC', ascending=False))

Training 15 classifiers...
Training Random Forest...
Random Forest - Time: 9.39s, Accuracy: 0.998, 
Training Gradient Boosting...
Gradient Boosting - Time: 8.81s, Accuracy: 0.993, 
Training AdaBoost...
AdaBoost - Time: 2.34s, Accuracy: 0.983, 
Training Logistic Reg...
Logistic Reg - Time: 0.75s, Accuracy: 0.908, 
Training Decision Tree...
Decision Tree - Time: 0.28s, Accuracy: 0.996, 
Training KNN...
KNN - Time: 1.61s, Accuracy: 0.986, 
Training Naive Bayes...
Naive Bayes - Time: 0.06s, Accuracy: 0.925, 
Training LDA...
LDA - Time: 0.11s, Accuracy: 0.95, 
Training MLP...
MLP - Time: 32.14s, Accuracy: 0.99, 
Training LSTM...
Epoch 1/18
[1m3477/3477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.9619 - loss: 0.1247
Epoch 2/18
[1m3477/3477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9679 - loss: 0.0974
Epoch 3/18
[1m3477/3477[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - accuracy: 0.9702 - loss: 0.0883
E

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[27]	validation_0-logloss:0.01448
[28]	validation_0-logloss:0.01436
[28]	validation_0-logloss:0.01436
[29]	validation_0-logloss:0.01406
[30]	validation_0-logloss:0.01383
[29]	validation_0-logloss:0.01406
[30]	validation_0-logloss:0.01383
[31]	validation_0-logloss:0.01359
[32]	validation_0-logloss:0.01347
[31]	validation_0-logloss:0.01359
[32]	validation_0-logloss:0.01347
[33]	validation_0-logloss:0.01318
[33]	validation_0-logloss:0.01318
[34]	validation_0-logloss:0.01296
[35]	validation_0-logloss:0.01277
[36]	validation_0-logloss:0.01241
[34]	validation_0-logloss:0.01296
[35]	validation_0-logloss:0.01277
[36]	validation_0-logloss:0.01241
[37]	validation_0-logloss:0.01232
[38]	validation_0-logloss:0.01224
[39]	validation_0-logloss:0.01210
[37]	validation_0-logloss:0.01232
[38]	validation_0-logloss:0.01224
[39]	validation_0-logloss:0.01210
[40]	validation_0-logloss:0.01202
[41]	validation_0-logloss:0.01191
[42]	validation_0-logloss:0.01179
[40]	validation_0-logloss:0.01202
[41]	validatio

Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[9]	validation_0-logloss:0.02574
[10]	validation_0-logloss:0.02388
[10]	validation_0-logloss:0.02388
[11]	validation_0-logloss:0.02245
[12]	validation_0-logloss:0.02117
[11]	validation_0-logloss:0.02245
[12]	validation_0-logloss:0.02117
[13]	validation_0-logloss:0.02008
[13]	validation_0-logloss:0.02008
[14]	validation_0-logloss:0.01895
[14]	validation_0-logloss:0.01895
[15]	validation_0-logloss:0.01853
[16]	validation_0-logloss:0.01813
[15]	validation_0-logloss:0.01853
[16]	validation_0-logloss:0.01813
[17]	validation_0-logloss:0.01771
[18]	validation_0-logloss:0.01712
[19]	validation_0-logloss:0.01674
[17]	validation_0-logloss:0.01771
[18]	validation_0-logloss:0.01712
[19]	validation_0-logloss:0.01674
[20]	validation_0-logloss:0.01652
[21]	validation_0-logloss:0.01625
[22]	validation_0-logloss:0.01572
[20]	validation_0-logloss:0.01652
[21]	validation_0-logloss:0.01625
[22]	validation_0-logloss:0.01572
[23]	validation_0-logloss:0.01542
[24]	validation_0-logloss:0.01511
[25]	validation

In [None]:
from sklearn.ensemble import VotingClassifier

# Seleciona todos os classificadores scikit-learn (exclui LSTM e LSTM_ES)
ensemble_names = [k for k in classifiers.keys() if k not in ["LSTM", "LSTM_ES"]]
voting_estimators = [(name, classifiers[name]) for name in ensemble_names]

voting = VotingClassifier(estimators=voting_estimators, voting='soft')

# Treinamento
voting.fit(X_tr, y_tr)

# Predição e avaliação
prob = voting.predict_proba(X_te)[:, 1]
pred = (prob > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, average_precision_score, confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_te, pred, labels=[0, 1]).ravel()
print(f"VotingClassifier - Accuracy: {accuracy_score(y_te, pred):.3f}, Precision: {precision_score(y_te, pred):.3f}, Recall: {recall_score(y_te, pred):.3f}, PR-AUC: {average_precision_score(y_te, prob):.3f}")
print(f"ConfMatrix: [[{tn} {fp}] [{fn} {tp}]]")

In [None]:
""" from IPython.display import display

for code in STATIONS:
    df = build_features(load_station(code))
    X, y = df[feat_cols], df['flood']
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=TEST_FRAC, random_state=SEED, stratify=y)

    print(f"### {code}")
    tbl = []

    plt.figure(figsize=(5, 4))
    for name, clf in classifiers.items():
        if name in ["LSTM", "LSTM_ES"]:
            # Reshape for LSTM input
            X_tr_lstm = X_tr.values.reshape((X_tr.shape[0], 1, X_tr.shape[1]))
            X_te_lstm = X_te.values.reshape((X_te.shape[0], 1, X_te.shape[1]))
            if name == "LSTM_ES":
                # Use EarlyStopping for LSTM with validation split
                early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
                clf.fit(X_tr_lstm, y_tr, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stop])
            else:
                # Fit LSTM without validation split
                clf.fit(X_tr_lstm, y_tr, epochs=18, batch_size=16)
            prob = clf.predict(X_te_lstm).flatten()
        else:
            if name in ["XGBoost", "XGB_gpu", "LightGBM", "LightGBM RF"]:
                # Fit XGBoost and LightGBM classifiers
                clf.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False)
                prob = clf.predict_proba(X_te)[:, 1]
            else:
                # Fit other classifiers    
                clf.fit(X_tr, y_tr)
                prob  = clf.predict_proba(X_te)[:, 1] if hasattr(clf, "predict_proba") else clf.decision_function(X_te)
        pr_auc = average_precision_score(y_te, prob)
        prec_curve, rec_curve, _ = precision_recall_curve(y_te, prob)
        plt.plot(rec_curve, prec_curve, lw=1, label=f"{name} (AP={pr_auc:.2f})")

        thr  = MY_THR[code]
        pred = (prob > thr).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_te, pred, labels=[0, 1]).ravel()

        tbl.append({
            'Model': name,
            'Precision': round(precision_score(y_te, pred, zero_division=1), 3),
            'Recall': round(recall_score(y_te, pred), 3),
            'PR-AUC': round(pr_auc, 3),
            'ConfMatrix': f"[[{tn} {fp}] [{fn} {tp}]]"
        })

    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.title(f"PR curve – {code}")
    plt.legend(fontsize="xx-small")
    plt.show()

    display(pd.DataFrame(tbl).set_index("Model").sort_values("PR-AUC", ascending=False)) """

' from IPython.display import display\n\nfor code in STATIONS:\n    df = build_features(load_station(code))\n    X, y = df[feat_cols], df[\'flood\']\n    X_tr, X_te, y_tr, y_te = train_test_split(\n        X, y, test_size=TEST_FRAC, random_state=SEED, stratify=y)\n\n    print(f"### {code}")\n    tbl = []\n\n    plt.figure(figsize=(5, 4))\n    for name, clf in classifiers.items():\n        if name in ["LSTM", "LSTM_ES"]:\n            # Reshape for LSTM input\n            X_tr_lstm = X_tr.values.reshape((X_tr.shape[0], 1, X_tr.shape[1]))\n            X_te_lstm = X_te.values.reshape((X_te.shape[0], 1, X_te.shape[1]))\n            if name == "LSTM_ES":\n                # Use EarlyStopping for LSTM with validation split\n                early_stop = EarlyStopping(monitor=\'val_loss\', patience=3, restore_best_weights=True)\n                clf.fit(X_tr_lstm, y_tr, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stop])\n            else:\n                # Fit LSTM without v