In [8]:
import pandas as pd, numpy as np
import time
from pathlib import Path
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, average_precision_score, precision_recall_curve)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [9]:
# ── CONFIG ─────────────────────────────────────────────────────
PROJECT_ROOT = '../'
DATA_RAW = PROJECT_ROOT+'data/raw/'
STATIONS = ['D08A071','D08A084','D08A115']
ROLL_WINDOWS = [3,6,12]
LAG_HRS = range(1,13)
API_WINDOW = 24*7
PERCENTILE = 0.92
SEED = 42
TEST_FRAC = 0.30   # 70/30 split
MY_THR = {'D08A071':0.5, 'D08A084':0.5, 'D08A115':0.95}


In [10]:
def load_station(code):
    csv = DATA_RAW  + code + '.csv'
    df = pd.read_csv(csv)
    df['datetime'] = pd.to_datetime(df['saatlik'])
    df = (df.rename(columns={'yagis_toplam':'rain_mm','qdeger':'discharge_cms'})
            .set_index('datetime').sort_index()
            .resample('h').agg({'rain_mm':'sum','discharge_cms':'mean'}))
    return df

def build_features(df):
    st = df.copy()
    for w in ROLL_WINDOWS:
        st[f'rain_sum_{w}h'] = st['rain_mm'].rolling(w,1).sum()
    for l in LAG_HRS:
        st[f'dis_lag_{l}h'] = st['discharge_cms'].shift(l)
    st['dis_rate_1h'] = st['discharge_cms'].diff(1)
    st['dis_rate_3h'] = st['discharge_cms'].diff(3)
    st['API7'] = st['rain_mm'].rolling(API_WINDOW,1).sum()
    st.dropna(inplace=True)
    thr = st['discharge_cms'].quantile(PERCENTILE)
    st['flood'] = (st['discharge_cms'] > thr).astype(int)
    return st


In [15]:
classifiers = {
    # "Random Forest": RandomForestClassifier(n_estimators=200, random_state=SEED, class_weight='balanced'),
    # "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    # "AdaBoost": AdaBoostClassifier(random_state=SEED),
    # "Logistic Reg": LogisticRegression(max_iter=1000, random_state=SEED, class_weight='balanced'),
    # "Decision Tree": DecisionTreeClassifier(random_state=SEED, class_weight='balanced'),
    # "KNN": KNeighborsClassifier(),
    # # "SVM": SVC(probability=True, random_state=SEED, class_weight='balanced'),
    # "Naive Bayes": GaussianNB(),
    # "LDA": LinearDiscriminantAnalysis(),
    # "MLP": MLPClassifier(max_iter=800, random_state=SEED),
    # "LSTM": Sequential([
    #     LSTM(100, input_shape=(None, 18)),
    #     Dropout(0.2),
    #     Dense(1, activation='sigmoid')
    # ]),
    # "LSTM_ES": Sequential([
    #     LSTM(100, input_shape=(None, 18)),
    #     Dropout(0.2),
    #     Dense(1, activation='sigmoid')
    # ]),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED, scale_pos_weight=1),
    "XGB_gpu": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED, scale_pos_weight=1, device='gpu', predictor='gpu_predictor'),
    "LightGBM": LGBMClassifier(random_state=SEED, class_weight='balanced', device='gpu'),
    "LightGBM RF": LGBMClassifier(
        random_state=SEED, 
        class_weight='balanced', 
        device='gpu', 
        boosting_type='rf', 
        n_estimators=1000, 
        num_leaves=31, 
        subsample=0.8, 
        colsample_bytree=0.8, 
        reg_alpha=0.1, 
        reg_lambda=0.1, 
        bagging_fraction=0.8, 
        bagging_freq=5, 
        min_child_samples=20, 
        n_jobs=-1
    )
}


In [12]:
def is_fitted(model):
    try:
        check_is_fitted(model)
        return True
    except Exception:
        return False

In [13]:
# Combined dataset – random stratified 70 / 30 split + confusion matrix
frames = [build_features(load_station(c)) for c in STATIONS]
combined = pd.concat(frames)
feat_cols = [c for c in combined.columns if c.startswith(('rain_sum','dis_lag','dis_rate','API'))]
X_comb, y_comb = combined[feat_cols], combined['flood']

X_tr, X_te, y_tr, y_te = train_test_split(
    X_comb, y_comb, test_size=TEST_FRAC, random_state=SEED, stratify=y_comb)


In [16]:
rows = []
print(f"Training {len(classifiers)} classifiers...")
for name, clf in classifiers.items():
   
    start_time = time.time()
    print(f"Training {name}...")
    if not is_fitted(clf):
        if name in ["LSTM", "LSTM_ES"]:
            # Reshape for LSTM input
            clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            X_tr_lstm = X_tr.values.reshape((X_tr.shape[0], 1, X_tr.shape[1]))
            X_te_lstm = X_te.values.reshape((X_te.shape[0], 1, X_te.shape[1]))
            if name == "LSTM_ES":
                # Use EarlyStopping for LSTM with validation split and early stopping
                early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
                clf.fit(X_tr_lstm, y_tr, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stop])
            else:
                # Fit LSTM without validation split or early stopping
                clf.fit(X_tr_lstm, y_tr, epochs=18, batch_size=16)
            prob = clf.predict(X_te_lstm).flatten()
        else:
            if name in ["XGBoost", "XGB_gpu"]:
                # Fit XGBoost and LightGBM classifiers
                clf.fit(X_tr, y_tr, eval_set=[(X_te, y_te)])
                prob = clf.predict_proba(X_te)[:, 1]
            else:
                if name in ["LightGBM", "LightGBM RF"]:
                    # Fit LightGBM classifiers
                    clf.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], eval_metric='binary_logloss')
                    prob = clf.predict_proba(X_te)[:, 1]
                else:
                    # Fit other classifiers    
                    clf.fit(X_tr, y_tr)
                    prob  = clf.predict_proba(X_te)[:, 1] if hasattr(clf, "predict_proba") else clf.decision_function(X_te)
    else:
        print(f"{name} is already fitted, skipping training.")
        if name in ["LSTM", "LSTM_ES"]:
            X_te_lstm = X_te.values.reshape((X_te.shape[0], 1, X_te.shape[1]))
            prob = clf.predict(X_te_lstm).flatten()
        else:
            prob = clf.predict_proba(X_te)[:, 1] if hasattr(clf, "predict_proba") else clf.decision_function(X_te)
    pred  = (prob > 0.5).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_te, pred, labels=[0, 1]).ravel()

    rows.append({
        'Model': name,
        'Accuracy': round(accuracy_score(y_te, pred), 3),
        'Precision': round(precision_score(y_te, pred), 3),
        'Recall': round(recall_score(y_te, pred), 3),
        'PR-AUC': round(average_precision_score(y_te, prob), 3),
        'ConfMatrix': f"[[{tn} {fp}] [{fn} {tp}]]"
    })
    elapsed_time = time.time() - start_time
    print(f"{name} - Time: {elapsed_time:.2f}s, Accuracy: {rows[-1]['Accuracy']}, ")

(pd.DataFrame(rows)
   .set_index('Model')
   .sort_values('PR-AUC', ascending=False))

Training 4 classifiers...
Training XGBoost...
[0]	validation_0-logloss:0.11091
[1]	validation_0-logloss:0.08581
[2]	validation_0-logloss:0.06840
[3]	validation_0-logloss:0.05553
[4]	validation_0-logloss:0.04729
[5]	validation_0-logloss:0.03969
[6]	validation_0-logloss:0.03467
[7]	validation_0-logloss:0.03089
[8]	validation_0-logloss:0.02787
[9]	validation_0-logloss:0.02538
[10]	validation_0-logloss:0.02372
[11]	validation_0-logloss:0.02246
[12]	validation_0-logloss:0.02165
[13]	validation_0-logloss:0.02055
[14]	validation_0-logloss:0.01980
[15]	validation_0-logloss:0.01942
[16]	validation_0-logloss:0.01888
[17]	validation_0-logloss:0.01838
[18]	validation_0-logloss:0.01788
[19]	validation_0-logloss:0.01746
[20]	validation_0-logloss:0.01689
[21]	validation_0-logloss:0.01647
[22]	validation_0-logloss:0.01613
[23]	validation_0-logloss:0.01580
[24]	validation_0-logloss:0.01553
[25]	validation_0-logloss:0.01510
[26]	validation_0-logloss:0.01484


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[27]	validation_0-logloss:0.01448
[28]	validation_0-logloss:0.01436
[29]	validation_0-logloss:0.01406
[30]	validation_0-logloss:0.01383
[31]	validation_0-logloss:0.01359
[32]	validation_0-logloss:0.01347
[33]	validation_0-logloss:0.01318
[34]	validation_0-logloss:0.01296
[35]	validation_0-logloss:0.01277
[36]	validation_0-logloss:0.01241
[37]	validation_0-logloss:0.01232
[38]	validation_0-logloss:0.01224
[39]	validation_0-logloss:0.01210
[40]	validation_0-logloss:0.01202
[41]	validation_0-logloss:0.01191
[42]	validation_0-logloss:0.01179
[43]	validation_0-logloss:0.01168
[44]	validation_0-logloss:0.01166
[45]	validation_0-logloss:0.01154
[46]	validation_0-logloss:0.01145
[47]	validation_0-logloss:0.01137
[48]	validation_0-logloss:0.01128
[49]	validation_0-logloss:0.01120
[50]	validation_0-logloss:0.01106
[51]	validation_0-logloss:0.01101
[52]	validation_0-logloss:0.01088
[53]	validation_0-logloss:0.01076
[54]	validation_0-logloss:0.01068
[55]	validation_0-logloss:0.01063
[56]	validatio

Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[6]	validation_0-logloss:0.03463
[7]	validation_0-logloss:0.03090
[8]	validation_0-logloss:0.02825
[9]	validation_0-logloss:0.02574
[10]	validation_0-logloss:0.02388
[11]	validation_0-logloss:0.02245
[12]	validation_0-logloss:0.02117
[13]	validation_0-logloss:0.02008
[14]	validation_0-logloss:0.01895
[15]	validation_0-logloss:0.01853
[16]	validation_0-logloss:0.01813
[17]	validation_0-logloss:0.01771
[18]	validation_0-logloss:0.01712
[19]	validation_0-logloss:0.01674
[20]	validation_0-logloss:0.01652
[21]	validation_0-logloss:0.01625
[22]	validation_0-logloss:0.01572
[23]	validation_0-logloss:0.01542
[24]	validation_0-logloss:0.01511
[25]	validation_0-logloss:0.01485
[26]	validation_0-logloss:0.01460
[27]	validation_0-logloss:0.01444
[28]	validation_0-logloss:0.01419
[29]	validation_0-logloss:0.01405
[30]	validation_0-logloss:0.01382
[31]	validation_0-logloss:0.01366
[32]	validation_0-logloss:0.01350
[33]	validation_0-logloss:0.01337
[34]	validation_0-logloss:0.01331
[35]	validation_0-

Unnamed: 0_level_0,Accuracy,Precision,Recall,PR-AUC,ConfMatrix
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LightGBM,0.997,0.975,0.986,0.999,[[21969 46] [26 1797]]
XGBoost,0.997,0.989,0.97,0.997,[[21996 19] [55 1768]]
XGB_gpu,0.997,0.989,0.967,0.997,[[21996 19] [61 1762]]
LightGBM RF,0.979,0.789,0.99,0.956,[[21532 483] [19 1804]]


In [14]:
from sklearn.ensemble import VotingClassifier

# Select classifiers for ensemble voting
# Exclude LSTM and LSTM_ES as they are not compatible with VotingClassifier
ensemble_names = [k for k in classifiers.keys() if k not in ["LSTM", "LSTM_ES"]]
voting_estimators = [(name, classifiers[name]) for name in ensemble_names]

voting = VotingClassifier(estimators=voting_estimators, voting='soft')

# Training
voting.fit(X_tr, y_tr)

#Prediction and evaluation
prob = voting.predict_proba(X_te)[:, 1]
pred = (prob > 0.5).astype(int)

from sklearn.metrics import accuracy_score, precision_score, recall_score, average_precision_score, confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_te, pred, labels=[0, 1]).ravel()
print(f"VotingClassifier - Accuracy: {accuracy_score(y_te, pred):.3f}, Precision: {precision_score(y_te, pred):.3f}, Recall: {recall_score(y_te, pred):.3f}, PR-AUC: {average_precision_score(y_te, prob):.3f}")
print(f"ConfMatrix: [[{tn} {fp}] [{fn} {tp}]]")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 4252, number of negative: 51369
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 55621, number of used features: 18
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1650, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 13 dense feature groups (0.85 MB) transferred to GPU in 0.002604 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 4252, number of negative: 51369
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 55621, number of used features: 18
[LightGBM] [Info] Using GPU 

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


VotingClassifier - Accuracy: 0.996, Precision: 0.986, Recall: 0.966, PR-AUC: 0.997
ConfMatrix: [[21990 25] [62 1761]]
