In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

def load_data(csv_path, frac=0.1, random_state=42):
    df = pd.read_csv(csv_path)
    # Use only a fraction of the dataset for training/testing
    df = df.sample(frac=frac, random_state=random_state)
    return df

def feature_engineering(df):
    rainfall_cols = ['JANr', 'FEBr', 'MARr', 'APRr', 'MAYr', 'JUNr',
                     'JULr', 'AUGr', 'SEPr', 'OCTr', 'NOVr', 'DECr']
    temp_cols = ['JANt', 'FEBt', 'MARt', 'APRt', 'MAYt', 'JUNt',
                 'JULt', 'AUGt', 'SEPt', 'OCTt', 'NOVt', 'DECt']
    df['total_annual_rainfall'] = df[rainfall_cols].sum(axis=1)
    df['max_monthly_rainfall'] = df[rainfall_cols].max(axis=1)
    df['temperature_range'] = df[temp_cols].max(axis=1) - df[temp_cols].min(axis=1)
    le = LabelEncoder()
    df['location_encoded'] = le.fit_transform(df['location'])
    return df, le

def select_features(df):
    exclude_cols = ['rockfall', 'rockfall_probability', 'location', 'longitude', 'latitude']
    feature_cols = [c for c in df.columns if c not in exclude_cols]
    numeric_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
    return numeric_cols

def evaluate_and_stack(X_train, y_train, X_test, y_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 1. Fit and evaluate each model individually
    models = {
        'ExtraTrees': ExtraTreesClassifier(n_estimators=50, max_depth=5, min_samples_split=10, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=301, weights='uniform'),  # <-- K set to 99 here
        'LogisticRegression': LogisticRegression(max_iter=1500, penalty='l2', solver='liblinear', random_state=42)
    }

    individual_results = {}
    for model_name, model in models.items():
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        individual_results[model_name] = acc
        print(f"{model_name} Test Accuracy: {acc:.4f}")

    # 2. Stacking ensemble using performance-weighted meta learner
    estimators = [
        ('extratrees', models['ExtraTrees']),
        ('knn', models['KNN']),
        ('lr', models['LogisticRegression'])
    ]
    meta_learner = LogisticRegression()
    stacking = StackingClassifier(estimators=estimators, final_estimator=meta_learner, cv=5, n_jobs=-1)
    stacking.fit(X_train_scaled, y_train)
    y_pred_stack = stacking.predict(X_test_scaled)
    acc_stack = accuracy_score(y_test, y_pred_stack)
    print("\nStacking Ensemble Test Accuracy: {:.4f}".format(acc_stack))
    print("Precision:", precision_score(y_test, y_pred_stack))
    print("Recall:", recall_score(y_test, y_pred_stack))
    print("Classification Report:\n", classification_report(y_test, y_pred_stack, digits=4))
    print("Meta Learner Coefs (weights):", stacking.final_estimator_.coef_)
    print("Meta Learner Intercept (bias):", stacking.final_estimator_.intercept_)
    return stacking, scaler, individual_results

def main():
    df = load_data('airockfalldata.csv', frac=0.1)  # <-- fraction of random data here
    df, location_encoder = feature_engineering(df)
    feature_cols = select_features(df)
    X = df[feature_cols].fillna(0)
    y = (df['rockfall'] == 'Yes').astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42, stratify=y)
    model, scaler, individual_results = evaluate_and_stack(X_train, y_train, X_test, y_test)
    # Save outputs
    joblib.dump(model, 'models/rockfall_stacking_model.pkl')
    joblib.dump(scaler, 'models/scaler.pkl')
    joblib.dump(feature_cols, 'models/feature_names.pkl')
    joblib.dump(location_encoder, 'models/location_encoder.pkl')
    joblib.dump(individual_results, 'models/individual_model_accuracies.pkl')
    print("Model, scaler, and individual accuracies saved.")

if __name__ == "__main__":
    main()


ExtraTrees Test Accuracy: 0.8588
KNN Test Accuracy: 0.9511
LogisticRegression Test Accuracy: 0.9478

Stacking Ensemble Test Accuracy: 0.9616
Precision: 0.9528501055594651
Recall: 0.9136302294197031
Classification Report:
               precision    recall  f1-score   support

           0     0.9649    0.9813    0.9731     14361
           1     0.9529    0.9136    0.9328      5928

    accuracy                         0.9616     20289
   macro avg     0.9589    0.9475    0.9529     20289
weighted avg     0.9614    0.9616    0.9613     20289

Meta Learner Coefs (weights): [[3.77494531 5.1772793  5.03719386]]
Meta Learner Intercept (bias): [-5.58089357]


FileNotFoundError: [Errno 2] No such file or directory: 'models/rockfall_stacking_model.pkl'