In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')

def load_sample_data(csv_path, frac=0.1, random_state=42):
    df = pd.read_csv(csv_path)
    df = df.sample(frac=frac, random_state=random_state)
    return df

def feature_engineering(df):
    rainfall_cols = ['JANr', 'FEBr', 'MARr', 'APRr', 'MAYr', 'JUNr', 
                     'JULr', 'AUGr', 'SEPr', 'OCTr', 'NOVr', 'DECr']
    temp_cols = ['JANt', 'FEBt', 'MARt', 'APRt', 'MAYt', 'JUNt',
                 'JULt', 'AUGt', 'SEPt', 'OCTt', 'NOVt', 'DECt']
    df['total_annual_rainfall'] = df[rainfall_cols].sum(axis=1)
    df['max_monthly_rainfall'] = df[rainfall_cols].max(axis=1)
    df['temperature_range'] = df[temp_cols].max(axis=1) - df[temp_cols].min(axis=1)
    le = LabelEncoder()
    df['location_encoded'] = le.fit_transform(df['location'])
    return df, le

def select_features(df):
    exclude_cols = ['rockfall', 'rockfall_probability', 'location', 'longitude', 'latitude']
    feature_cols = [c for c in df.columns if c not in exclude_cols]
    numeric_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
    return numeric_cols

def train_models(X_train, y_train, X_test, y_test):
    models = {
        'ExtraTrees': (ExtraTreesClassifier(random_state=42), {
            'n_estimators': [100],
            'max_depth': [10],  # lower depth to avoid overfitting
            'min_samples_split': [5]
        }),
        'KNN': (KNeighborsClassifier(), {
            'n_neighbors': [5, 7],
            'weights': ['uniform']
        }),
        'LogisticRegression': (LogisticRegression(max_iter=500, random_state=42), {
            'C': [1],
            'penalty': ['l2'],
            'solver': ['liblinear']
        }),
    }
    best_model = None
    best_score = 0
    best_name = None
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    for name, (model, params) in models.items():
        print(f"Training {name}...")
        if name in ['KNN', 'LogisticRegression']:
            X_tr, X_te = X_train_scaled, X_test_scaled
        else:
            X_tr, X_te = X_train, X_test
        gs = GridSearchCV(model, params, cv=3, n_jobs=-1, scoring='accuracy')
        gs.fit(X_tr, y_train)
        y_pred = gs.predict(X_te)
        acc = accuracy_score(y_test, y_pred) * 100
        print(f"{name} best params: {gs.best_params_}")
        print(f"{name} Test accuracy: {acc:.2f}%")
        print("Classification report:\n", classification_report(y_test, y_pred))
        print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
        if acc > best_score and acc < 99.5:  # avoid overfitting/high scores
            best_score = acc
            best_model = gs.best_estimator_
            best_name = name
    print(f"\nSelected Best Model: {best_name}")
    print(f"Best Test Accuracy: {best_score:.2f}%")
    return best_model, best_name, scaler

def main():
    print("Loading sample data...")
    df = load_sample_data('airockfalldata.csv', frac=0.1)
    print("Feature engineering...")
    df, location_encoder = feature_engineering(df)
    feature_cols = select_features(df)
    X = df[feature_cols].fillna(0)
    y = (df['rockfall'] == 'Yes').astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    best_model, best_name, scaler = train_models(X_train, y_train, X_test, y_test)
    joblib.dump(best_model, 'models/rockfall_best_model.pkl')
    joblib.dump(scaler, 'models/scaler.pkl')
    joblib.dump(feature_cols, 'models/feature_names.pkl')
    joblib.dump(location_encoder, 'models/location_encoder.pkl')
    print(f"Model {best_name} saved successfully.")
if __name__ == "__main__":
    main()


Loading sample data...
Feature engineering...
Training ExtraTrees...
ExtraTrees best params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
ExtraTrees Test accuracy: 94.71%
Classification report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     14361
           1       1.00      0.82      0.90      5928

    accuracy                           0.95     20289
   macro avg       0.97      0.91      0.93     20289
weighted avg       0.95      0.95      0.95     20289

Confusion matrix:
 [[14360     1]
 [ 1072  4856]]
Training KNN...
KNN best params: {'n_neighbors': 5, 'weights': 'uniform'}
KNN Test accuracy: 98.95%
Classification report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     14361
           1       0.99      0.97      0.98      5928

    accuracy                           0.99     20289
   macro avg       0.99      0.98      0.99     20289
weighted avg  

FileNotFoundError: [Errno 2] No such file or directory: 'models/rockfall_best_model.pkl'