In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # <- use imblearn's pipeline

# Read the rockfall risk table from disk.
df = pd.read_csv('predicted_rockfall_risk.csv')

# The label to predict; it is pulled out of the frame before the feature
# matrix is built.
target_col = 'high_risk_flag'
y = df[target_col]

# Features are everything except the target and the two other rockfall
# columns, which are excluded from the model inputs.
X = df.drop(columns=[target_col, 'rockfall_risk', 'rockfall_event'])

# Partition the feature columns by dtype: numeric columns will be scaled and
# object-dtype columns will be one-hot encoded by the preprocessor.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

# Column-wise preprocessing:
#   * 'num' - standardize the numeric columns
#   * 'cat' - one-hot encode the object-dtype columns; handle_unknown='ignore'
#             keeps transform from failing on categories not seen during fit
# Columns belonging to neither group are passed through untouched.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',
)

# End-to-end estimator: preprocess -> SMOTE oversampling -> logistic regression.
# Pipeline here is imblearn's implementation (see the import), which is what
# allows a sampler such as SMOTE to appear as a step.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(solver='liblinear', C=0.001)),
])

# Hold out 20% of the rows as a test set. The split is stratified on the
# target so the train and test partitions keep the same class proportions;
# the pipeline oversamples with SMOTE, i.e. the classes are expected to be
# imbalanced, and an unstratified split could leave the held-out set with a
# distorted share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit pipeline (preprocess → SMOTE → train classifier) on the training
# partition only; X_test / y_test stay untouched for later evaluation.
full_pipeline.fit(X_train, y_train)

# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(full_pipeline, 'full_rockfall_pipeline.pkl')
print("✅ Pipeline trained and saved to 'full_rockfall_pipeline.pkl'")

# --- Example: Load & predict later ---
# model = joblib.load('full_rockfall_pipeline.pkl')
# predictions = model.predict(X_test)
# print(predictions)


✅ Pipeline trained and saved to 'full_rockfall_pipeline.pkl'
