In [2]:
# model_trainer.py
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

import warnings
# Ignore every warning category for the remainder of the run (presumably to
# keep the tuning output readable). NOTE: this is process-wide, so it also
# hides library warnings such as convergence or deprecation notices; remove it
# temporarily when debugging unexpected model behaviour.
warnings.filterwarnings('ignore')

# 1. Load the latest dataset
# A missing CSV is fatal for everything that follows, so report it and stop
# with a non-zero exit status.
try:
    df = pd.read_csv('state_specific_village_dataset.csv')
except FileNotFoundError:
    print("Error: 'state_specific_village_dataset.csv' not found. Please generate it first.")
    # `exit()` is an interactive helper injected by the `site` module; called
    # without arguments it reports success (status 0) and it is unavailable
    # when `site` is not loaded. Raising SystemExit works everywhere and
    # signals the failure to whatever launched the script.
    raise SystemExit(1)

# The village name is too granular for a general model, so it is removed;
# the coarser 'State' column is kept as a feature.
df = df.drop('Village', axis=1)

print("✅ Successfully loaded the state-specific village dataset.")

# Separate the target column from the predictor columns.
y = df['OutbreakStatus']
X = df.drop(columns=['OutbreakStatus'])

# 2. Identify column types
# The free-text column is named once and excluded from the categorical group,
# because it is vectorized separately instead of being one-hot encoded.
text_feature = 'CommunityNotes'

object_columns = X.select_dtypes(include=['object'])
categorical_features = object_columns.drop(columns=[text_feature]).columns.tolist()

numeric_columns = X.select_dtypes(include=np.number)
numeric_features = numeric_columns.columns.tolist()

boolean_columns = X.select_dtypes(include=['bool'])
boolean_features = boolean_columns.columns.tolist()

# 3. Create the preprocessors
# Structured columns: one-hot encode categoricals (categories unseen during
# fitting are ignored, binary categoricals collapse to a single column),
# standardize numerics and pass booleans through untouched. Any column not
# listed in one of the three groups is dropped.
categorical_encoder = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
numeric_scaler = StandardScaler()

structured_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoder, categorical_features),
        ('num', numeric_scaler, numeric_features),
        ('bool', 'passthrough', boolean_features),
    ],
    remainder='drop',
)

# Free-text column: TF-IDF over unigrams and bigrams, capped at 100 terms,
# with English stop words removed.
text_preprocessor = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=100,
    stop_words='english',
)

# Encode the target variable
# Class labels are mapped to integer codes; the fitted encoder is kept so the
# codes can be translated back to the original labels later on.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Split and preprocess the data
# The 80/20 split is stratified on the encoded target so both partitions keep
# the same class proportions. Every transformer is fitted on the training
# partition only and merely applied to the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
X_train_structured = structured_preprocessor.fit_transform(X_train)
X_test_structured = structured_preprocessor.transform(X_test)

# TfidfVectorizer rejects missing documents (NaN), so absent notes are replaced
# with empty strings before vectorizing. The same fill must be applied to any
# text passed to the saved vectorizer at prediction time.
X_train_text = text_preprocessor.fit_transform(X_train[text_feature].fillna(''))
X_test_text = text_preprocessor.transform(X_test[text_feature].fillna(''))

# Concatenate structured and text features column-wise; CSR is used for the
# final matrices handed to the estimators.
X_train_final = hstack([X_train_structured, X_train_text]).tocsr()
X_test_final = hstack([X_test_structured, X_test_text]).tocsr()

print("✅ Preprocessing complete. Final training data shape:", X_train_final.shape)

# 4. Define Models and Hyperparameter Grids
# Each candidate pairs an unfitted estimator ('model') with the grid of
# hyperparameter values ('params') to be searched for it.
logreg_grid = {
    'C': [0.1, 1.0, 10.0],
    'solver': ['liblinear'],
}
forest_grid = {
    'n_estimators': [100, 150],
    'max_depth': [10, 20],
    'min_samples_leaf': [1, 2],
}
lgbm_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.1, 0.05],
    'num_leaves': [31, 40],
}

# Insertion order matters: it is the order in which the models are tuned.
models_to_tune = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'params': logreg_grid,
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(n_jobs=-1, random_state=42),
        'params': forest_grid,
    },
    'LGBMClassifier': {
        'model': LGBMClassifier(verbosity=-1, random_state=42),
        'params': lgbm_grid,
    },
}

# 5. Perform Hyperparameter Tuning for all models
print("\n--- 🚀 Starting Hyperparameter Tuning for All Models ---")

# Search settings shared by every candidate: 3-fold CV (kept small for faster
# tuning), accuracy as the selection metric, progress output, all CPU cores.
search_settings = dict(cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

# Maps model name -> {'score': best mean CV accuracy, 'estimator': best estimator}
best_estimators = {}
for model_name, spec in models_to_tune.items():
    print(f"\n--- Tuning {model_name} ---")
    grid_search = GridSearchCV(spec['model'], spec['params'], **search_settings)
    grid_search.fit(X_train_final, y_train)

    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best CV Accuracy for {model_name}: {grid_search.best_score_:.4f}")

    # Keep the winning configuration and its score for the comparison step.
    best_estimators[model_name] = dict(
        score=grid_search.best_score_,
        estimator=grid_search.best_estimator_,
    )

# 6. Select the Overall Best Model
print("\n--- 🏆 Identifying the Best Overall Model ---")

# Pick the entry with the highest cross-validation score; on a tie the model
# that was tuned first wins.
best_model_name, winner = max(best_estimators.items(), key=lambda item: item[1]['score'])
best_model = winner['estimator']
best_score = winner['score']

print(f"The best overall model is: **{best_model_name}** with a cross-validation accuracy of {best_score:.4f}")

# 7. Final Evaluation on the Unseen Test Set
print("\n--- 🧪 Final Evaluation on Test Set ---")

# Score the selected model once on the held-out partition.
y_pred = best_model.predict(X_test_final)
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Final Test Set Accuracy of {best_model_name}: {final_accuracy:.4f}")

# Per-class metrics, labelled with the original class names taken from the
# fitted label encoder.
print("\nClassification Report on Test Set:")
test_report = classification_report(y_test, y_pred, target_names=le.classes_)
print(test_report)

# 8. Save all components
print(f"\n--- 💾 Saving the best model '{best_model_name}' and all pipeline components ---")

# Everything needed to reproduce predictions: the model, both fitted
# preprocessors and the label encoder, each pickled to its own file.
artifacts = {
    'best_model.pkl': best_model,
    'structured_preprocessor.pkl': structured_preprocessor,
    'text_preprocessor.pkl': text_preprocessor,
    'label_encoder.pkl': le,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print("\n✅ All pipeline components have been saved successfully.")

✅ Successfully loaded the state-specific village dataset.
✅ Preprocessing complete. Final training data shape: (4000, 79)

--- 🚀 Starting Hyperparameter Tuning for All Models ---

--- Tuning LogisticRegression ---
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Parameters for LogisticRegression: {'C': 10.0, 'solver': 'liblinear'}
Best CV Accuracy for LogisticRegression: 0.8300

--- Tuning RandomForestClassifier ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Parameters for RandomForestClassifier: {'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 150}
Best CV Accuracy for RandomForestClassifier: 0.8078

--- Tuning LGBMClassifier ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Parameters for LGBMClassifier: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 31}
Best CV Accuracy for LGBMClassifier: 0.8175

--- 🏆 Identifying the Best Overall Model ---
The best overall model is: **LogisticRegression** with a cross-vali