# Optimized Drought Prediction Model
## Using XGBoost with Hyperparameter Tuning & SMOTE for Class Balance

In [None]:
import warnings
from collections import Counter
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('USDMData.csv')
df = df.ffill()  # Forward fill missing values (leading gaps, if any, are left as-is)

# Separate features and labels.
# The target is resolved exactly once so that the feature matrix can never
# contain the column used as the label: if 'DroughtCategory' is absent the
# last column is the target and is excluded from the features as well.
if 'DroughtCategory' in df.columns:
    raw_labels = df['DroughtCategory']
    features = df.drop(columns=['DroughtCategory'])
else:
    raw_labels = df.iloc[:, -1]
    features = df.iloc[:, :-1].copy()  # copy: columns are re-assigned below

# Convert object columns to numeric (integer category codes)
for col in features.select_dtypes(include='object').columns:
    features[col] = features[col].astype('category').cat.codes

# Encode labels as consecutive integers; `le` is kept to map them back later
le = LabelEncoder()
labels = le.fit_transform(raw_labels)

# Train-test split (80/20, fixed seed for reproducibility)
# NOTE(review): the split is unstratified, so a rare class may land entirely
# in one partition — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

print("Original class distribution (training):", Counter(y_train))
print(f"Dataset shape: X_train={X_train.shape}, X_test={X_test.shape}")

              precision    recall  f1-score   support

           0       0.87      0.90      0.88        51
           1       0.89      0.87      0.88        63
           2       0.97      0.96      0.96        71
           3       0.40      0.29      0.33         7
           4       0.50      0.62      0.56         8

    accuracy                           0.88       200
   macro avg       0.73      0.73      0.72       200
weighted avg       0.88      0.88      0.88       200

Confusion Matrix:
 [[46  5  0  0  0]
 [ 5 55  2  0  1]
 [ 2  1 68  0  0]
 [ 0  1  0  2  4]
 [ 0  0  0  3  5]]


In [None]:
# Hyperparameter Tuning with GridSearchCV
# Exhaustive 5-fold cross-validated search over a small XGBoost grid,
# scored by accuracy and fitted on the training split only.
banner = "=" * 60
print("\n" + banner)
print("STEP 1: Hyperparameter Tuning with GridSearchCV")
print(banner)

# Candidate values per hyperparameter (2 options each -> 32 combinations)
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 7],
    learning_rate=[0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

base_estimator = XGBClassifier(random_state=42, verbosity=0)
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Keep the winning configuration for the final model trained in a later cell
best_model_params = grid_search.best_params_

print(f"\nBest Parameters: {best_model_params}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")



Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Best CV Score: 0.8888
Tuned Model Test Accuracy: 0.8750

Tuned Model Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89        51
           1       0.98      0.86      0.92        63
           2       0.85      0.94      0.89        71
           3       0.50      0.43      0.46         7
           4       0.62      0.62      0.62         8

    accuracy                           0.88       200
   macro avg       0.77      0.75      0.76       200
weighted avg       0.88      0.88      0.87       200

Tuned Confusion Matrix:
 [[46  0  5  0  0]
 [ 3 54  6  0  0]
 [ 3  1 67  0  0]
 [ 0  0  1  3  3]
 [ 0  0  0  3  5]]


In [None]:
# Apply SMOTE for Class Imbalance & Train Final Model
print("\n" + "="*60)
print("STEP 2: Handling Class Imbalance with SMOTE")
print("="*60)

# SMOTE synthesises a point by interpolating between a sample and one of its
# k nearest same-class neighbours, so a class needs more than k samples.
# A class with too few samples makes the neighbour search raise
# "Expected n_neighbors <= n_samples_fit"; such classes are left untouched.
SMOTE_K_NEIGHBORS = 1
class_counts = Counter(y_train)
majority_count = max(class_counts.values())

# Oversample every eligible minority class up to the majority class size
smote_targets = {
    cls: majority_count
    for cls, count in class_counts.items()
    if SMOTE_K_NEIGHBORS < count < majority_count
}
too_rare = sorted(int(cls) for cls, count in class_counts.items() if count <= SMOTE_K_NEIGHBORS)
if too_rare:
    print(f"Skipping SMOTE for classes with <= {SMOTE_K_NEIGHBORS} sample(s): {too_rare}")

# Apply SMOTE (only when at least one class can actually be resampled)
if smote_targets:
    smote = SMOTE(sampling_strategy=smote_targets, random_state=42, k_neighbors=SMOTE_K_NEIGHBORS)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
else:
    X_train_balanced, y_train_balanced = X_train, y_train

print("Original class distribution:", class_counts)
print("Balanced class distribution:", Counter(y_train_balanced))

# Train final model with best parameters on balanced data
print("\n" + "="*60)
print("STEP 3: Training Final Model")
print("="*60)

final_model = XGBClassifier(**best_model_params, random_state=42)
final_model.fit(X_train_balanced, y_train_balanced, verbose=0)

# Predictions on the untouched (never resampled) test split
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nFinal Model Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
# `labels` pins one row/column per encoded class, so the matrix shape always
# matches `le.classes_` even when a class is absent from y_test and y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=np.arange(len(le.classes_)))
print("\nConfusion Matrix:")
print(conf_matrix)

Original class distribution: Counter({np.int64(1): 280, np.int64(2): 254, np.int64(0): 186, np.int64(3): 40, np.int64(4): 39, np.int64(5): 1})


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 2, n_samples_fit = 1, n_samples = 1

In [None]:
# Confusion Matrix Visualization
print("\n" + "="*60)
print("STEP 4: Model Performance Visualization")
print("="*60)

# Output locations; created up front so saving does not fail on a fresh checkout
RESULTS_DIR = Path('../results')
MODELS_DIR = Path('../models')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): tick labels assume conf_matrix has one row/column per entry of
# le.classes_ — verify the matrix was built over all encoded classes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_,
            cbar_kws={'label': 'Count'}, ax=ax)
ax.set_title(f'Confusion Matrix - XGBoost\nAccuracy: {accuracy:.4f}', fontsize=14, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
fig.tight_layout()
fig.savefig(RESULTS_DIR / 'confusion_matrix.png', dpi=300, bbox_inches='tight')
print("\n✓ Confusion matrix saved to results/confusion_matrix.png")
plt.show()

# Feature Importance
print("\n" + "="*60)
print("Feature Importance Analysis")
print("="*60)

# Pair each training column with the importance reported by the fitted model,
# most important first
feature_importance = final_model.feature_importances_
feature_names = features.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10).to_string())

# Save model together with the label encoder needed to decode its predictions
joblib.dump(final_model, MODELS_DIR / 'drought_prediction_xgboost.pkl')
joblib.dump(le, MODELS_DIR / 'label_encoder.pkl')
print("\n✓ Model and encoder saved to models/")
print("\n" + "="*60)
print(f"✓ PROJECT COMPLETE - Final Accuracy: {accuracy:.4f}")
print("="*60)