# 🌧️ Rainfall Prediction with Advanced EDA, Preprocessing, Ensemble Models and Optimization
In this notebook, we explore, preprocess, model, and optimize predictions for the 2025 Kaggle Playground Series (S5E3) rainfall prediction challenge. We engineer additional features, combine three gradient-boosting base models (LightGBM, XGBoost, CatBoost) through a stacking ensemble with a logistic-regression meta-model, and evaluate the result on a held-out validation split with the ROC AUC score, a ROC curve, and a confusion matrix.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# All three competition files are read from the same Kaggle input directory.
INPUT_DIR = '/kaggle/input/playground-series-s5e3'

# Load, in order: the training frame, the test frame and the sample submission.
train, test, submission = (
    pd.read_csv(f'{INPUT_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

## 📊 Exploratory Data Analysis (EDA)

In [3]:
# Bar chart of the row count for each value of the `rainfall` column.
count_ax = sns.countplot(data=train, x='rainfall')
count_ax.set_title('Rainfall Distribution')
plt.show()

# Pairwise correlations between all columns except the `id` column,
# annotated with two-decimal values.
corr_matrix = train.drop(columns=['id']).corr()
heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Feature Correlation Heatmap')
plt.show()

## 🛠️ Feature Engineering & Preprocessing

In [4]:
def add_weather_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with the two engineered columns added.

    Adds ``temp_range`` (``maxtemp - mintemp``) and
    ``humidity_pressure_ratio`` (``humidity / pressure``). The input frame is
    not modified; ``DataFrame.assign`` returns a new object.

    Args:
        frame: Frame containing ``maxtemp``, ``mintemp``, ``humidity`` and
            ``pressure`` columns.

    Returns:
        A frame with the original columns plus the two engineered ones.

    Raises:
        KeyError: If any of the four source columns is missing.
    """
    return frame.assign(
        temp_range=frame['maxtemp'] - frame['mintemp'],
        humidity_pressure_ratio=frame['humidity'] / frame['pressure'],
    )


# Run both splits through the same helper so the train and test feature
# definitions cannot drift apart.
train = add_weather_features(train)
test = add_weather_features(test)

# Features are every training column except the row id and the target.
X = train.drop(columns=['id', 'rainfall'])
y = train['rainfall']

# Select the test features by name, in training-column order, so every feature
# reaches the scaler and the models in the same position as during fitting.
# A column missing from the test frame raises KeyError here.
X_test = test[X.columns]

# NOTE(review): the scaler is fitted on all training rows, i.e. before the
# train/validation split performed in the next cell, so the validation rows
# contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

## 🔍 Train/Validation Split

In [5]:
# Hold out 20% of the scaled training rows for validation; the split is
# stratified on the target and seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## 🌲 Base Models and Meta-Model

In [6]:
# Settings shared by the three gradient-boosting base learners.
SEED = 42
N_BOOST_ROUNDS = 500
LEARNING_RATE = 0.05

lgb = LGBMClassifier(
    n_estimators=N_BOOST_ROUNDS,
    learning_rate=LEARNING_RATE,
    random_state=SEED,
)

# `use_label_encoder` is intentionally not passed: XGBoost deprecated the
# argument and current releases no longer use it.
# NOTE(review): assumes the target is already encoded as 0/1 — confirm.
xgb = XGBClassifier(
    n_estimators=N_BOOST_ROUNDS,
    learning_rate=LEARNING_RATE,
    eval_metric='logloss',
    random_state=SEED,
)

# CatBoost names its boosting-round count `iterations`; verbose=0 turns off
# its training log.
cat = CatBoostClassifier(
    iterations=N_BOOST_ROUNDS,
    learning_rate=LEARNING_RATE,
    verbose=0,
    random_state=SEED,
)

# Final estimator placed on top of the base learners in the stacking ensemble.
meta_model = LogisticRegression(max_iter=500)

## 🤖 Ensemble Learning with Stacking

In [7]:
# Named base learners whose class probabilities are the meta-model's inputs.
base_learners = [('lgb', lgb), ('xgb', xgb), ('cat', cat)]

ensemble = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    stack_method='predict_proba',
    cv=5,
    n_jobs=-1,
)
ensemble.fit(X_train, y_train)

# Score the hold-out rows; the second probability column (index 1) is used
# as the prediction for the ROC AUC computation.
val_proba = ensemble.predict_proba(X_val)
val_preds = val_proba[:, 1]
roc = roc_auc_score(y_val, val_preds)
print("Validation AUC-ROC:", roc)

## 📉 ROC Curve & Confusion Matrix

In [8]:
# ROC curve of the validation predictions, drawn against the diagonal
# reference line from (0, 0) to (1, 1).
fpr, tpr, _ = roc_curve(y_val, val_preds)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f'AUC = {roc:.4f}')
roc_ax.plot([0, 1], [0, 1], linestyle='--')
roc_ax.set_title('ROC Curve')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
plt.show()

# Turn probabilities into hard labels: strictly above 0.5 becomes 1, else 0.
thresholded_preds = (val_preds > 0.5).astype(int)
cm = confusion_matrix(y_val, thresholded_preds)
cm_display = ConfusionMatrixDisplay(cm, display_labels=["No Rain", "Rain"])
cm_display.plot()
plt.title('Confusion Matrix')
plt.show()

## 📤 Submission

In [9]:
# Predict on the scaled test features and keep the second probability
# column (index 1) as the submitted value.
test_proba = ensemble.predict_proba(X_test_scaled)
final_preds = test_proba[:, 1]

# Write the predictions into the `rainfall` column and save without the index.
submission = submission.assign(rainfall=final_preds)
submission.to_csv('submission.csv', index=False)
submission.head()