In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("weather.csv")

# Select features and label
features = ['Humidity3pm', 'Pressure9am', 'Temp3pm', 'Cloud3pm', 'WindSpeed3pm', 'RainToday']
df = df[features + ['RainTomorrow', 'Rainfall']]

# Drop NaNs
df.dropna(inplace=True)

# Encode categorical variables
le = LabelEncoder()
df['RainToday'] = le.fit_transform(df['RainToday'])
df['RainTomorrow'] = le.fit_transform(df['RainTomorrow'])

# Split features and label
X = df[features]
y = df['RainTomorrow']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split balanced data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# Tune with GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Best model
clf = grid_search.best_estimator_
print("✅ Best Parameters:", grid_search.best_params_)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


✅ Best Parameters: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     12367
           1       0.88      0.89      0.89     12469

    accuracy                           0.89     24836
   macro avg       0.89      0.89      0.89     24836
weighted avg       0.89      0.89      0.89     24836



In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Use only rows where it actually rained (RainTomorrow = 1)
rain_df = df[df['RainTomorrow'] == 1]

X_reg = rain_df[features]
y_reg = rain_df['Rainfall']

# Scale using same scaler as before
X_reg_scaled = scaler.transform(X_reg)

# Train-test split for regression
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg_scaled, y_reg, test_size=0.2, random_state=42
)

# Train regression model
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train_reg, y_train_reg)

# Evaluate
y_pred_reg = reg.predict(X_test_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
r2 = r2_score(y_test_reg, y_pred_reg)
print(f"🌧️ Rainfall RMSE: {rmse:.2f} mm, R² score: {r2:.2f}")


🌧️ Rainfall RMSE: 13.39 mm, R² score: 0.20


In [5]:
import joblib

joblib.dump(clf, 'rain_classifier.pkl')
joblib.dump(reg, 'rain_regressor.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [4]:
import pandas as pd
import joblib

# Load models and scaler
clf = joblib.load('rain_classifier.pkl')
reg = joblib.load('rain_regressor.pkl')
scaler = joblib.load('scaler.pkl')

# Define test cases
test_cases = [
    {
        'name': '🌧️ Test Case 1 — Typical Rainy Day',
        'data': {
            'Humidity3pm': 90, 'Pressure9am': 1007, 'Temp3pm': 22.5,
            'Cloud3pm': 7, 'WindSpeed3pm': 20, 'RainToday': 1
        }
    },
    {
        'name': '☀️ Test Case 2 — Hot, Clear Day',
        'data': {
            'Humidity3pm': 25, 'Pressure9am': 1018, 'Temp3pm': 34.0,
            'Cloud3pm': 1, 'WindSpeed3pm': 5, 'RainToday': 0
        }
    },
    {
        'name': '🌧️ Test Case 3 — Overcast + Low Temp',
        'data': {
            'Humidity3pm': 88, 'Pressure9am': 1003, 'Temp3pm': 19.0,
            'Cloud3pm': 8, 'WindSpeed3pm': 12, 'RainToday': 1
        }
    },
    {
        'name': '🌥️ Test Case 4 — Cloudy But Dry',
        'data': {
            'Humidity3pm': 65, 'Pressure9am': 1012, 'Temp3pm': 25.0,
            'Cloud3pm': 6, 'WindSpeed3pm': 15, 'RainToday': 0
        }
    },
    {
        'name': '🌧️ Test Case 5 — Stormy-Like Pressure Drop',
        'data': {
            'Humidity3pm': 95, 'Pressure9am': 998, 'Temp3pm': 20.0,
            'Cloud3pm': 8, 'WindSpeed3pm': 28, 'RainToday': 1
        }
    },
    {
        'name': '☀️ Test Case 6 — Warm, Low Humidity',
        'data': {
            'Humidity3pm': 30, 'Pressure9am': 1022, 'Temp3pm': 31.0,
            'Cloud3pm': 0, 'WindSpeed3pm': 8, 'RainToday': 0
        }
    }
]

# Run all test cases
for case in test_cases:
    print(f"\n🔎 {case['name']}")
    X_new = pd.DataFrame([case['data']])
    X_new_scaled = scaler.transform(X_new)

    rain_prediction = clf.predict(X_new_scaled)[0]
    rain_prob = clf.predict_proba(X_new_scaled)[0][1]

    if rain_prediction == 1:
        predicted_rainfall = reg.predict(X_new_scaled)[0]
        print(f"🌧️ Rain expected with {rain_prob * 100:.2f}% probability.")
        print(f"🌧️ Estimated rainfall: {predicted_rainfall:.2f} mm")
    else:
        print(f"☀️ No rain expected. Probability of rain: {rain_prob * 100:.2f}%.")



🔎 🌧️ Test Case 1 — Typical Rainy Day
🌧️ Rain expected with 91.00% probability.
🌧️ Estimated rainfall: 15.81 mm

🔎 ☀️ Test Case 2 — Hot, Clear Day
☀️ No rain expected. Probability of rain: 1.00%.

🔎 🌧️ Test Case 3 — Overcast + Low Temp
🌧️ Rain expected with 96.50% probability.
🌧️ Estimated rainfall: 24.68 mm

🔎 🌥️ Test Case 4 — Cloudy But Dry
☀️ No rain expected. Probability of rain: 45.00%.

🔎 🌧️ Test Case 5 — Stormy-Like Pressure Drop
🌧️ Rain expected with 97.00% probability.
🌧️ Estimated rainfall: 42.46 mm

🔎 ☀️ Test Case 6 — Warm, Low Humidity
☀️ No rain expected. Probability of rain: 6.00%.
