In [57]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pickle

# 1. Load and clean the dataset
df = pd.read_csv("weather_hourly.csv")

# 2. Create binary label for rain
df['Rain'] = df['Precip Type'].apply(lambda x: 1 if str(x).lower() == 'rain' else 0)

# 3. Drop unnecessary columns
df.drop(columns=['Formatted Date', 'Summary', 'Precip Type', 'Daily Summary', 'Loud Cover', 'Wind Bearing (degrees)'], inplace=True)

# 4. Drop missing values (optional depending on dataset)
df.dropna(inplace=True)

# 5. Define features and label
features = ['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 
            'Wind Speed (km/h)', 'Pressure (millibars)', 'Visibility (km)']
label = 'Rain'

print("📊 Original class counts:\n", df[label].value_counts())

# 6. Balance the dataset using undersampling
rain_df = df[df[label] == 1].sample(n=10712, random_state=42)
norain_df = df[df[label] == 0]  # already 10712

balanced_df = pd.concat([rain_df, norain_df]).sample(frac=1, random_state=42)  # shuffle

print("\n✅ Balanced class counts:\n", balanced_df[label].value_counts())

# 7. Prepare training data
X = balanced_df[features]
y = balanced_df[label]

# 8. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 10. Evaluate model
y_pred = model.predict(X_test)
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 11. Save model
with open("model/rain_classifier.pkl", "wb") as f:
    pickle.dump(model, f)
print("\n✅ Model trained and saved to model/rain_classifier.pkl")

# 12. Test predictions
test_cases = [
    {'Temperature (C)': 30, 'Apparent Temperature (C)': 32, 'Humidity': 0.85, 'Wind Speed (km/h)': 10, 'Pressure (millibars)': 1012, 'Visibility (km)': 8},
    {'Temperature (C)': 24, 'Apparent Temperature (C)': 24, 'Humidity': 0.4, 'Wind Speed (km/h)': 5, 'Pressure (millibars)': 1020, 'Visibility (km)': 15},
    {'Temperature (C)': 19, 'Apparent Temperature (C)': 18, 'Humidity': 0.9, 'Wind Speed (km/h)': 15, 'Pressure (millibars)': 1005, 'Visibility (km)': 5},
    {'Temperature (C)': 10, 'Apparent Temperature (C)': 9, 'Humidity': 0.3, 'Wind Speed (km/h)': 5, 'Pressure (millibars)': 1025, 'Visibility (km)': 16},
    {'Temperature (C)': 22, 'Apparent Temperature (C)': 22, 'Humidity': 0.75, 'Wind Speed (km/h)': 12, 'Pressure (millibars)': 1008, 'Visibility (km)': 7},
    {'Temperature (C)': 28, 'Apparent Temperature (C)': 29, 'Humidity': 0.2, 'Wind Speed (km/h)': 2, 'Pressure (millibars)': 1030, 'Visibility (km)': 18},
    {'Temperature (C)': 15, 'Apparent Temperature (C)': 13, 'Humidity': 0.95, 'Wind Speed (km/h)': 5, 'Pressure (millibars)': 1003, 'Visibility (km)': 6},
    {'Temperature (C)': 33, 'Apparent Temperature (C)': 35, 'Humidity': 0.5, 'Wind Speed (km/h)': 20, 'Pressure (millibars)': 1010, 'Visibility (km)': 10},
    {'Temperature (C)': 17, 'Apparent Temperature (C)': 16, 'Humidity': 0.8, 'Wind Speed (km/h)': 8, 'Pressure (millibars)': 1015, 'Visibility (km)': 9},
    {'Temperature (C)': 25, 'Apparent Temperature (C)': 25, 'Humidity': 0.6, 'Wind Speed (km/h)': 10, 'Pressure (millibars)': 1022, 'Visibility (km)': 12},
    {'Temperature (C)': 36, 'Apparent Temperature (C)': 37, 'Humidity': 0.1, 'Wind Speed (km/h)': 2, 'Pressure (millibars)': 1040, 'Visibility (km)': 20},
    {'Temperature (C)': 38, 'Apparent Temperature (C)': 39, 'Humidity': 0.15, 'Wind Speed (km/h)': 6, 'Pressure (millibars)': 1035, 'Visibility (km)': 22},
    {'Temperature (C)': 21, 'Apparent Temperature (C)': 20, 'Humidity': 0.3, 'Wind Speed (km/h)': 8, 'Pressure (millibars)': 1025, 'Visibility (km)': 19},
]

print("\n🌦 Prediction Results:")
for i, case in enumerate(test_cases, 1):
    prediction = model.predict(pd.DataFrame([case]))[0]
    status = "🌧 Rain Likely" if prediction == 1 else "☀️ No Rain"
    print(f"Test Case {i}: {case} → Prediction: {status}")


📊 Original class counts:
 Rain
1    85224
0    11229
Name: count, dtype: int64

✅ Balanced class counts:
 Rain
0    11229
1    10712
Name: count, dtype: int64

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      2207
           1       0.98      1.00      0.99      2182

    accuracy                           0.99      4389
   macro avg       0.99      0.99      0.99      4389
weighted avg       0.99      0.99      0.99      4389


🧩 Confusion Matrix:
 [[2153   54]
 [   3 2179]]

✅ Model trained and saved to model/rain_classifier.pkl

🌦 Prediction Results:
Test Case 1: {'Temperature (C)': 30, 'Apparent Temperature (C)': 32, 'Humidity': 0.85, 'Wind Speed (km/h)': 10, 'Pressure (millibars)': 1012, 'Visibility (km)': 8} → Prediction: 🌧 Rain Likely
Test Case 2: {'Temperature (C)': 24, 'Apparent Temperature (C)': 24, 'Humidity': 0.4, 'Wind Speed (km/h)': 5, 'Pressure (millibars)': 1020, 'Visibility (km)': 15} → Pr

In [58]:
# Sample edge cases
test_cases = [
    {'Temperature (C)': 40, 'Apparent Temperature (C)': 42, 'Humidity': 0.1, 'Wind Speed (km/h)': 3, 'Pressure (millibars)': 1040, 'Visibility (km)': 20},  # Likely No Rain
    {'Temperature (C)': 18, 'Apparent Temperature (C)': 18, 'Humidity': 0.9, 'Wind Speed (km/h)': 10, 'Pressure (millibars)': 1002, 'Visibility (km)': 4},  # Likely Rain
]

for i, case in enumerate(test_cases, 1):
    pred = model.predict(pd.DataFrame([case]))[0]
    label = "🌧 Rain Likely" if pred == 1 else "☀️ No Rain Likely"
    print(f"Test Case {i}: {label}")


Test Case 1: 🌧 Rain Likely
Test Case 2: 🌧 Rain Likely
