In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

In [12]:
# Load the dataset
data = pd.read_csv('Bengaluru2009-2019.csv')

#Displaying first 5 rows
data.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph
0,01-01-2009 00:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,18,11,2,91,0.0,1014,14,10,109,8
1,01-01-2009 01:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,17,9,2,93,0.0,1014,14,7,85,6
2,01-01-2009 02:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,16,7,2,94,0.0,1014,13,5,61,4
3,01-01-2009 03:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,15,5,2,96,0.0,1014,12,2,37,3
4,01-01-2009 04:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,18,5,1,88,0.0,1015,14,5,45,3


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96432 entries, 0 to 96431
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date_time          96432 non-null  object 
 1   maxtempC           96432 non-null  int64  
 2   mintempC           96432 non-null  int64  
 3   totalSnow_cm       96432 non-null  int64  
 4   sunHour            96432 non-null  float64
 5   uvIndex            96432 non-null  int64  
 6   uvIndex.1          96432 non-null  int64  
 7   moon_illumination  96432 non-null  int64  
 8   moonrise           96432 non-null  object 
 9   moonset            96432 non-null  object 
 10  sunrise            96432 non-null  object 
 11  sunset             96432 non-null  object 
 12  DewPointC          96432 non-null  int64  
 13  FeelsLikeC         96432 non-null  int64  
 14  HeatIndexC         96432 non-null  int64  
 15  WindChillC         96432 non-null  int64  
 16  WindGustKmph       964

In [14]:
# Convert date_time to datetime
data['date_time'] = pd.to_datetime(data['date_time'], format='%d-%m-%Y %H:%M')

In [16]:
#display first 5 rows
data.head()

Unnamed: 0,date_time,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,uvIndex.1,moon_illumination,moonrise,moonset,...,WindChillC,WindGustKmph,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph
0,2009-01-01 00:00:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,18,11,2,91,0.0,1014,14,10,109,8
1,2009-01-01 01:00:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,17,9,2,93,0.0,1014,14,7,85,6
2,2009-01-01 02:00:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,16,7,2,94,0.0,1014,13,5,61,4
3,2009-01-01 03:00:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,15,5,2,96,0.0,1014,12,2,37,3
4,2009-01-01 04:00:00,27,12,0,11.6,5,1,31,9:58 AM,10:03 PM,...,18,5,1,88,0.0,1015,14,5,45,3


In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96432 entries, 0 to 96431
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date_time          96432 non-null  datetime64[ns]
 1   maxtempC           96432 non-null  int64         
 2   mintempC           96432 non-null  int64         
 3   totalSnow_cm       96432 non-null  int64         
 4   sunHour            96432 non-null  float64       
 5   uvIndex            96432 non-null  int64         
 6   uvIndex.1          96432 non-null  int64         
 7   moon_illumination  96432 non-null  int64         
 8   moonrise           96432 non-null  object        
 9   moonset            96432 non-null  object        
 10  sunrise            96432 non-null  object        
 11  sunset             96432 non-null  object        
 12  DewPointC          96432 non-null  int64         
 13  FeelsLikeC         96432 non-null  int64         
 14  HeatIn

In [21]:
# Aggregate hourly data to daily data
daily_data = data.resample('D', on='date_time').agg({
    'precipMM': 'sum',
    'maxtempC': 'mean',
    'mintempC': 'mean',
    'humidity': 'mean',
    'pressure': 'mean',
    'tempC': 'mean',
    'visibility': 'mean',
    'windspeedKmph': 'mean',
    'winddirDegree': 'mean',
    'cloudcover': 'mean',
    'WindGustKmph': 'mean',
    'WindChillC': 'mean',
    'totalSnow_cm': 'mean',
    'sunHour': 'mean',
    'uvIndex': 'mean'
}).reset_index()

In [23]:
daily_data.head()

Unnamed: 0,date_time,precipMM,maxtempC,mintempC,humidity,pressure,tempC,visibility,windspeedKmph,winddirDegree,cloudcover,WindGustKmph,WindChillC,totalSnow_cm,sunHour,uvIndex
0,2009-01-01,0.0,27.0,12.0,66.5,1013.875,19.041667,8.583333,6.708333,84.208333,6.958333,11.041667,21.5,0.0,11.6,5.0
1,2009-01-02,0.0,27.0,16.0,69.916667,1014.333333,20.625,7.833333,7.958333,99.291667,36.416667,12.125,21.208333,0.0,11.6,5.0
2,2009-01-03,0.0,25.0,15.0,71.0,1015.0,20.166667,8.083333,8.333333,99.416667,42.0,12.0,20.625,0.0,11.6,5.0
3,2009-01-04,0.0,27.0,15.0,67.291667,1014.25,20.791667,9.0,7.625,83.833333,38.166667,11.541667,21.416667,0.0,11.6,6.0
4,2009-01-05,0.0,28.0,16.0,67.083333,1012.708333,20.666667,8.75,7.5,104.875,25.125,11.416667,21.625,0.0,11.6,5.0


In [27]:
# Create new features
daily_data['year'] = daily_data['date_time'].dt.year
daily_data['month'] = daily_data['date_time'].dt.month
daily_data['day'] = daily_data['date_time'].dt.day
daily_data['is_rainy'] = daily_data['precipMM'] > 0

In [29]:
daily_data.head()

Unnamed: 0,date_time,precipMM,maxtempC,mintempC,humidity,pressure,tempC,visibility,windspeedKmph,winddirDegree,cloudcover,WindGustKmph,WindChillC,totalSnow_cm,sunHour,uvIndex,year,month,day,is_rainy
0,2009-01-01,0.0,27.0,12.0,66.5,1013.875,19.041667,8.583333,6.708333,84.208333,6.958333,11.041667,21.5,0.0,11.6,5.0,2009,1,1,False
1,2009-01-02,0.0,27.0,16.0,69.916667,1014.333333,20.625,7.833333,7.958333,99.291667,36.416667,12.125,21.208333,0.0,11.6,5.0,2009,1,2,False
2,2009-01-03,0.0,25.0,15.0,71.0,1015.0,20.166667,8.083333,8.333333,99.416667,42.0,12.0,20.625,0.0,11.6,5.0,2009,1,3,False
3,2009-01-04,0.0,27.0,15.0,67.291667,1014.25,20.791667,9.0,7.625,83.833333,38.166667,11.541667,21.416667,0.0,11.6,6.0,2009,1,4,False
4,2009-01-05,0.0,28.0,16.0,67.083333,1012.708333,20.666667,8.75,7.5,104.875,25.125,11.416667,21.625,0.0,11.6,5.0,2009,1,5,False


In [33]:
# Save the dataframe to a new CSV file
daily_data.to_csv('daily_data.csv', index=False)

In [78]:
# Aggregate to monthly data for the first model
monthly_data = daily_data.groupby(['year', 'month']).agg({
    'is_rainy': 'sum', # Number of rainy days
    'maxtempC': 'mean',
    'mintempC': 'mean',
    'humidity': 'mean',
    'pressure': 'mean',
    'tempC': 'mean',
    'visibility': 'mean',
    'windspeedKmph': 'mean',
    'winddirDegree': 'mean',
    'cloudcover': 'mean',
    'WindGustKmph': 'mean',
    'WindChillC': 'mean',
    'totalSnow_cm': 'mean',
    'sunHour': 'mean',
    'uvIndex': 'mean'
}).reset_index()

monthly_data = monthly_data.rename(columns={'is_rainy': 'rainy_days'})

In [80]:
monthly_data.head()
monthly_data.to_csv('new_monthly_data.csv', index=False)

In [41]:
# Monthly data for predicting the number of rainy days
X_monthly = monthly_data.drop(columns=['rainy_days', 'year', 'month'])
y_monthly = monthly_data['rainy_days']

In [42]:
# Daily data for predicting precipitation amount and whether it will rain
X_daily = daily_data.drop(columns=['precipMM', 'date_time', 'is_rainy', 'year', 'month', 'day'])
y_daily_precip = daily_data['precipMM']
y_daily_rain = daily_data['is_rainy']

In [44]:
daily_data.head()

Unnamed: 0,date_time,precipMM,maxtempC,mintempC,humidity,pressure,tempC,visibility,windspeedKmph,winddirDegree,cloudcover,WindGustKmph,WindChillC,totalSnow_cm,sunHour,uvIndex,year,month,day,is_rainy
0,2009-01-01,0.0,27.0,12.0,66.5,1013.875,19.041667,8.583333,6.708333,84.208333,6.958333,11.041667,21.5,0.0,11.6,5.0,2009,1,1,False
1,2009-01-02,0.0,27.0,16.0,69.916667,1014.333333,20.625,7.833333,7.958333,99.291667,36.416667,12.125,21.208333,0.0,11.6,5.0,2009,1,2,False
2,2009-01-03,0.0,25.0,15.0,71.0,1015.0,20.166667,8.083333,8.333333,99.416667,42.0,12.0,20.625,0.0,11.6,5.0,2009,1,3,False
3,2009-01-04,0.0,27.0,15.0,67.291667,1014.25,20.791667,9.0,7.625,83.833333,38.166667,11.541667,21.416667,0.0,11.6,6.0,2009,1,4,False
4,2009-01-05,0.0,28.0,16.0,67.083333,1012.708333,20.666667,8.75,7.5,104.875,25.125,11.416667,21.625,0.0,11.6,5.0,2009,1,5,False


In [47]:
# Split the monthly data
X_train_monthly, X_test_monthly, y_train_monthly, y_test_monthly = train_test_split(X_monthly, y_monthly, test_size=0.2, random_state=42)

In [49]:
# Split the daily data
X_train_daily_precip, X_test_daily_precip, y_train_daily_precip, y_test_daily_precip = train_test_split(X_daily, y_daily_precip, test_size=0.2, random_state=42)
X_train_daily_rain, X_test_daily_rain, y_train_daily_rain, y_test_daily_rain = train_test_split(X_daily, y_daily_rain, test_size=0.2, random_state=42)

In [51]:
# Train the model for monthly rainy days prediction
rf_monthly = RandomForestRegressor(n_estimators=100, random_state=42)
rf_monthly.fit(X_train_monthly, y_train_monthly)

In [53]:
# Train the model for daily precipitation prediction
rf_daily_precip = RandomForestRegressor(n_estimators=100, random_state=42)
rf_daily_precip.fit(X_train_daily_precip, y_train_daily_precip)

In [55]:
# Train the model for daily rain prediction
rf_daily_rain = RandomForestClassifier(n_estimators=100, random_state=42)
rf_daily_rain.fit(X_train_daily_rain, y_train_daily_rain)

In [56]:
# Predict on the test set
y_pred_monthly = rf_monthly.predict(X_test_monthly)
y_pred_daily_precip = rf_daily_precip.predict(X_test_daily_precip)
y_pred_daily_rain = rf_daily_rain.predict(X_test_daily_rain)

In [59]:
# Evaluate monthly model
mse_monthly = mean_squared_error(y_test_monthly, y_pred_monthly)
r2_monthly = r2_score(y_test_monthly, y_pred_monthly)

In [60]:
# Evaluate daily precipitation model
mse_daily_precip = mean_squared_error(y_test_daily_precip, y_pred_daily_precip)
r2_daily_precip = r2_score(y_test_daily_precip, y_pred_daily_precip)


In [62]:
# Evaluate daily rain model
accuracy_daily_rain = accuracy_score(y_test_daily_rain, y_pred_daily_rain)
precision_daily_rain = precision_score(y_test_daily_rain, y_pred_daily_rain)
recall_daily_rain = recall_score(y_test_daily_rain, y_pred_daily_rain)
f1_daily_rain = f1_score(y_test_daily_rain, y_pred_daily_rain)

In [109]:
# Display evaluation metrics
print("MSE_Monthly",mse_monthly)
print("R2_Monthly: ", r2_monthly,"\n") 
print("MSE_Daily: ", mse_daily_precip)
print("R2_Daily:",r2_daily_precip,"\n")
print("Accuracy: " , accuracy_daily_rain)
print("Precision: " , precision_daily_rain)
print("Recall " , recall_daily_rain)
print("F1_Score" , f1_daily_rain)

MSE_Monthly 10.94044814814815
R2_Monthly:  0.8088581052581124 

MSE_Daily:  9.154550735074626
R2_Daily: 0.5337447515055298 

Accuracy:  0.8980099502487562
Precision:  0.8793969849246231
Recall  0.9114583333333334
F1_Score 0.8951406649616368


In [96]:
# # Predict the number of rainy days for a given month
# # Example: March 2010 (3rd month of 2010)
# month_input = monthly_data[(monthly_data['year'] == 2010) & (monthly_data['month'] == 3)].drop(columns=['rainy_days', 'year', 'month'])
# rainy_days_pred = rf_monthly.predict(month_input)

In [103]:
# # Predict the precipitation amount and whether it will rain for a given date
# # Example: January 1, 2010
# date_input = daily_data[(daily_data['year'] == 2010) & (daily_data['month'] == 1) & (daily_data['day'] == 1)].drop(columns=['precipMM', 'date_time', 'is_rainy', 'year', 'month', 'day'])
# precip_amount_pred = rf_daily_precip.predict(date_input)
# will_rain_pred = rf_daily_rain.predict(date_input)

# rainy_days_pred[0], precip_amount_pred[0], will_rain_pred[0]

(4.4, 0.7480000000000002, True)

In [106]:
# # Ensure the date_input contains the correct features and a single row for prediction
# date_input = daily_data[(daily_data['year'] == 2010) & (daily_data['month'] == 1) & (daily_data['day'] == 1)].drop(columns=['precipMM', 'date_time', 'is_rainy', 'year', 'month', 'day'])

# # Check if date_input is not empty
# if not date_input.empty:
#     precip_amount_pred = rf_daily_precip.predict(date_input)
#     will_rain_pred = rf_daily_rain.predict(date_input)
#     rain_prob_pred = rf_daily_rain.predict_proba(date_input)[:, 1]  # Probability of raining

#     print(f"Predicted Precipitation Amount: {precip_amount_pred[0]:.2f} mm")
#     print(f"Probability of Rain: {rain_prob_pred[0]:.4f}")
#     print(f"Will it Rain?: {bool(will_rain_pred[0])}")
#     print("Number of rainy days in month:",rainy_days_pred[0])
# else:
#     print("No data available for the specified date")


Predicted Precipitation Amount: 0.75 mm
Probability of Rain: 0.9700
Will it Rain?: True
Number of rainy days in month: 4.4


In [107]:
# # Check if date_input is not empty
# if not date_input.empty:
#     precip_amount_pred = rf_daily_precip.predict(date_input)
#     will_rain_pred = rf_daily_rain.predict(date_input)

#     precip_amount_pred[0], will_rain_pred[0]
# else:
#     print("No data available for the specified date")

In [124]:
# Function to predict for a specific date
def predict_for_date(year, month, day):
    avg_features = daily_data.drop(columns=['precipMM', 'date_time', 'is_rainy', 'year', 'month', 'day']).mean()
    date_input = pd.DataFrame([avg_features])
    date_input['year'] = year
    date_input['month'] = month
    date_input['day'] = day

    date_input = date_input.drop(columns=['year', 'month', 'day'])

    precip_amount_pred = rf_daily_precip.predict(date_input)
    will_rain_pred = rf_daily_rain.predict(date_input)
    rain_prob_pred = rf_daily_rain.predict_proba(date_input)[:, 1]  # Probability of raining

    return precip_amount_pred[0], rain_prob_pred[0], bool(will_rain_pred[0])

In [125]:
# Function to predict the number of rainy days in a month
def predict_rainy_days_in_month(year, month):
    avg_features = daily_data.drop(columns=['precipMM', 'date_time', 'is_rainy', 'year', 'month', 'day']).mean()
    month_input = pd.DataFrame([avg_features])
    month_input['year'] = year
    month_input['month'] = month

    month_input = month_input.drop(columns=['year', 'month'])

    rainy_days_pred = rf_monthly.predict(month_input)
    return rainy_days_pred[0]

In [126]:
# Predicting for today (e.g., June 21, 2024)
today_precip, today_rain_prob, will_rain_today = predict_for_date(2024, 6, 21)
print(f"Predicted precipitation amount: {today_precip:.2f} mm")
print(f"Probability of rain today: {today_rain_prob:.4f}")
print(f"Will it rain today? {'Yes' if will_rain_today else 'No'}")

Predicted precipitation amount: 1.91 mm
Probability of rain today: 0.9200
Will it rain today? Yes


In [127]:
# Predicting the number of rainy days in a month in 2025 (e.g., March 2025)
rainy_days_march_2025 = predict_rainy_days_in_month(2025, 3)
print(f"Predicted number of rainy days in March 2025: {rainy_days_march_2025:.0f}")

Predicted number of rainy days in March 2025: 20
