In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb 
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from xgboost import XGBRegressor
from sklearn.cluster import KMeans
from sklearn.model_selection import TimeSeriesSplit
from sklearn.cluster import KMeans

In [None]:
# Load the rainfall CSV into `df`; the path is relative to the notebook's
# working directory, so the notebook must be run from its own folder.
df = pd.read_csv("../DataSets/rainfaLLIndia.csv")

In [None]:
# Preview only the first rows instead of rendering the whole frame, which
# keeps the saved notebook output small and readable.
df.head()

In [None]:
# Seasonal feature: row-wise mean of the four monsoon months.
monsoon_months = ['JUN', 'JUL', 'AUG', 'SEP']
df['JUN-SEP'] = df[monsoon_months].mean(axis=1)

# Per-subdivision change from, and value of, the previous row.
# NOTE(review): diff()/shift() work on row order, so "previous row" only means
# "previous year" if rows within each subdivision are chronological - TODO confirm.
season_by_subdivision = df.groupby('subdivision')['JUN-SEP']
df['YoY_CHANGE'] = season_by_subdivision.diff()
df['LAG1'] = season_by_subdivision.shift(1)


In [None]:
# Show the dtype of every column (including the engineered ones added above).
df.dtypes

In [None]:
# Encoder used below to turn the `subdivision` column into integer codes.
label=LabelEncoder()

In [None]:
# Overwrite `subdivision` with integer codes produced by the label encoder.
subdivision_codes = label.fit_transform(df['subdivision'])
df['subdivision'] = subdivision_codes.astype('int64')

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Mean of the change column (NaNs are skipped by default).
df['YoY_CHANGE'].mean()

In [None]:
# Fill missing YoY_CHANGE values with the column mean computed from the data,
# rather than a hard-coded literal that goes stale whenever the dataset changes.
# NOTE(review): the previous literal (211) was presumably copied from the mean
# displayed by the preceding cell - confirm the two agree on the current data.
yoy_fill_value = df['YoY_CHANGE'].mean()
df['YoY_CHANGE'] = df['YoY_CHANGE'].fillna(yoy_fill_value)

In [None]:
# Mean of the lag column (NaNs are skipped by default).
df['LAG1'].mean()

In [None]:
# Fill missing LAG1 values with the column mean computed from the data,
# rather than a hard-coded literal that goes stale whenever the dataset changes.
# NOTE(review): the previous literal (264) was presumably copied from the mean
# displayed by the preceding cell - confirm the two agree on the current data.
lag1_fill_value = df['LAG1'].mean()
df['LAG1'] = df['LAG1'].fillna(lag1_fill_value)

# Phase 2: EDA

In [None]:
# Line chart of the across-subdivision mean of JUN-SEP for each year.
yearly_mean = df.groupby('YEAR', as_index=False)['JUN-SEP'].mean()
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=yearly_mean, x='YEAR', y='JUN-SEP', ax=ax)
ax.set_title("Average Monsoon Rainfall in India Over Years")
plt.show()


In [None]:
# Histogram (30 bins) with a KDE overlay of the JUN-SEP column.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['JUN-SEP'], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Monsoon Rainfall")
plt.show()

In [None]:
# One box per monsoon month, pooled over all rows of `df`.
month_columns = ['JUN', 'JUL', 'AUG', 'SEP']
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df[month_columns], ax=ax)
ax.set_title("Monthly Rainfall Distribution (All Subdivisions)")
plt.show()


In [None]:
# Pairwise correlation between the four monsoon months, drawn as an
# annotated heatmap.
monsoon_df = df.loc[:, ['JUN', 'JUL', 'AUG', 'SEP']]
corr = monsoon_df.corr()
heatmap_ax = sns.heatmap(data=corr, cmap='coolwarm', annot=True)
heatmap_ax.set_title("Correlation Matrix (Monsoon Months)")
plt.show()


# Phase 3: Machine Learning - Rainfall Prediction

In [None]:
# Predict the current season (JUN-SEP) from information available beforehand.
#
# The previous setup used LAG1 as the target and every other column as a
# feature. Because YoY_CHANGE is JUN-SEP minus LAG1 by construction, the target
# could be recovered exactly from two features (JUN-SEP - YoY_CHANGE), so every
# model score was a leakage artefact rather than a forecast.
#
# Only columns that do not depend on the current season are kept as features.
# NOTE(review): extend `feature_cols` with any other columns known before the
# season starts - the full schema of the CSV is not visible here.
feature_cols = ['subdivision', 'YEAR', 'LAG1']
X = df[feature_cols].values
y = df['JUN-SEP'].values


In [None]:
# Walk-forward cross-validation. Each fold is scored with a scaled linear
# baseline so that `results` really holds one metrics row per fold; previously
# the loop only overwrote the split arrays and `results` stayed empty.
# NOTE(review): TimeSeriesSplit splits by row position, so this is a genuine
# chronological split only if the rows of X are ordered by time - TODO confirm.
tscv = TimeSeriesSplit(n_splits=5, test_size=30)
results = []
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    xtrain, xtest = X[train_idx], X[test_idx]
    ytrain, ytest = y[train_idx], y[test_idx]

    # Fit the scaler on the training window only, so the held-out rows do not
    # influence the scaling statistics.
    fold_scaler = StandardScaler()
    fold_model = LinearRegression()
    fold_model.fit(fold_scaler.fit_transform(xtrain), ytrain)
    fold_pred = fold_model.predict(fold_scaler.transform(xtest))

    results.append({
        'fold': fold + 1,
        'rmse': np.sqrt(mean_squared_error(ytest, fold_pred)),
        'mae': mean_absolute_error(ytest, fold_pred),
        'r2': r2_score(ytest, fold_pred),
    })

# After the loop, xtrain/xtest/ytrain/ytest and `fold` hold the final fold;
# the cells below reuse them.
pd.DataFrame(results)

In [None]:
# Scaler that is fitted on the training split and reused for the test split below.
scaler = StandardScaler()    

In [None]:
# Sanity check: shapes of the training features and target from the last fold.
xtrain.shape,ytrain.shape

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler.fit(xtrain)
x_train = scaler.transform(xtrain)
x_test = scaler.transform(xtest)

In [None]:
# Baseline: ordinary least squares fitted on the scaled training split.
model = LinearRegression()
model.fit(x_train,ytrain)
    

In [None]:
# Linear-model predictions for the scaled test split.
ypred = model.predict(x_test)

In [None]:
# Error metrics of the current `ypred` against the test targets.
mse = mean_squared_error(ytest, ypred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(ytest, ypred)
r2 = r2_score(ytest, ypred)

In [None]:
# Display (RMSE, MAE, R^2) as computed in the previous cell.
rmse,mae,r2

# Random Forest

In [None]:
# Random forest fitted on the scaled training split.
# A fixed seed makes the scores reproducible across Restart & Run All; the
# value matches the seed used by the other random forest in this notebook.
# The stray `RandomForestRegressor()` line (pasted cell output that built and
# discarded a second estimator) has been removed.
model_Forest = RandomForestRegressor(random_state=42)
model_Forest.fit(x_train, ytrain)

# Testing Accuracy

In [None]:
# R^2 of the random forest on the test split, shown as a percentage.
# Note: this reassigns the notebook-level names `ypred` and `r2`.
ypred = model_Forest.predict(x_test)
r2 = r2_score(y_true=ytest, y_pred=ypred)
r2 * 100

# Training Accuracy

In [None]:
# R^2 of the random forest on the training split, shown as a percentage.
# Note: this reassigns `ypred` and `r2`, so from here on they hold
# training-split values, not test-split values.
ypred = model_Forest.predict(x_train)
r2 = r2_score(y_true=ytrain, y_pred=ypred)
r2 * 100

# XGBoost

In [None]:
# Gradient-boosted regressor with the library's default hyperparameters.
modelx=XGBRegressor()

In [None]:
# Fit XGBoost on the same scaled training split used by the other models.
modelx.fit(x_train,ytrain)

# Testing Accuracy

In [None]:
# R^2 of XGBoost on the test split, shown as a percentage.
# Fix: the cell used to display `r2` (the random-forest score left over from
# an earlier cell) instead of the XGBoost score computed here.
ypred_xg = modelx.predict(x_test)
r2_xg = r2_score(ytest, ypred_xg)
r2_xg * 100

# Training Accuracy

In [None]:
# R^2 of XGBoost on the training split, shown as a percentage.
# Fixes three copy-paste slips: the predictions came from `model_Forest`
# instead of `modelx`, the score was computed on `ypred` instead of
# `ypred_xg`, and the displayed value was `r2` instead of `r2_xg`.
ypred_xg = modelx.predict(x_train)
r2_xg = r2_score(ytrain, ypred_xg)
r2_xg * 100

#  Plot Actual vs Predicted


In [None]:
# Actual vs. predicted values for the last cross-validation fold, using the
# linear regression model.
#
# Predictions and metrics are recomputed here instead of reusing the
# notebook-level `ypred`, `rmse` and `r2`: by this point those names have been
# overwritten by later cells (training-split predictions of another model), so
# the plotted line did not line up with `ytest` and the title mixed metrics
# from different models.
ypred_plot = model.predict(x_test)
rmse_plot = np.sqrt(mean_squared_error(ytest, ypred_plot))
r2_plot = r2_score(ytest, ypred_plot)

# A single Axes replaces `plt.subplot(3, 2, fold + 2)`, which drew one small
# panel in the corner of an otherwise empty 3x2 grid.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(ytest, label='Actual', marker='o')
ax.plot(ypred_plot, label='Predicted', marker='x')
ax.set_title(f'Fold {fold+1} — RMSE: {rmse_plot:.2f}, R²: {r2_plot:.2f}')
ax.set_xlabel("Sample Index")
ax.set_ylabel("Rainfall (JUN-SEP)")
ax.legend()

fig.tight_layout()
fig.suptitle("Linear Regression: Predictions vs Actuals (Last Fold)", fontsize=16, y=1.02)
plt.show()

# Phase 4: Advanced Analysis


#  Data for Clustering

In [None]:
# One row per subdivision with its mean JUN-SEP value; that single column is
# the feature matrix for clustering.
subdiv_avg = df.groupby('subdivision', as_index=False)['JUN-SEP'].mean()
X_clust = subdiv_avg.loc[:, ['JUN-SEP']]


# K-Means

In [None]:
# Partition the subdivisions into four groups by mean JUN-SEP value and store
# each subdivision's cluster label next to it.
kmeans = KMeans(random_state=42, n_clusters=4)
kmeans.fit(X_clust)
subdiv_avg['Cluster'] = kmeans.labels_


#  Visualize the Clusters

In [None]:
# Bar chart of mean JUN-SEP per subdivision, ordered by rainfall and coloured
# by cluster.
subdiv_avg_sorted = subdiv_avg.sort_values(by='JUN-SEP')

# `subdivision` was replaced by integer codes earlier in the notebook. With a
# numeric x variable seaborn orders the bars by code, so the sort above had no
# visible effect and the tick labels were meaningless numbers. Decode the
# names with the fitted encoder and pass the bar order explicitly.
subdiv_plot = subdiv_avg_sorted.assign(
    subdivision=label.inverse_transform(subdiv_avg_sorted['subdivision'])
)

plt.figure(figsize=(14, 6))
sns.barplot(
    x='subdivision',
    y='JUN-SEP',
    hue='Cluster',
    data=subdiv_plot,
    order=list(subdiv_plot['subdivision']),
    dodge=False,
    palette='Set2',
)
plt.xticks(rotation=90)
plt.title("Subdivision Clusters Based on Average Monsoon Rainfall")
plt.ylabel("Avg JUN-SEP Rainfall (mm)")
plt.tight_layout()
plt.show()


# Plot with Linear Regression Trend

In [None]:
# Yearly mean of JUN-SEP with a fitted regression line (seaborn lmplot).
df_avg_year = df.groupby('YEAR', as_index=False)['JUN-SEP'].mean()
trend_grid = sns.lmplot(data=df_avg_year, x='YEAR', y='JUN-SEP', height=6, aspect=2)

# lmplot is figure-level; with no faceting its single Axes is exposed as `.ax`.
trend_ax = trend_grid.ax
trend_ax.set_title("🇮🇳 All-India Average Monsoon Rainfall Trend")
trend_ax.set_ylabel("Avg Rainfall (JUN-SEP)")
trend_ax.set_xlabel("Year")
trend_ax.grid(True)
plt.show()


# Plot with Random Forest

In [None]:
# Fit a random forest on YEAR -> yearly mean JUN-SEP and overlay its output on
# the observed series. The model is evaluated on the same rows it was fitted
# on, so the "prediction" curve is an in-sample fit, not a forecast.
df_rf = df_avg_year.copy()
X_rf = df_rf[['YEAR']].to_numpy()
y_rf = df_rf['JUN-SEP'].to_numpy()

rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_rf, y_rf)
df_rf['Predicted'] = rf.predict(X_rf)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_rf['YEAR'], df_rf['JUN-SEP'], label='Actual', marker='o')
ax.plot(df_rf['YEAR'], df_rf['Predicted'], label='Random Forest Prediction', marker='x')
ax.set_title("🇮🇳 All-India Avg Monsoon Rainfall Trend (Random Forest)")
ax.set_xlabel("Year")
ax.set_ylabel("Avg Rainfall (JUN-SEP)")
ax.grid(True)
ax.legend()
plt.show()
