In [35]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import linear_model

In [36]:
# Load the rainfall dataset; the literal string 'NA' is parsed as a missing value.
df = pd.read_csv(
    "rainfall in india 1901-2015.csv",
    na_values=["NA"],
)

In [37]:
# Discard every row that has a missing value in any column (df is modified in place).
df.dropna(axis=0, how='any', inplace=True)

In [38]:
# Per-column count of missing values (isna performs the same check as isnull).
df.isna().sum()

SUBDIVISION    0
YEAR           0
JAN            0
FEB            0
MAR            0
APR            0
MAY            0
JUN            0
JUL            0
AUG            0
SEP            0
OCT            0
NOV            0
DEC            0
ANNUAL         0
Jan-Feb        0
Mar-May        0
Jun-Sep        0
Oct-Dec        0
dtype: int64

In [39]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

0

In [40]:
# (row count, column count) of the cleaned frame.
(len(df.index), len(df.columns))

(4090, 19)

In [41]:
# How many distinct (non-missing) subdivisions the data covers.
len(df['SUBDIVISION'].dropna().unique())

36

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [43]:
# Model inputs: region label, year, the twelve monthly totals and the four seasonal totals.
# NOTE(review): in the example rows predicted later in this notebook the monthly
# values add up to the predicted ANNUAL value, so the target looks recoverable
# exactly from these features -- confirm that this is intended.
feature_columns = (
    ['SUBDIVISION', 'YEAR']
    + ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN']
    + ['JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
    + ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
)
# Column the models are trained to predict.
target_column = 'ANNUAL'

In [44]:
# Pull the target series and the feature matrix out of the cleaned frame
y = df.loc[:, target_column]
x = df.loc[:, feature_columns]

In [45]:
# Column groups for preprocessing: numeric columns are scaled, the categorical one is encoded
categorical_features = ['SUBDIVISION']
numeric_features = (
    ['YEAR']
    + ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN']
    + ['JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
    + ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
)

In [46]:
# Numeric columns: standardise with StandardScaler
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical column: one-hot encode; handle_unknown='ignore' means a category
# not seen during fit does not raise at transform time
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

## Linear Regression

In [47]:
from sklearn.linear_model import LinearRegression
# Pipeline: shared preprocessing followed by a LinearRegression estimator
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [48]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Fit the preprocessing steps and the regressor on the training rows
model.fit(X=x_train, y=y_train)

In [50]:
# Predict the target for the held-out test rows
y_pred = model.predict(X=x_test)

In [51]:
# Score the Linear Regression predictions on the test set: MSE, MAE and R^2
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [52]:
# Report the three test-set metrics, one per line
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 0.003104792653703146
Mean Absolute Error: 0.04300698447921881
R^2 Score: 0.9999999964130872


In [53]:
# Same metrics on the training rows, for comparison with the test-set scores
y_train_pred = model.predict(X=x_train)
mse, mae, r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 0.0032623619432621716
Mean Absolute Error: 0.043365715411941896
R^2 Score: 0.9999999959437585


In [54]:
# Single-row example input: Andaman & Nicobar Islands, 1901
new_data = pd.DataFrame([{
    'SUBDIVISION': 'ANDAMAN & NICOBAR ISLANDS',
    'YEAR': 1901,
    'JAN': 49.2,
    'FEB': 87.1,
    'MAR': 29.2,
    'APR': 2.3,
    'MAY': 528.8,
    'JUN': 517.5,
    'JUL': 365.1,
    'AUG': 481.1,
    'SEP': 332.6,
    'OCT': 388.5,
    'NOV': 558.2,
    'DEC': 33.6,
    'Jan-Feb': 136.3,
    'Mar-May': 560.3,
    'Jun-Sep': 1696.3,
    'Oct-Dec': 980.3,
}])

# Run the fitted pipeline on the example row
predicted_annual_rainfall = model.predict(new_data)

# Show the single predicted value
print(f'Predicted Annual Rainfall: {predicted_annual_rainfall[0]}')

Predicted Annual Rainfall: 3373.194042014079


In [55]:
# Single-row example input: Lakshadweep, 2015
new_data = pd.DataFrame([{
    'SUBDIVISION': 'LAKSHADWEEP',
    'YEAR': 2015,
    'JAN': 2.2,
    'FEB': 0.5,
    'MAR': 3.7,
    'APR': 87.1,
    'MAY': 133.1,
    'JUN': 296.6,
    'JUL': 257.5,
    'AUG': 146.4,
    'SEP': 160.4,
    'OCT': 165.4,
    'NOV': 231,
    'DEC': 159,
    'Jan-Feb': 2.7,
    'Mar-May': 223.9,
    'Jun-Sep': 860.9,
    'Oct-Dec': 555.4,
}])

# Run the fitted pipeline on the example row
predicted_annual_rainfall = model.predict(new_data)

# Show the single predicted value
print(f'Predicted Annual Rainfall: {predicted_annual_rainfall[0]}')

Predicted Annual Rainfall: 1642.8948855449205


## Decision Tree Regression

In [56]:
from sklearn.tree import DecisionTreeRegressor
# Define the model pipeline: shared preprocessing followed by an unpruned
# (max_depth=None) regression tree that splits on squared error.
# random_state is fixed because the tree shuffles the candidate features at
# each split; without a seed the fitted tree, and therefore the reported
# metrics, can differ from one run to the next.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(criterion="squared_error", max_depth=None, random_state=42))
])

In [57]:
# Re-create the 80/20 train/test partition (same inputs, test_size and seed as the earlier split)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [58]:
# Fit the preprocessing steps and the decision tree on the training rows
model.fit(X=x_train, y=y_train)

In [59]:
# Predict the target for the held-out test rows
y_pred = model.predict(X=x_test)

In [60]:
# Test-set metrics for the decision tree: MSE, MAE and R^2
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 14334.517334963328
Mean Absolute Error: 73.75330073349633
R^2 Score: 0.9834395824730574


In [61]:
# Same metrics on the training rows, for comparison with the test-set scores
y_train_pred = model.predict(X=x_train)
mse, mae, r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 0.0
Mean Absolute Error: 0.0
R^2 Score: 1.0


## Random Forest Regression

In [62]:
from sklearn.ensemble import RandomForestRegressor
# Pipeline: shared preprocessing followed by a random forest seeded with random_state=50
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=50)),
])

In [63]:
# Re-create the 80/20 train/test partition (same inputs, test_size and seed as the earlier splits)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [64]:
# Fit the preprocessing steps and the random forest on the training rows
model.fit(X=x_train, y=y_train)

In [65]:
# Predict the target for the held-out test rows
y_pred = model.predict(X=x_test)

In [66]:
# Test-set metrics for the random forest: MSE, MAE and R^2
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 7435.897360809284
Mean Absolute Error: 41.512633251833705
R^2 Score: 0.9914094376458608


In [67]:
# Same metrics on the training rows, for comparison with the test-set scores
y_train_pred = model.predict(X=x_train)
mse, mae, r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 731.0251971470009
Mean Absolute Error: 15.005184290953528
R^2 Score: 0.9990910834734947


## XGBoost

In [68]:
from xgboost import XGBRegressor
# Pipeline: shared preprocessing followed by an XGBRegressor with its default settings
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
])

In [69]:
# Re-create the 80/20 train/test partition (same inputs, test_size and seed as the earlier splits)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [70]:
# Fit the preprocessing steps and the XGBoost regressor on the training rows
model.fit(X=x_train, y=y_train)

In [71]:
# Predict the target for the held-out test rows
y_pred = model.predict(X=x_test)

In [72]:
# Test-set metrics for XGBoost: MSE, MAE and R^2
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 6279.755385872519
Mean Absolute Error: 40.36816693145081
R^2 Score: 0.9927451082776634


In [73]:
# Same metrics on the training rows, for comparison with the test-set scores
y_train_pred = model.predict(X=x_train)
mse, mae, r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 57.24661370553063
Mean Absolute Error: 5.599610485366037
R^2 Score: 0.9999288227088663


# Regression Model Evaluation Summary (R², MSE, MAE)

## 1. Linear Regression
Test R²: 0.9999999964

Test MSE: 0.0031

Test MAE: 0.0430

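Linear Regression achieved near-perfect performance on both training and test data. The extremely low error values and R² scores near 1.0 reflect an essentially exact linear relationship between the features and the target: `ANNUAL` is (up to rounding) the sum of the twelve monthly rainfall columns, all of which are included as features, so the model only has to recover that sum. Training and test errors are nearly identical (MSE 0.0033 vs. 0.0031), so there are no signs of overfitting.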

## 2. XGBoost Regressor
Test R²: 0.9927

Test MSE: 6279.76

Test MAE: 40.37

XGBoost showed the best balance between low error metrics and generalization among the tree-based models, with the lowest test MSE and MAE of the three. Its training error (MSE 57.25, MAE 5.60) is nevertheless far below its test error (MSE 6279.76, MAE 40.37), so some overfitting is present even though the test R² remains high.

## 3. Random Forest Regressor
Test R²: 0.9914

Test MSE: 7435.90

Test MAE: 41.51

Random Forest also performed well, with high R² and relatively low errors. However, it showed slightly higher test errors than XGBoost, and its test MSE (7435.90) is about ten times its training MSE (731.03), which indicates overfitting. Still, it remains a robust and accurate model for regression tasks.

## 4. Decision Tree Regressor
Test R²: 0.9834

Test MSE: 14334.52

Test MAE: 73.75

While Decision Tree achieved a perfect R² on training data, its performance on the test set dropped significantly. The high test MSE and MAE indicate severe overfitting and poor generalization. This model is the least reliable among the four.

# Final Model Selection: XGBoost Regressor

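XGBoost Regressor was selected as the final model due to its balance between predictive accuracy and generalization. It has the lowest test errors and the highest test R² among the tree-based models (Decision Tree, Random Forest, XGBoost), making it the most reliable and stable of them for deployment. Note, however, that Linear Regression achieves a far lower test error on this feature set (test MSE 0.0031 vs. 6279.76), so this selection should be revisited in light of that result.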