In [25]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn import linear_model

In [26]:
# Load the rainfall table; the literal string 'NA' is parsed as a missing value.
df = pd.read_csv(
    "rainfall in india 1901-2015.csv",
    na_values=['NA'],
)

In [27]:
# Keep only the rows in which every column is populated.
df = df.dropna(how='any')

In [28]:
# Per-column count of missing values; every count should be 0 now that
# incomplete rows have been dropped.
df.isnull().sum()

SUBDIVISION    0
YEAR           0
JAN            0
FEB            0
MAR            0
APR            0
MAY            0
JUN            0
JUL            0
AUG            0
SEP            0
OCT            0
NOV            0
DEC            0
ANNUAL         0
Jan-Feb        0
Mar-May        0
Jun-Sep        0
Oct-Dec        0
dtype: int64

In [29]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [30]:
# (rows, columns) of the frame after incomplete rows were removed.
df.shape

(4090, 19)

In [31]:
# Number of distinct SUBDIVISION values (the one categorical model input).
df['SUBDIVISION'].nunique()

36

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [33]:
# Model inputs: region label, year, the twelve monthly readings and the four
# seasonal aggregates. The regression target is the ANNUAL column.
# NOTE(review): the seasonal columns look like sums of the monthly columns
# (the hand-built example rows later in the notebook add up that way), and
# ANNUAL is presumably the yearly total. If so, the target is a deterministic
# function of the inputs -- confirm this is intended.
_monthly_columns = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
]
_seasonal_columns = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']

feature_columns = ['SUBDIVISION', 'YEAR', *_monthly_columns, *_seasonal_columns]
target_column = 'ANNUAL'

In [34]:
# Split the cleaned frame into the model inputs (x) and the regression target (y).
x = df.loc[:, feature_columns]
y = df.loc[:, target_column]

In [35]:
# Route every model input to exactly one preprocessing branch.
# SUBDIVISION is the only categorical input; every other entry of
# feature_columns is treated as numeric. The numeric list is derived from
# feature_columns (order preserved) rather than retyped, so the two lists
# cannot drift apart: ColumnTransformer drops any column that is not assigned
# to a transformer (remainder='drop' is its default), so a column added to
# feature_columns but forgotten here would silently disappear from the model.
categorical_features = ['SUBDIVISION']
numeric_features = [col for col in feature_columns if col not in categorical_features]

In [36]:
# Column-wise preprocessing:
#   * numeric columns are standardised with StandardScaler;
#   * the categorical column is one-hot encoded, and handle_unknown='ignore'
#     keeps transform() from raising on a category not seen during fit.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each branch to its own column list and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

## XGBoost

In [37]:
from xgboost import XGBRegressor
# End-to-end estimator: the shared preprocessing step followed by an
# XGBoost regressor left at its default hyperparameters.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
])

In [38]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Fit the whole pipeline (preprocessing + XGBoost regressor) on the training split.
model.fit(x_train, y_train)

In [40]:
# Predict the target for the held-out test rows.
y_pred = model.predict(x_test)

In [41]:
# Held-out (test split) error metrics for the fitted pipeline.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One metric per output line.
print(
    f'Mean Squared Error: {mse}',
    f'Mean Absolute Error: {mae}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 6279.755385872519
Mean Absolute Error: 40.36816693145081
R^2 Score: 0.9927451082776634


In [42]:
# Score the model on the rows it was trained on, for comparison with the
# held-out metrics computed earlier. The results get their own train_* names
# so the test-set values stored in mse / mae / r2 are not overwritten by
# training-set numbers.
y_train_pred = model.predict(x_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
print(f'Mean Squared Error: {train_mse}')
print(f'Mean Absolute Error: {train_mae}')
print(f'R^2 Score: {train_r2}')

Mean Squared Error: 57.24661370553063
Mean Absolute Error: 5.599610485366037
R^2 Score: 0.9999288227088663


In [43]:
# Example predictions: score hand-built single-row inputs with the trained model

In [44]:
# Single hand-built observation (ANDAMAN & NICOBAR ISLANDS, 1901) holding
# every column the pipeline was fitted on.
new_data = pd.DataFrame([{
    'SUBDIVISION': 'ANDAMAN & NICOBAR ISLANDS',
    'YEAR': 1901,
    'JAN': 49.2,
    'FEB': 87.1,
    'MAR': 29.2,
    'APR': 2.3,
    'MAY': 528.8,
    'JUN': 517.5,
    'JUL': 365.1,
    'AUG': 481.1,
    'SEP': 332.6,
    'OCT': 388.5,
    'NOV': 558.2,
    'DEC': 33.6,
    'Jan-Feb': 136.3,
    'Mar-May': 560.3,
    'Jun-Sep': 1696.3,
    'Oct-Dec': 980.3,
}])

# predict() returns one value per input row; this frame has exactly one row.
predicted_annual_rainfall = model.predict(new_data)

# Report the single prediction.
print(f'Predicted Annual Rainfall: {predicted_annual_rainfall[0]}')

Predicted Annual Rainfall: 3378.59130859375


In [45]:
# Single hand-built observation (LAKSHADWEEP, 2015) holding every column the
# pipeline was fitted on.
new_data = pd.DataFrame([{
    'SUBDIVISION': 'LAKSHADWEEP',
    'YEAR': 2015,
    'JAN': 2.2,
    'FEB': 0.5,
    'MAR': 3.7,
    'APR': 87.1,
    'MAY': 133.1,
    'JUN': 296.6,
    'JUL': 257.5,
    'AUG': 146.4,
    'SEP': 160.4,
    'OCT': 165.4,
    'NOV': 231,
    'DEC': 159,
    'Jan-Feb': 2.7,
    'Mar-May': 223.9,
    'Jun-Sep': 860.9,
    'Oct-Dec': 555.4,
}])

# predict() returns one value per input row; this frame has exactly one row.
predicted_annual_rainfall = model.predict(new_data)

# Report the single prediction.
print(f'Predicted Annual Rainfall: {predicted_annual_rainfall[0]}')

Predicted Annual Rainfall: 1645.5635986328125


## Saving the model (.sav via pickle, .joblib via joblib)

In [46]:
import pickle
# Serialise the fitted pipeline with pickle. The file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes,
# rather than being left open until garbage collection.
filename = 'Rainfall_xgboost.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [47]:
import joblib

# Write the fitted pipeline with joblib, then read it straight back;
# `model` is rebound to the reloaded copy.
joblib_path = 'Rainfall_xgboost.joblib'
joblib.dump(model, joblib_path)
model = joblib.load(joblib_path)
