In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Step 1: Read the water-shortage CSV into a DataFrame and preview it
df = pd.read_csv(filepath_or_buffer="water_shortage_balanced.csv")
df.head(n=5)

Unnamed: 0,State,Year,Population (millions),Annual Rainfall (mm),Agricultural Land (sq km),Industrial Water Usage (billion liters),Domestic Water Usage (billion liters),Groundwater Level (meters below surface),Reservoir Capacity (%),Total Water Consumption (billion liters),Water Shortage Risk
0,Tamil Nadu,2013,179.34,997.0,51865.1,143.82,240.53,35.0,16.7,384.35,Yes
1,Bihar,2020,29.93,2576.9,74682.9,349.2,60.57,3.5,74.7,409.77,No
2,Andhra Pradesh,2004,160.14,732.7,131604.3,367.33,96.27,31.7,70.7,463.6,No
3,Chhattisgarh,2002,94.05,868.2,146354.3,378.86,244.61,38.2,5.1,623.47,Yes
4,Madhya Pradesh,2009,60.95,957.0,79678.0,259.11,139.69,43.1,7.6,398.8,Yes


In [3]:
# Print the column names, non-null counts and dtypes to check the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   State                                     5000 non-null   object 
 1   Year                                      5000 non-null   int64  
 2   Population (millions)                     5000 non-null   float64
 3   Annual Rainfall (mm)                      5000 non-null   float64
 4   Agricultural Land (sq km)                 5000 non-null   float64
 5   Industrial Water Usage (billion liters)   5000 non-null   float64
 6   Domestic Water Usage (billion liters)     5000 non-null   float64
 7   Groundwater Level (meters below surface)  5000 non-null   float64
 8   Reservoir Capacity (%)                    5000 non-null   float64
 9   Total Water Consumption (billion liters)  5000 non-null   float64
 10  Water Shortage Risk                 

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Year,Population (millions),Annual Rainfall (mm),Agricultural Land (sq km),Industrial Water Usage (billion liters),Domestic Water Usage (billion liters),Groundwater Level (meters below surface),Reservoir Capacity (%),Total Water Consumption (billion liters)
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2012.295,124.019458,1680.06072,96838.50242,275.012536,156.48396,31.40428,39.51038,431.496496
std,7.853452,73.625661,1074.138495,54274.062927,141.485014,83.739021,13.057648,28.199296,155.727042
min,2000.0,2.06,500.8,1000.7,1.11,0.72,1.0,5.0,16.59
25%,2005.0,55.14,805.1,52936.7,153.3675,92.98,22.0,16.7,334.125
50%,2012.0,121.96,1130.75,92203.3,276.72,155.675,35.0,28.2,417.49
75%,2019.0,191.5575,2589.175,139033.325,399.725,232.6325,41.9,62.825,530.63
max,2025.0,249.98,3996.1,199934.5,499.7,299.96,49.9,99.9,796.24


In [5]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0,0
State,0
Year,0
Population (millions),0
Annual Rainfall (mm),0
Agricultural Land (sq km),0
Industrial Water Usage (billion liters),0
Domestic Water Usage (billion liters),0
Groundwater Level (meters below surface),0
Reservoir Capacity (%),0
Total Water Consumption (billion liters),0


In [6]:
# Step 2: Preprocess Data
# Encode categorical column 'State' as integer codes.
# The encoder gets its own dedicated name so that a LabelEncoder fitted later
# on a different column cannot overwrite it; state_encoder.classes_ and
# state_encoder.inverse_transform() recover the original state names.
# NOTE(review): the codes follow sorted-label order and carry no meaning, yet
# a linear model will read them as an ordered magnitude - consider one-hot
# encoding this column instead.
from sklearn.preprocessing import LabelEncoder
state_encoder = LabelEncoder()
df['State'] = state_encoder.fit_transform(df['State'])

In [7]:
# Step 2 (continued): turn the 'Water Shortage Risk' labels into integer codes.
# LabelEncoder numbers the classes in sorted order, so "No" -> 0 and "Yes" -> 1.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(df['Water Shortage Risk'])
df['Water Shortage Risk'] = label_encoder.transform(df['Water Shortage Risk'])

In [8]:
# Drop Year column as it is not needed.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError once "Year" is gone.
df = df.drop(columns=["Year"], errors="ignore")


In [9]:
# Preview the first rows after encoding the categorical columns and dropping Year.
df.head()

Unnamed: 0,State,Population (millions),Annual Rainfall (mm),Agricultural Land (sq km),Industrial Water Usage (billion liters),Domestic Water Usage (billion liters),Groundwater Level (meters below surface),Reservoir Capacity (%),Total Water Consumption (billion liters),Water Shortage Risk
0,22,179.34,997.0,51865.1,143.82,240.53,35.0,16.7,384.35,1
1,3,29.93,2576.9,74682.9,349.2,60.57,3.5,74.7,409.77,0
2,0,160.14,732.7,131604.3,367.33,96.27,31.7,70.7,463.6,0
3,4,94.05,868.2,146354.3,378.86,244.61,38.2,5.1,623.47,1
4,12,60.95,957.0,79678.0,259.11,139.69,43.1,7.6,398.8,1


In [10]:
# Step 3: Separate the target column from the predictor columns
from sklearn.model_selection import train_test_split
y = df["Water Shortage Risk"]  # target: the encoded shortage-risk label
x = df.drop("Water Shortage Risk", axis=1)  # features: every remaining column

In [11]:
# Step 4: Hold out 20% of the rows for testing; the fixed seed makes the
# split reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Step 5: Standardize the features (zero mean, unit variance).
# The scaler learns its statistics from the training split only and then
# applies that same transformation to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)



In [13]:
# Step 6: Fit a logistic-regression classifier on the scaled training data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(xtrain, ytrain)
model  # fit() returns the estimator itself; show it as the cell output




In [14]:
# Step 7: Make predictions
# Predict a class label for every row of the scaled, held-out test features.
ypred = model.predict(xtest)



In [15]:
# Step 8: Score the test-set predictions against the true labels
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
class_report = classification_report(ytest, ypred)
conf_matrix = confusion_matrix(ytest, ypred)
accuracy = accuracy_score(ytest, ypred)

# Report each metric under its heading, in the same order as before
for heading, value in (
    ("Model Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(heading, value)

Model Accuracy: 0.981
Confusion Matrix:
 [[589  19]
 [  0 392]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98       608
           1       0.95      1.00      0.98       392

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000

