# **1.Import Required Libraries**


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# **2.Load and Inspect the Dataset**

In [None]:
# Load the dataset
data = pd.read_csv('dataset.csv')
# Display the first few rows
print(data.head())
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in print() would append a stray "None" to the output.
data.info()

                    State_Name District_Name  Crop_Year       Season  \
0  Andaman and Nicobar Islands      NICOBARS       2000  Kharif        
1  Andaman and Nicobar Islands      NICOBARS       2000  Kharif        
2  Andaman and Nicobar Islands      NICOBARS       2000  Kharif        
3  Andaman and Nicobar Islands      NICOBARS       2000  Whole Year    
4  Andaman and Nicobar Islands      NICOBARS       2000  Whole Year    

                  Crop    Area   N   P   K  temperature   humidity        ph  \
0             Arecanut  1254.0  90  42  43    20.879744  82.002744  6.502985   
1  Other Kharif pulses     2.0  85  58  41    21.770462  80.319644  7.038096   
2                 Rice   102.0  60  55  44    23.004459  82.320763  7.840207   
3               Banana   176.0  74  35  40    26.491096  80.158363  6.980401   
4            Cashewnut   720.0  78  42  42    20.130175  81.604873  7.628473   

     rainfall Production  
0  202.935536       2000  
1  226.655537          1  
2  26

# **3.Data Cleaning**

In [None]:
# Coerce 'Production' to numeric first: every non-numeric placeholder (this
# dataset uses '=') becomes NaN, so invalid markers do not have to be listed
# one by one. assign() builds a new frame instead of writing into a filtered
# slice, which avoids pandas' chained-assignment warning.
data = data.assign(Production=pd.to_numeric(data['Production'], errors='coerce'))
# Drop rows with missing values *after* the coercion, so rows whose
# 'Production' was invalid are removed together with the originally missing
# ones and no NaN target reaches the model.
data = data.dropna()
# Confirm all invalid entries are removed (expected output: an empty DataFrame).
# The column is numeric now, so the meaningful check is for leftover NaN values.
print(data[data['Production'].isna()])

Empty DataFrame
Columns: [State_Name, District_Name, Crop_Year, Season, Crop, Area, N, P, K, temperature, humidity, ph, rainfall, Production]
Index: []


# **4.Encode Categorical Variables**

In [None]:
# Text-valued columns that must be turned into integer codes
categorical_cols = ['State_Name', 'District_Name', 'Crop', 'Season']
# Fit one encoder per column and keep it, keyed by column name, so the integer
# codes can be mapped back to the original labels later
label_encoders = {
    column: LabelEncoder().fit(data[column]) for column in categorical_cols
}
# Replace each categorical column with its integer codes
for column, encoder in label_encoders.items():
    data[column] = encoder.transform(data[column])

# **5.Define Features and Target Variable**

In [None]:
# Predictor columns used by the model
feature_columns = ['Area', 'Season', 'Crop', 'Crop_Year']
X = data.loc[:, feature_columns]
# Target column the model is trained to predict
y = data.loc[:, 'Production']

# **6.Split the Dataset for Training and Testing**

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# **7.Feature Scaling using StandardScaler**

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# **8.Fit the Line Equation to Predict Outputs Using scikit-learn's Built-in LinearRegression**

In [None]:
# Create the linear regression estimator and fit it on the training data in
# one step; fit() returns the estimator itself
model = LinearRegression().fit(X_train, y_train)
# Keep the fitted estimator as the cell's last expression so it is displayed
model

# **9.Make Predictions and Evaluate the Model Using Mean Absolute Error, Mean Squared Error, and R-squared**

In [None]:
# Generate predictions for the held-out test features
y_pred = model.predict(X_test)
# Score the predictions against the true test targets with each metric
mae, mse, r2 = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)
# Report the three scores, one per line
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R-squared Score: {r2}",
    sep="\n",
)

Mean Absolute Error: 1509279.3874250825
Mean Squared Error: 291594576461658.3
R-squared Score: 0.0055751132619891175
