<a href="https://colab.research.google.com/github/surabhipandey18/FoodAgri/blob/main/crop_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import sklearn as sk
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Hackathon/Crop Prediction dataset.csv')
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Temperature,Humidity,Soil_Moisture,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,36,35,45,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,37,40,46,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,36,41,50,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,37,42,55,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,36,40,54,720.0,165.0


In [3]:
df.dropna(inplace=True)

In [4]:
df = df.drop('Crop_Year', axis = 1)
df.head()

Unnamed: 0,State_Name,District_Name,Season,Crop,Temperature,Humidity,Soil_Moisture,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,Kharif,Arecanut,36,35,45,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,Kharif,Other Kharif pulses,37,40,46,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,Kharif,Rice,36,41,50,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,Whole Year,Banana,37,42,55,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,Whole Year,Cashewnut,36,40,54,720.0,165.0


In [11]:
categorical_col = ['State_Name', 'District_Name', 'Season', 'Crop']
numerical_col = ['Temperature', 'Humidity', 'Soil_Moisture', 'Area']
target = 'Production'

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_col)
    ],
    remainder='passthrough')

In [14]:
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)


In [15]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model)
])

In [17]:
X = df[categorical_col + numerical_col]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [19]:
y_pred = pipeline.predict(X_test)

In [23]:
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

RMSE: 12537556144597.45
R² Score: 0.78


In [26]:
new_data = pd.DataFrame({'State_Name': ['Chhattisgarh'],
    'District_Name': ['JANJGIR-CHAMPA'],
    'Season': ['Kharif'],
    'Crop': ['Sunflower'],
    'Temperature': [36],
    'Humidity': [41],
    'Soil_Moisture' : [50],
    'Area' : [450.0]  # Replace with a relevant area value
}
   )

In [28]:
yield_prediction = pipeline.predict(new_data)[0]
    # Clip the prediction at 0
yield_prediction = max(0, yield_prediction)
print(f"Predicted Yield: {yield_prediction:.2f} tons/hectare")

Predicted Yield: 0.00 tons/hectare


In [31]:
import joblib

joblib.dump(pipeline, 'crop_prediction_pipeline.pkl')

print("Pipeline saved successfully as 'crop_prediction_pipeline.pkl'")

Pipeline saved successfully as 'crop_prediction_pipeline.pkl'
