# Crop Yield Prediction Model

This notebook trains a Gradient Boosting regression model on synthetically generated data to predict total crop yield (kg) from environmental and agricultural factors.

## Features Used
- Crop Type (Rice, Wheat, Cotton, Sugarcane, Maize, Soybean, Potato)
- State (Punjab, Haryana, UP, MP, Maharashtra, Karnataka, TN, WB)
- Area (hectares)
- Rainfall (mm)
- Temperature (°C)
- Fertilizer (kg)
- Pesticide (kg)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

## Generate Synthetic Data

In [None]:
# Build a synthetic crop-yield dataset containing exactly `n_samples` rows.
np.random.seed(42)  # fixed seed so the generated data is reproducible
n_samples = 2500

crops = ["rice", "wheat", "cotton", "sugarcane", "maize", "soybean", "potato"]
# Baseline per-hectare yield for each crop, before any adjustment factors.
crop_yield_base = {
    "rice": 3000, "wheat": 3500, "cotton": 1500, "sugarcane": 70000,
    "maize": 4000, "soybean": 2500, "potato": 25000
}
states = ["punjab", "haryana", "up", "mp", "maharashtra", "karnataka", "tn", "wb"]

# Spread n_samples over the crops as evenly as possible. Plain floor division
# would silently drop the remainder (2500 // 7 * 7 == 2499), so the first
# `extra_rows` crops each receive one additional row.
rows_per_crop, extra_rows = divmod(n_samples, len(crops))

crop_data = []
for crop_index, crop in enumerate(crops):
    base = crop_yield_base[crop]
    n_rows = rows_per_crop + (1 if crop_index < extra_rows else 0)
    for _ in range(n_rows):
        area = np.random.uniform(1, 50)
        rainfall = np.random.uniform(200, 1500)
        temp = np.random.uniform(15, 40)
        fertilizer = np.random.uniform(0, 500)
        pesticide = np.random.uniform(0, 100)
        state = np.random.choice(states)

        # Environmental factors affect yield.
        # Rainfall multiplier: rainfall / 800, clamped to [0.6, 1.2].
        rain_factor = min(1.2, max(0.6, rainfall / 800))
        # Temperature multiplier: falls off linearly with distance from 25,
        # clamped to [0.7, 1.1]. The raw value never exceeds 1, so only the
        # lower clamp can bind.
        temp_factor = min(1.1, max(0.7, 1 - abs(temp - 25) / 30))
        yield_per_ha = base * rain_factor * temp_factor * (1 + 0.001 * fertilizer) * (1 - 0.001 * pesticide)
        # Gaussian noise whose standard deviation is 10% of the per-hectare
        # yield (it is not scaled by area).
        total_yield = yield_per_ha * area + np.random.normal(0, yield_per_ha * 0.1)

        crop_data.append({
            "crop": crop,
            "state": state,
            "area": area,
            "rainfall": rainfall,
            "temp": temp,
            "fertilizer": fertilizer,
            "pesticide": pesticide,
            "yield": total_yield
        })

df = pd.DataFrame(crop_data)

# Ordinal-encode the categorical columns by their position in the lists above.
crop_map = {crop: i for i, crop in enumerate(crops)}
state_map = {state: i for i, state in enumerate(states)}

df["crop_encoded"] = df["crop"].map(crop_map)
df["state_encoded"] = df["state"].map(state_map)
df.head()

## Train Model

In [None]:
# Assemble the feature matrix and the regression target from the encoded frame.
feature_columns = [
    "crop_encoded",
    "state_encoded",
    "area",
    "rainfall",
    "temp",
    "fertilizer",
    "pesticide",
]
X = df[feature_columns]
y = df["yield"]

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits so no test information leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the gradient-boosted regressor on the scaled training data.
gbr_params = {"n_estimators": 200, "max_depth": 8, "random_state": 42}
model = GradientBoostingRegressor(**gbr_params)
model.fit(X_train_scaled, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("Model Performance:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:,.0f} kg")
print(f"MAE: {mae:,.0f} kg")

## Feature Importance

In [None]:
# Display labels, listed in the same order as the columns the model was fit on.
feature_names = [
    "Crop Type",
    "State",
    "Area",
    "Rainfall",
    "Temperature",
    "Fertilizer",
    "Pesticide",
]
importance = model.feature_importances_

# Horizontal bar chart with one bar per feature, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importance, y=feature_names, palette="Greens_d", ax=ax)
ax.set_title("Feature Importance - Crop Yield Prediction")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()