In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Load data
crop_data = pd.read_csv("/content/crop_dataset_cleaned.csv")
agro_data = pd.read_csv("/content/combined_district_data.csv")

# Keep only the crop of interest (Bajra)
crop_name = "Bajra"
crop_df = crop_data[crop_data["crop"].str.lower() == crop_name.lower()]

# Merge crop data with agro data on District (district_name vs District)
agro_df = agro_data.copy()
agro_df.rename(columns={"District":"district_name"}, inplace=True)
full_df = crop_df.merge(agro_df, on="district_name", how="left")

# Sort by district & year for lag features
full_df = full_df.sort_values(by=["district_name", "season", "year"])

# Create lag features (previous year area & yield)
full_df["area_prev_year"] = full_df.groupby(["district_name", "season"])["area"].shift(1)
full_df["yield_prev_year"] = full_df.groupby(["district_name", "season"])["yield"].shift(1)

# Keep only numeric columns for modeling + lag features
numeric_cols = full_df.select_dtypes(include=np.number).columns.tolist()
numeric_cols.remove("production")  # target columns
numeric_cols.remove("yield")
numeric_cols.remove("area")

feature_cols = ["area_prev_year", "yield_prev_year"] + numeric_cols


In [22]:
results = []

districts = full_df["district_name"].unique()
seasons = full_df["season"].unique()

for district in districts:
    for season in seasons:
        # Filter data for district + season
        df_sub = full_df[(full_df["district_name"] == district) &
                         (full_df["season"] == season)]

        # If no historical data, skip
        if df_sub.shape[0] < 2:  # less than 2 years → cannot create lag
            results.append({
                "district_name": district,
                "season": season,
                "area_next_year": 0,
                "yield_next_year": 0,
                "production_next_year": 0,
                "note": "No history"
            })
            continue

        # Prepare features and targets
        X = df_sub[feature_cols]
        y_area = df_sub["area"]
        y_yield = df_sub["yield"]

        # Impute missing values
        imputer = SimpleImputer(strategy="median")
        X_imputed = imputer.fit_transform(X)

        # Train Area model
        rf_area = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
        rf_area.fit(X_imputed, y_area)

        # Train Yield model
        rf_yield = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
        rf_yield.fit(X_imputed, y_yield)

        # Predict next year's features (use last row as proxy)
        X_next = X.iloc[-1:].copy()
        X_next_imputed = imputer.transform(X_next)

        area_pred = rf_area.predict(X_next_imputed)[0]
        yield_pred = rf_yield.predict(X_next_imputed)[0]
        production_pred = area_pred * yield_pred

        results.append({
            "district_name": district,
            "season": season,
            "area_next_year": area_pred,
            "yield_next_year": yield_pred,
            "production_next_year": production_pred,
            "note": "OK"
        })


In [25]:
results = []

for district in districts:
    for season in seasons:
        df_sub = full_df[(full_df["district_name"] == district) &
                         (full_df["season"] == season)]

        # If not enough data or no valid features, skip
        if df_sub.shape[0] < 2:
            results.append({
                "district_name": district,
                "season": season,
                "area_next_year": 0,
                "yield_next_year": 0,
                "production_next_year": 0,
                "note": "No history"
            })
            continue

        # Prepare features
        X = df_sub[feature_cols]
        y_area = df_sub["area"]
        y_yield = df_sub["yield"]

        # Impute missing values
        imputer = SimpleImputer(strategy="median")
        X_imp = imputer.fit_transform(X)

        # Check if after imputation we have any features
        if X_imp.shape[1] == 0:
            results.append({
                "district_name": district,
                "season": season,
                "area_next_year": 0,
                "yield_next_year": 0,
                "production_next_year": 0,
                "note": "No valid features"
            })
            continue

        # Train models
        rf_area = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
        rf_area.fit(X_imp, y_area)

        rf_yield = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
        rf_yield.fit(X_imp, y_yield)

        # Predict next year's features
        X_next = X.iloc[-1:].copy()
        X_next_imp = imputer.transform(X_next)

        area_pred = rf_area.predict(X_next_imp)[0]
        yield_pred = rf_yield.predict(X_next_imp)[0]
        production_pred = area_pred * yield_pred

        results.append({
            "district_name": district,
            "season": season,
            "area_next_year": area_pred,
            "yield_next_year": yield_pred,
            "production_next_year": production_pred,
            "note": "OK"
        })


In [26]:
results_df = pd.DataFrame(results)
results_df.to_csv("bajra_forecast_districtwise.csv", index=False)
print(results_df.head())


     district_name  season  area_next_year  yield_next_year  \
0         Bagalkot  kharif       17474.815          1.51460   
1         Bagalkot    rabi           0.000          0.00000   
2  Bangalore rural  kharif          11.780          1.36670   
3  Bangalore rural    rabi           0.000          0.00000   
4          Belgaum  kharif        8139.260          1.18485   

   production_next_year        note  
0          26467.354799          OK  
1              0.000000  No history  
2             16.099726          OK  
3              0.000000  No history  
4           9643.802211          OK  


In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Lists to store actual and predicted values
actual_area = []
pred_area = []
actual_yield = []
pred_yield = []

# Only include districts/seasons with enough history
for district in districts:
    for season in seasons:
        df_sub = full_df[(full_df["district_name"] == district) &
                         (full_df["season"] == season)]

        # Skip if not enough data
        if df_sub.shape[0] < 2:
            continue

        # Leave-last-year-out
        train_df = df_sub.iloc[:-1]
        test_df = df_sub.iloc[-1:]

        X_train = train_df[feature_cols]
        y_train_area = train_df["area"]
        y_train_yield = train_df["yield"]

        X_test = test_df[feature_cols]
        y_test_area = test_df["area"]
        y_test_yield = test_df["yield"]

        # Impute missing values
        imputer = SimpleImputer(strategy="median")
        X_train_imp = imputer.fit_transform(X_train)
        X_test_imp = imputer.transform(X_test)

        # Skip if no valid features
        if X_train_imp.shape[1] == 0:
            continue

        # Train models
        rf_area = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
        rf_area.fit(X_train_imp, y_train_area)
        rf_yield = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
        rf_yield.fit(X_train_imp, y_train_yield)

        # Predict on last year
        area_pred_val = rf_area.predict(X_test_imp)[0]
        yield_pred_val = rf_yield.predict(X_test_imp)[0]

        # Store actuals and predictions
        actual_area.append(y_test_area.values[0])
        pred_area.append(area_pred_val)
        actual_yield.append(y_test_yield.values[0])
        pred_yield.append(yield_pred_val)

# Compute metrics
mae_area = mean_absolute_error(actual_area, pred_area)
rmse_area = np.sqrt(mean_squared_error(actual_area, pred_area))

mae_yield = mean_absolute_error(actual_yield, pred_yield)
rmse_yield = np.sqrt(mean_squared_error(actual_yield, pred_yield))

# Production metrics
actual_prod = np.array(actual_area) * np.array(actual_yield)
pred_prod = np.array(pred_area) * np.array(pred_yield)
mae_prod = mean_absolute_error(actual_prod, pred_prod)
rmse_prod = np.sqrt(mean_squared_error(actual_prod, pred_prod))

print("=== District-wise Forecast Accuracy (for districts with history) ===")
print(f"Area MAE: {mae_area:.2f} hectares, RMSE: {rmse_area:.2f} hectares")
print(f"Yield MAE: {mae_yield:.2f} t/ha, RMSE: {rmse_yield:.2f} t/ha")
print(f"Production MAE: {mae_prod:.2f} t, RMSE: {rmse_prod:.2f} t")


=== District-wise Forecast Accuracy (for districts with history) ===
Area MAE: 2309.22 hectares, RMSE: 5905.34 hectares
Yield MAE: 0.19 t/ha, RMSE: 0.29 t/ha
Production MAE: 2194.52 t, RMSE: 6093.32 t
