In [5]:
import pandas as pd

# -------------------------------
# 1. Load datasets
# -------------------------------
district_df = pd.read_csv("District_Rainfall_Normal_0.csv")
subdivision_df = pd.read_csv("Sub_Division_IMD_2017.csv")

# Clean column names (remove spaces, uppercase for consistency)
district_df.columns = district_df.columns.str.strip().str.upper()
subdivision_df.columns = subdivision_df.columns.str.strip().str.upper()

print("District dataset columns:", district_df.columns.tolist())
print("Subdivision dataset columns:", subdivision_df.columns.tolist())

# -------------------------------
# 2. Add YEAR column to district_df
# -------------------------------
# If missing YEAR, generate sequential years (assuming normal data is per year)
if "YEAR" not in district_df.columns:
    # Example: suppose the data is from 2000–2009
    years = list(range(2000, 2000 + len(district_df)))
    district_df["YEAR"] = years

# -------------------------------
# 3. Load mapping: DISTRICT → SUBDIVISION
# -------------------------------
# Example mapping (you should expand this file properly for all districts)
mapping_data = {
    "STATE/UT": ["Madhya Pradesh", "Madhya Pradesh", "Madhya Pradesh"],
    "DISTRICT": ["Bhopal", "Indore", "Jabalpur"],
    "SUBDIVISION": ["West Madhya Pradesh", "West Madhya Pradesh", "East Madhya Pradesh"]
}
mapping_df = pd.DataFrame(mapping_data)

# Merge mapping into district_df
district_df = district_df.merge(mapping_df, on=["STATE/UT","DISTRICT"], how="left")

# -------------------------------
# 4. Merge district rainfall with subdivision rainfall by YEAR + SUBDIVISION
# -------------------------------
merged_df = district_df.merge(
    subdivision_df,
    on=["YEAR","SUBDIVISION"],
    suffixes=("_DISTRICT","_SUBDIV")
)

# -------------------------------
# 5. Save merged dataset
# -------------------------------
merged_df.to_csv("merged_rainfall.csv", index=False)
print("✅ Merged dataset saved as merged_rainfall.csv")

# -------------------------------
# 6. Inspect
# -------------------------------
print(merged_df.head())


District dataset columns: ['STATE/UT', 'DISTRICT', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC', 'ANNUAL', 'JAN+FEB', 'MAM', 'JJAS', 'OND']
Subdivision dataset columns: ['SUBDIVISION', 'YEAR', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC', 'ANNUAL', 'JF', 'MAM', 'JJAS', 'OND']
✅ Merged dataset saved as merged_rainfall.csv
Empty DataFrame
Columns: [STATE/UT, DISTRICT, JAN_DISTRICT, FEB_DISTRICT, MAR_DISTRICT, APR_DISTRICT, MAY_DISTRICT, JUN_DISTRICT, JUL_DISTRICT, AUG_DISTRICT, SEP_DISTRICT, OCT_DISTRICT, NOV_DISTRICT, DEC_DISTRICT, ANNUAL_DISTRICT, JAN+FEB, MAM_DISTRICT, JJAS_DISTRICT, OND_DISTRICT, YEAR, SUBDIVISION, JAN_SUBDIV, FEB_SUBDIV, MAR_SUBDIV, APR_SUBDIV, MAY_SUBDIV, JUN_SUBDIV, JUL_SUBDIV, AUG_SUBDIV, SEP_SUBDIV, OCT_SUBDIV, NOV_SUBDIV, DEC_SUBDIV, ANNUAL_SUBDIV, JF, MAM_SUBDIV, JJAS_SUBDIV, OND_SUBDIV]
Index: []

[0 rows x 38 columns]


In [6]:
features = merged_df[["JAN_DISTRICT","FEB_DISTRICT","MAR_DISTRICT",
                      "MAM_DISTRICT","JJAS_DISTRICT","OND_DISTRICT","ANNUAL_DISTRICT"]]


In [7]:
def recommend_crop(row):
    if row["JJAS_DISTRICT"] > 800:   # very high monsoon
        return "Rice"
    elif row["OND_DISTRICT"] > 300 and row["ANNUAL_DISTRICT"] > 600:
        return "Wheat"
    elif row["ANNUAL_DISTRICT"] < 500:
        return "Pulses"
    else:
        return "Maize"

merged_df["CROP"] = merged_df.apply(recommend_crop, axis=1)


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X = merged_df[["JAN_DISTRICT","FEB_DISTRICT","MAR_DISTRICT",
               "MAM_DISTRICT","JJAS_DISTRICT","OND_DISTRICT","ANNUAL_DISTRICT"]]
y = merged_df["CROP"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.