In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [2]:
# Location of the crop recommendation CSV (fetched over HTTPS, so this cell needs network access).
DATA_URL = "https://raw.githubusercontent.com/yash9939/Crop_Recomendation_System/refs/heads/main/Dataset/Crop%20recommendation%20dataset.csv"

# Read the raw dataset into a DataFrame.
df = pd.read_csv(DATA_URL)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,CROPS,TYPE_OF_CROP,SOIL,SEASON,SOWN,HARVESTED,WATER_SOURCE,SOIL_PH,SOIL_PH_HIGH,CROPDURATION,...,WATERREQUIRED,WATERREQUIRED_MAX,RELATIVE_HUMIDITY,RELATIVE_HUMIDITY_MAX,N,N_MAX,P,P_MAX,K,K_MAX
0,rice,cereals,Alluvial soil,kharif,Jun,Sep,irrigated,7.6,8.0,116.9,...,2462.3,2500,73.8,80,82.4,100,40.7,60,42.2,60
1,rice,cereals,Loamy soil,kharif,Jul,Oct,rainfed,6.2,8.0,117.9,...,1237.5,2500,60.9,80,90.5,100,51.3,60,46.2,60
2,rice,cereals,Clay soil,kharif,Jun,Sep,irrigated,6.7,8.0,117.7,...,1075.1,2500,67.5,80,86.2,100,50.7,60,44.4,60
3,rice,cereals,Alluvial soil,kharif,Jul,Oct,rainfed,6.1,8.0,149.8,...,1549.9,2500,73.6,80,91.3,100,51.3,60,44.5,60
4,rice,cereals,Loamy soil,kharif,Jun,Sep,irrigated,8.0,8.0,131.7,...,1306.4,2500,60.3,80,81.3,100,48.6,60,51.0,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56995,small onion,bulbvegetables,sandy Loamy soil,Zaid,Mar,Mar,rainfed,6.2,7.0,86.6,...,701.9,750,66.0,75,73.8,100,44.5,60,57.9,60
56996,small onion,bulbvegetables,sandy Loamy soil,Zaid,Apr,Apr,irrigated,6.4,7.0,70.3,...,653.5,750,66.2,75,72.4,100,50.5,60,55.1,60
56997,small onion,bulbvegetables,sandy Loamy soil,Zaid,May,May,rainfed,6.4,7.0,86.3,...,743.3,750,65.6,75,70.5,100,59.8,60,58.5,60
56998,small onion,bulbvegetables,sandy Loamy soil,Zaid,Jun,Jun,irrigated,6.2,7.0,89.1,...,736.8,750,74.2,75,66.8,100,40.2,60,55.3,60


In [3]:
# Normalise the free-text soil labels so they can be matched against a lookup table:
# lower-case them, turn non-breaking spaces into ordinary spaces, then trim both ends.
soil_labels = df["SOIL"].str.lower()
soil_labels = soil_labels.str.replace("\xa0", " ", regex=True)
df["SOIL"] = soil_labels.str.strip()


In [4]:
# Lookup table from a normalised (lower-cased, trimmed) soil label to a coarse soil
# group. Labels that are not listed here map to NaN when used with Series.map.
soil_map = {
    # Alluvial
    "alluvial soil": "Alluvial",

    # Loamy group
    "loamy soil": "Loamy",
    "clay loamy soil": "Loamy",
    "silty loamy soil": "Loamy",
    "sandy loamy soil": "Loamy",
    "red loamy soil": "Loamy",
    "brown loamy soil": "Loamy",
    "light loamy soil": "Loamy",
    "well-drained loamy soil": "Loamy",
    "well-grained deep loamy moist soil": "Loamy",
    "rich red loamy soil": "Loamy",
    "red lateritic loamy soil": "Loamy",

    # Clayey
    "clay soil": "Clayey",
    "sandy clay loamy soil": "Clayey",
    "salty clay loamy soil": "Clayey",

    # Sandy
    "sandy soil": "Sandy",

    # Black soil
    "black soil": "Black",
    "shallow black soil": "Black",
    "medium black soil": "Black",
    "heavy black soil": "Black",
    "black cotton soil": "Black",
    "cotton soil": "Black",

    # Red soil
    "red soil": "Red",

    # Laterite
    "laterite soil": "Laterite",

    # Other functional soils
    "well-drained soil": "Loamy",
    "friable soil": "Loamy",
    "deep soil": "Loamy",
    "heavy soil": "Clayey",
    "light soil": "Sandy",

    # Variant spelling of "light soil"; listed here so the very first mapping pass
    # already covers it instead of relying on a later in-place patch of this dict.
    # NOTE(review): presumably a truncated label present in the CSV - confirm.
    "light soi": "Sandy",
}


In [5]:
# First pass: translate each normalised SOIL label into its coarse group via soil_map.
# Labels that are missing from soil_map come out as NaN.
df["SOIL_GROUP"] = df["SOIL"].map(soil_map)


In [6]:
# Patch the lookup so the label "light soi" lands in the same group as "light soil".
# NOTE(review): presumably a truncated spelling that occurs in the dataset - confirm against the CSV.
soil_map["light soi"] = "Sandy"


In [7]:
# Re-apply the lookup now that soil_map has been patched.
df["SOIL_GROUP"] = df["SOIL"].map(soil_map)

# Sanity check: list the SOIL labels that still have no group (an empty array means full coverage).
df.loc[df["SOIL_GROUP"].isna(), "SOIL"].unique()


array([], dtype=object)

In [8]:
# The raw SOIL text has been replaced by SOIL_GROUP, so remove it.
# Rebinding (instead of inplace=True) plus errors="ignore" keeps this cell re-runnable:
# a second execution no longer raises KeyError because the column is already gone.
df = df.drop(columns=["SOIL"], errors="ignore")


In [9]:
# Encode the crop names as integer class ids. The fitted encoder `le` is kept so that
# predicted ids can be turned back into crop names later.
le = LabelEncoder()
le.fit(df["CROPS"])
df["CROPS"] = le.transform(df["CROPS"])


In [10]:
# One-hot encode SOIL_GROUP into SOIL_GROUP_<group> indicator columns
# (drop_first is not set, so every group keeps its own column).
df = pd.get_dummies(df, columns=["SOIL_GROUP"])


In [11]:
# Target: the label-encoded crop id. Features: every other column of df.
y = df["CROPS"]
X = df.drop(columns=["CROPS"])


In [12]:
# Names of the feature columns that are still stored with object dtype.
cat_cols = X.select_dtypes(include=["object"]).columns
print(cat_cols)


Index(['TYPE_OF_CROP', 'SEASON', 'SOWN', 'HARVESTED', 'WATER_SOURCE'], dtype='object')


In [13]:
# One-hot encode the categorical feature columns collected in cat_cols.
# drop_first=True leaves out the first level of each column, which then acts as the
# implicit reference category (all of that column's indicators are 0).
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)


In [14]:
# 70/30 hold-out split, stratified on the crop label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [10]:
from sklearn.feature_selection import RFE

In [16]:
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by a random forest, keeping 15 features.
rf_base = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rfe = RFE(estimator=rf_base, n_features_to_select=15)

# Fit the selector on the training split only, then reduce both splits with it.
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Names of the columns that survived the elimination.
selected_features = X_train.columns[rfe.get_support()]
print("Selected Features:", selected_features, sep="\n")


Selected Features:
Index(['SOIL_PH_HIGH', 'CROPDURATION', 'CROPDURATION_MAX', 'MAX_TEMP',
       'WATERREQUIRED', 'WATERREQUIRED_MAX', 'RELATIVE_HUMIDITY',
       'RELATIVE_HUMIDITY_MAX', 'N_MAX', 'P_MAX', 'K', 'K_MAX',
       'TYPE_OF_CROP_vegetables', 'SEASON_kharif', 'SEASON_rabi'],
      dtype='object')


In [17]:
# Train a forest on the RFE-selected features and score it on the hold-out split.
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train_rfe, y_train)

y_pred = rf.predict(X_test_rfe)

# NOTE(review): a perfect score here would be suspicious - the selected features include
# crop-specific columns (TYPE_OF_CROP_*, *_MAX), which presumably leak the label; verify.
print("Accuracy:", 100 * accuracy_score(y_test, y_pred))


Accuracy: 100.0


In [18]:
# Show every feature column name currently in X.
print(list(X.columns))


['SOIL_PH', 'SOIL_PH_HIGH', 'CROPDURATION', 'CROPDURATION_MAX', 'TEMP', 'MAX_TEMP', 'WATERREQUIRED', 'WATERREQUIRED_MAX', 'RELATIVE_HUMIDITY', 'RELATIVE_HUMIDITY_MAX', 'N', 'N_MAX', 'P', 'P_MAX', 'K', 'K_MAX', 'SOIL_GROUP_Alluvial', 'SOIL_GROUP_Black', 'SOIL_GROUP_Clayey', 'SOIL_GROUP_Laterite', 'SOIL_GROUP_Loamy', 'SOIL_GROUP_Red', 'SOIL_GROUP_Sandy', 'TYPE_OF_CROP_bulbvegetables', 'TYPE_OF_CROP_cereals', 'TYPE_OF_CROP_colecrops', 'TYPE_OF_CROP_fibre crop', 'TYPE_OF_CROP_millets', 'TYPE_OF_CROP_oil seeds', 'TYPE_OF_CROP_pulses', 'TYPE_OF_CROP_sugar crops', 'TYPE_OF_CROP_vegetables', 'SEASON_kharif', 'SEASON_rabi', 'SOWN_Dec', 'SOWN_Jul', 'SOWN_Jun', 'SOWN_Mar', 'SOWN_May', 'SOWN_Nov', 'SOWN_Oct', 'HARVESTED_Jul', 'HARVESTED_Jun', 'HARVESTED_Mar', 'HARVESTED_May', 'HARVESTED_Oct', 'HARVESTED_Sep', 'WATER_SOURCE_rainfed']


In [19]:
# Columns to remove from the feature set before retraining: crop duration, water
# requirement, and the upper-bound (*_MAX / *_HIGH / MAX_*) columns.
drop_cols = [
    "SOIL_PH_HIGH",
    "CROPDURATION", "CROPDURATION_MAX",
    "WATERREQUIRED", "WATERREQUIRED_MAX",
    "N_MAX", "P_MAX", "K_MAX",
    "RELATIVE_HUMIDITY_MAX",
    "MAX_TEMP",
]

# Keep every column that is not listed; names absent from X simply match nothing,
# so the cell can be re-run safely.
X = X.loc[:, ~X.columns.isin(drop_cols)]


In [20]:
# Remove every one-hot TYPE_OF_CROP_* column from the feature set.
X = X.drop(columns=[name for name in X.columns if name.startswith("TYPE_OF_CROP_")])


In [21]:
# Remove the sowing-month (SOWN_*) and harvest-month (HARVESTED_*) indicator columns.
X = X.drop(
    columns=[name for name in X.columns if name.startswith(("SOWN_", "HARVESTED_"))]
)


In [22]:
from sklearn.model_selection import train_test_split

# Redo the 70/30 stratified split (same seed) on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Forest with capped depth and a minimum leaf size, trained on the reduced feature set.
rf = RandomForestClassifier(
    n_estimators=200, max_depth=15, min_samples_leaf=15, random_state=42, n_jobs=-1
).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Hold-out accuracy as a percentage.
print("Accuracy:", 100 * accuracy_score(y_test, y_pred))


NameError: name 'X_train' is not defined

In [None]:
from sklearn.metrics import accuracy_score

# Shallower forest with larger leaf/split minimums and sqrt feature sampling per split.
rf = RandomForestClassifier(
    n_estimators=150, max_depth=10,
    min_samples_leaf=25, min_samples_split=30,
    max_features="sqrt",
    random_state=42, n_jobs=-1,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Hold-out accuracy as a percentage.
print("Accuracy:", 100 * accuracy_score(y_test, y_pred))


Accuracy: 93.32163742690058


In [None]:
from sklearn.metrics import accuracy_score

# 120 trees of depth <= 8; each split considers 60% of the features.
rf = RandomForestClassifier(
    n_estimators=120, max_depth=8,
    min_samples_leaf=40, min_samples_split=50,
    max_features=0.6,
    random_state=42, n_jobs=-1,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Hold-out accuracy as a percentage.
print("Accuracy:", 100 * accuracy_score(y_test, y_pred))


Accuracy: 89.58479532163743


In [None]:
from sklearn.metrics import accuracy_score

# Same settings as the 120-tree variant but with 100 trees.
rf = RandomForestClassifier(
    n_estimators=100, max_depth=8,
    min_samples_leaf=40, min_samples_split=50,
    max_features=0.6,
    random_state=42, n_jobs=-1,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Hold-out accuracy as a percentage.
print("Accuracy:", 100 * accuracy_score(y_test, y_pred))


Accuracy: 89.39181286549707


In [29]:
# Report how many feature columns X currently has.
print("Number of features used:", len(X.columns))



Number of features used: 16


In [2]:
import joblib
# Persist the fitted forest. A later cell loads "crop_rf_model.pkl", so this dump has to
# run; with it commented out, a fresh kernel either fails there or picks up a stale file.
joblib.dump(rf, "crop_rf_model.pkl")


In [None]:
# Persist the current feature-column names of X, in order, as a plain list.
# NOTE(review): the identical dump appears again in a later cell; one of the two looks redundant.
joblib.dump(X.columns.tolist(), "model_features.pkl")


In [31]:
# Persist the fitted label encoder so predicted class ids can be decoded back to
# crop names after reloading.
joblib.dump(le, "crop_label_encoder.pkl")


['crop_label_encoder.pkl']

In [32]:
# Persist the current feature-column names of X, in order, as a plain list.
# NOTE(review): an earlier cell contains the same dump; one of the two looks redundant.
joblib.dump(X.columns.tolist(), "model_features.pkl")


['model_features.pkl']

In [13]:
# Reload the persisted artefacts, in order: the forest, the label encoder, the feature-name list.
# NOTE: joblib.load unpickles its input - only load files that this notebook produced itself.
rf, le, features = (
    joblib.load(path)
    for path in ("crop_rf_model.pkl", "crop_label_encoder.pkl", "model_features.pkl")
)


In [14]:
# Reload the saved feature-name list and print it; the inference row built further
# down uses exactly these names as its columns.
features = joblib.load("model_features.pkl")
print(features)


['SOIL_PH', 'TEMP', 'RELATIVE_HUMIDITY', 'N', 'P', 'K', 'SOIL_GROUP_Alluvial', 'SOIL_GROUP_Black', 'SOIL_GROUP_Clayey', 'SOIL_GROUP_Laterite', 'SOIL_GROUP_Loamy', 'SOIL_GROUP_Red', 'SOIL_GROUP_Sandy', 'SEASON_kharif', 'SEASON_rabi', 'WATER_SOURCE_rainfed']


In [None]:

# One all-zero row with a column for every saved feature name; individual fields
# are filled in afterwards.
blank_row = np.zeros((1, len(features)))
input_data = pd.DataFrame(blank_row, columns=features)


In [16]:
# Fill in one example sample: the numeric readings first, then switch on the
# indicator columns for loamy soil, kharif season and rain-fed water source.
input_data = input_data.assign(**{
    "SOIL_PH": 6.5,
    "N": 80,
    "P": 40,
    "K": 35,
    "TEMP": 28,
    "RELATIVE_HUMIDITY": 70,
    "SOIL_GROUP_Loamy": 1,
    "SEASON_kharif": 1,
    "WATER_SOURCE_rainfed": 1,
})


In [17]:
# Predict the encoded class for the sample row, then decode it back to a crop name.
pred = rf.predict(input_data)
crop = le.inverse_transform(pred)

print(f"Recommended Crop: {crop[0]}")


Recommended Crop: jute


In [19]:
# Rank the crops by predicted class probability and keep the three most likely.
probs = rf.predict_proba(input_data)[0]
top_idx = np.argsort(probs)[::-1][:3]

# predict_proba columns are ordered like rf.classes_, so turn column positions into
# encoded labels before decoding. Passing raw positions to the encoder is only correct
# when the forest saw every class 0..n-1 during training.
top_labels = rf.classes_[top_idx]

top_crops = pd.DataFrame({
    "Crop": le.inverse_transform(top_labels),
    "Suitability_Score": probs[top_idx]
})

top_crops


Unnamed: 0,Crop,Suitability_Score
0,jute,0.35269
1,soyabean,0.248195
2,rice,0.101416
