In [1]:
import pandas as pd
import numpy as np
import joblib
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
# STEP 1: Load Soil Dataset
soil_df = pd.read_csv("Crop_recommendation.csv", usecols=["N", "P", "K", "ph", "label"])

In [3]:
soil_df.head()

Unnamed: 0,N,P,K,ph,label
0,90,42,43,6.502985,rice
1,85,58,41,7.038096,rice
2,60,55,44,7.840207,rice
3,74,35,40,6.980401,rice
4,78,42,42,7.628473,rice


In [4]:
soil_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       2200 non-null   int64  
 1   P       2200 non-null   int64  
 2   K       2200 non-null   int64  
 3   ph      2200 non-null   float64
 4   label   2200 non-null   object 
dtypes: float64(1), int64(3), object(1)
memory usage: 86.1+ KB


In [5]:
soil_df.describe()

Unnamed: 0,N,P,K,ph
count,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,6.46948
std,36.917334,32.985883,50.647931,0.773938
min,0.0,5.0,5.0,3.504752
25%,21.0,28.0,20.0,5.971693
50%,37.0,51.0,32.0,6.425045
75%,84.25,68.0,49.0,6.923643
max,140.0,145.0,205.0,9.935091


In [6]:
soil_df.isnull().sum()

N        0
P        0
K        0
ph       0
label    0
dtype: int64

In [7]:
soil_df["label"].value_counts()

label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64

In [8]:
soil_df["label"].value_counts().count()

np.int64(22)

In [9]:
# STEP 2: Load NASA Weather CSVs
# These CSVs were downloaded separately
df_temp = pd.read_csv("T2M.csv", skiprows=9)
df_humid = pd.read_csv("RH2M.csv", skiprows=9)
df_rain = pd.read_csv("PREC.csv", skiprows=9)

# Rename parameter columns for clarity
df_temp.rename(columns={"T2M": "temperature"}, inplace=True)
df_humid.rename(columns={"RH2M": "humidity"}, inplace=True)
df_rain.rename(columns={"PRECTOTCORR": "rainfall"}, inplace=True)

In [10]:
# STEP 3: Merge Weather Data based on LAT, LON, YEAR, DOY
weather = df_temp.merge(df_humid, on=["LAT", "LON", "YEAR", "DOY"])
weather = weather.merge(df_rain, on=["LAT", "LON", "YEAR", "DOY"])

# Optional: Convert DOY (day of year) to full date
weather["date"] = pd.to_datetime(weather["YEAR"].astype(str), format="%Y") + pd.to_timedelta(weather["DOY"] - 1, unit='D')

# Keep only useful columns
weather = weather[["LAT", "LON", "date", "temperature", "humidity", "rainfall"]]

In [11]:
weather.head()

Unnamed: 0,LAT,LON,date,temperature,humidity,rainfall
0,19.0,83.125,2024-06-01,30.45,65.49,6.81
1,19.0,83.75,2024-06-01,30.83,73.67,10.08
2,19.0,84.375,2024-06-01,29.87,81.79,12.27
3,19.0,85.0,2024-06-01,30.51,85.79,11.11
4,19.0,85.625,2024-06-01,30.36,87.74,10.74


In [12]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90155 entries, 0 to 90154
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   LAT          90155 non-null  float64       
 1   LON          90155 non-null  float64       
 2   date         90155 non-null  datetime64[ns]
 3   temperature  90155 non-null  float64       
 4   humidity     90155 non-null  float64       
 5   rainfall     90155 non-null  float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 4.1 MB


In [13]:
weather.describe()

Unnamed: 0,LAT,LON,date,temperature,humidity,rainfall
count,90155.0,90155.0,90155,90155.0,90155.0,90155.0
mean,23.5,86.875,2024-11-30 00:00:00.000000256,24.331838,71.162525,5.358899
min,19.0,83.125,2024-06-01 00:00:00,-15.92,7.22,0.0
25%,21.0,85.0,2024-08-31 00:00:00,20.7,61.01,0.0
50%,23.5,86.875,2024-11-30 00:00:00,26.73,77.15,0.48
75%,26.0,88.75,2025-03-01 00:00:00,29.12,85.49,5.3
max,28.0,90.625,2025-05-31 00:00:00,41.86,98.98,541.41
std,2.738628,2.338549,,7.444185,18.431963,13.225568


In [14]:
weather.isnull().sum()

LAT            0
LON            0
date           0
temperature    0
humidity       0
rainfall       0
dtype: int64

In [15]:
# STEP 4: Assign Random Weather Samples to Soil Data
sampled_weather = weather.sample(n=len(soil_df), replace=True, random_state=42).reset_index(drop=True)
combined_df = pd.concat([soil_df.reset_index(drop=True), sampled_weather[["temperature", "humidity", "rainfall"]]], axis=1)

In [16]:
combined_df.head()

Unnamed: 0,N,P,K,ph,label,temperature,humidity,rainfall
0,90,42,43,6.502985,rice,23.6,85.42,6.81
1,85,58,41,7.038096,rice,33.34,39.59,4.66
2,60,55,44,7.840207,rice,28.51,76.65,1.99
3,74,35,40,6.980401,rice,16.3,54.0,0.0
4,78,42,42,7.628473,rice,29.54,84.73,18.46


In [17]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   ph           2200 non-null   float64
 4   label        2200 non-null   object 
 5   temperature  2200 non-null   float64
 6   humidity     2200 non-null   float64
 7   rainfall     2200 non-null   float64
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [18]:
combined_df.describe()

Unnamed: 0,N,P,K,ph,temperature,humidity,rainfall
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,6.46948,24.280823,71.501009,5.034336
std,36.917334,32.985883,50.647931,0.773938,7.502983,17.775965,11.226611
min,0.0,5.0,5.0,3.504752,-9.64,8.63,0.0
25%,21.0,28.0,20.0,5.971693,20.645,61.5,0.0
50%,37.0,51.0,32.0,6.425045,26.75,77.485,0.59
75%,84.25,68.0,49.0,6.923643,29.12,85.0425,5.0125
max,140.0,145.0,205.0,9.935091,41.57,97.55,159.89


In [19]:
combined_df.isnull().sum()

N              0
P              0
K              0
ph             0
label          0
temperature    0
humidity       0
rainfall       0
dtype: int64

In [20]:
# STEP 5: Split into Train_Test_Split
X = combined_df[["N", "P", "K", "ph", "temperature", "humidity", "rainfall"]]
y = combined_df["label"]

X_train, X_test, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# STEP 6: Encode Labels
le = LabelEncoder()
y_train = le.fit_transform(y_train_raw)
y_test = le.transform(y_test_raw)

# Save the encoder
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [22]:
# STEP 7: Model selection
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [23]:
# STEP 8: Evaluate
y_pred = model.predict(X_test)
print("✅ Model Accuracy:", accuracy_score(y_test, y_pred))

✅ Model Accuracy: 0.7886363636363637


In [25]:
# STEP 9: Save Model
joblib.dump(model, "crop_model.pkl")

['crop_model.pkl']