In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('crop_yield.csv')


In [34]:
df.sample(5)

Unnamed: 0,Crop,Season,State,Area,Production,Annual_Rainfall,Yield,Input_Per_Unit_Area,Year_Interval
5618,Cotton(lint),Kharif,Karnataka,713879.0,1398294,1419.1,1.862692,0.16255,2010s
8369,Gram,Rabi,Bihar,80322.0,78593,1353.9,0.918333,0.09922,2000s
14373,Horse-gram,Kharif,Madhya Pradesh,13000.0,3376,1000.7,0.258,0.15824,2010s
13359,Jute,Kharif,Nagaland,3030.0,5770,1350.9,1.902,0.14476,2010s
12001,Groundnut,Kharif,Bihar,812.0,829,1097.1,1.0075,0.16785,2010s


In [3]:
df.head(5)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [4]:
df['Season'].unique()

array(['Whole Year ', 'Kharif     ', 'Rabi       ', 'Autumn     ',
       'Summer     ', 'Winter     '], dtype=object)

In [5]:
df.shape

(19689, 10)

# Data Preprocessing

In [6]:
df=df[df['Crop_Year']!=2020]  

In [7]:
# removing spaces present in season column
df['Season'] = df['Season'].str.strip()

In [8]:
# Convert Fertilizer and pesticide columns from kgs into tons

df['Fertilizer'] = df['Fertilizer'].apply(lambda x: x/1000)
df['Fertilizer'] = df['Fertilizer'].apply(lambda x: np.round(x,3))

df['Pesticide'] = df['Pesticide'].apply(lambda x: x/1000)
df['Pesticide'] = df['Pesticide'].apply(lambda x: np.round(x,3))

In [9]:
df['Input_Per_Unit_Area'] = (df['Fertilizer'] + df['Pesticide']) / df['Area']

In [10]:
# dropping this row because it's a extreme outlier
df.drop(119, inplace=True)

In [11]:
df.drop(columns=['Fertilizer','Pesticide'], inplace=True)

In [12]:
# Categorize crop_year
bins = [1990, 2000, 2010, 2020]  # Example bins, adjust based on your data range
labels = ['90s', '2000s', '2010s']
df['Year_Interval'] = pd.cut(df['Crop_Year'], bins=bins, labels=labels)

In [13]:
df.drop(columns=['Crop_Year'],inplace=True)

In [14]:
df.shape

(19651, 9)

In [15]:
df.columns

Index(['Crop', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall',
       'Yield', 'Input_Per_Unit_Area', 'Year_Interval'],
      dtype='object')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19651 entries, 0 to 19688
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Crop                 19651 non-null  object  
 1   Season               19651 non-null  object  
 2   State                19651 non-null  object  
 3   Area                 19651 non-null  float64 
 4   Production           19651 non-null  int64   
 5   Annual_Rainfall      19651 non-null  float64 
 6   Yield                19651 non-null  float64 
 7   Input_Per_Unit_Area  19651 non-null  float64 
 8   Year_Interval        19651 non-null  category
dtypes: category(1), float64(4), int64(1), object(3)
memory usage: 1.4+ MB


In [17]:
df.describe()

Unnamed: 0,Area,Production,Annual_Rainfall,Yield,Input_Per_Unit_Area
count,19651.0,19651.0,19651.0,19651.0,19651.0
mean,180227.2,16467210.0,1437.967162,80.100282,0.137329
std,733500.0,263310100.0,817.676055,879.148542,0.026137
min,0.5,0.0,301.3,0.0,0.0948
25%,1396.0,1400.0,940.7,0.599372,0.10855
50%,9328.0,13830.0,1247.0,1.03,0.14476
75%,75238.0,123005.5,1643.7,2.390714,0.158
max,50808100.0,6326000000.0,6552.7,21105.0,0.1725


In [18]:
df['Year_Interval'].unique()

['90s', '2000s', '2010s']
Categories (3, object): ['90s' < '2000s' < '2010s']

In [19]:
df.columns

Index(['Crop', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall',
       'Yield', 'Input_Per_Unit_Area', 'Year_Interval'],
      dtype='object')

In [20]:
df.head(5)

Unnamed: 0,Crop,Season,State,Area,Production,Annual_Rainfall,Yield,Input_Per_Unit_Area,Year_Interval
0,Arecanut,Whole Year,Assam,73814.0,56708,2051.4,0.796087,0.09548,90s
1,Arhar/Tur,Kharif,Assam,6637.0,4685,2051.4,0.710435,0.09548,90s
2,Castor seed,Kharif,Assam,796.0,22,2051.4,0.238333,0.09548,90s
3,Coconut,Whole Year,Assam,19656.0,126905000,2051.4,5238.051739,0.09548,90s
4,Cotton(lint),Kharif,Assam,1739.0,794,2051.4,0.420909,0.09548,90s


In [21]:
num_col = ['Area','Production','Annual_Rainfall','Input_Per_Unit_Area']
ordinal_col = ['Season','Year_Interval']
nominal_col = ['Crop','State']
season_order = ['Winter','Summer','Autumn','Rabi','Kharif','Whole Year']
year_order = ['90s','2000s','2010s']


In [22]:
df['Season'].unique()

array(['Whole Year', 'Kharif', 'Rabi', 'Autumn', 'Summer', 'Winter'],
      dtype=object)

In [23]:
from sklearn.preprocessing import RobustScaler, PowerTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [24]:
X = df.drop(columns=['Yield'])
y = df[['Yield']]

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## data encoding

In [26]:
preprocessor = ColumnTransformer(transformers=[
    ('yeo_johnson_transform', PowerTransformer(method='yeo-johnson'), num_col),
    ('robust_scaler', RobustScaler(),num_col),
    ('Season_order', OrdinalEncoder(categories=[['Winter','Summer','Autumn','Rabi','Kharif','Whole Year']]), ['Season']),
    ('Year_order',OrdinalEncoder(categories=[['90s','2000s','2010s']]), ['Year_Interval']),
    ('OHE', OneHotEncoder(drop='first',handle_unknown='ignore'), nominal_col)
], remainder='passthrough')

# Model training

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression

In [28]:
# Feature selection using KBest method to reduce curse of dimensionality
kbest = SelectKBest(score_func=f_regression, k=40)

In [29]:
# After hyperparameter tuning best parameters are selected to reduce runtime and increse performance of model
rf = RandomForestRegressor(max_features=0.75, max_samples=0.75, n_estimators=400, n_jobs=-1)
rf1 = RandomForestRegressor()

In [30]:
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kbest', kbest),
    ('RF_regressor', rf1)
])

In [31]:
pipe.fit(X_train,y_train.values.ravel())

In [32]:
y_pred = pipe.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score, r2_score
r2_score(y_test,y_pred)

0.9261246549438904