In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the crop-yield dataset from the working directory and preview the first rows.
DATA_PATH = 'crop_yield.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [3]:
# Column dtypes and non-null counts; the captured output shows all 10 columns fully populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [4]:
# Distinct Season labels. NOTE: the raw values are padded with trailing spaces
# (e.g. 'Kharif     '), so exact-match lookups must use the padded form.
df['Season'].unique()

array(['Whole Year ', 'Kharif     ', 'Rabi       ', 'Autumn     ',
       'Summer     ', 'Winter     '], dtype=object)

In [5]:
# Distinct State labels present in the dataset.
df['State'].unique()

array(['Assam', 'Karnataka', 'Kerala', 'Meghalaya', 'West Bengal',
       'Puducherry', 'Goa', 'Andhra Pradesh', 'Tamil Nadu', 'Odisha',
       'Bihar', 'Gujarat', 'Madhya Pradesh', 'Maharashtra', 'Mizoram',
       'Punjab', 'Uttar Pradesh', 'Haryana', 'Himachal Pradesh',
       'Tripura', 'Nagaland', 'Chhattisgarh', 'Uttarakhand', 'Jharkhand',
       'Delhi', 'Manipur', 'Jammu and Kashmir', 'Telangana',
       'Arunachal Pradesh', 'Sikkim'], dtype=object)

In [6]:
# Distinct Crop labels. NOTE: some names carry stray whitespace
# (e.g. 'Coconut ' with a trailing space, 'Other  Rabi pulses' with a double space).
df['Crop'].unique()

array(['Arecanut', 'Arhar/Tur', 'Castor seed', 'Coconut ', 'Cotton(lint)',
       'Dry chillies', 'Gram', 'Jute', 'Linseed', 'Maize', 'Mesta',
       'Niger seed', 'Onion', 'Other  Rabi pulses', 'Potato',
       'Rapeseed &Mustard', 'Rice', 'Sesamum', 'Small millets',
       'Sugarcane', 'Sweet potato', 'Tapioca', 'Tobacco', 'Turmeric',
       'Wheat', 'Bajra', 'Black pepper', 'Cardamom', 'Coriander',
       'Garlic', 'Ginger', 'Groundnut', 'Horse-gram', 'Jowar', 'Ragi',
       'Cashewnut', 'Banana', 'Soyabean', 'Barley', 'Khesari', 'Masoor',
       'Moong(Green Gram)', 'Other Kharif pulses', 'Safflower',
       'Sannhamp', 'Sunflower', 'Urad', 'Peas & beans (Pulses)',
       'other oilseeds', 'Other Cereals', 'Cowpea(Lobia)',
       'Oilseeds total', 'Guar seed', 'Other Summer Pulses', 'Moth'],
      dtype=object)

In [7]:
# Number of rows per crop, most frequent first.
df['Crop'].value_counts()

Crop
Rice                     1197
Maize                     975
Moong(Green Gram)         740
Urad                      733
Groundnut                 725
Sesamum                   685
Potato                    628
Sugarcane                 605
Wheat                     545
Rapeseed &Mustard         528
Bajra                     524
Jowar                     513
Arhar/Tur                 508
Ragi                      498
Gram                      490
Small millets             485
Cotton(lint)              476
Onion                     454
Sunflower                 441
Dry chillies              419
Other Kharif pulses       382
Horse-gram                371
Peas & beans (Pulses)     369
Tobacco                   364
Other  Rabi pulses        355
Soyabean                  349
Turmeric                  337
Masoor                    324
Ginger                    323
Linseed                   308
Castor seed               300
Barley                    297
Sweet potato              273
Garli

In [8]:
# Remove every row whose crop is one of the three "Other ... pulses" categories.
# The names must match the raw data exactly (note the double space in 'Other  Rabi pulses').
# NOTE(review): presumably dropped because they are catch-all groupings rather than
# single crops; 'other oilseeds', 'Other Cereals' and 'Oilseeds total' are kept - confirm intent.
excluded_crops = ['Other Summer Pulses', 'Other  Rabi pulses', 'Other Kharif pulses']
rows_to_remove = df.index[df['Crop'].isin(excluded_crops)]
# In-place drop: the original row labels of the surviving rows are kept (no index reset).
df.drop(index=rows_to_remove, inplace=True)

In [9]:
# Re-list the crop labels to check that the three "Other ... pulses" categories are gone.
df['Crop'].unique()

array(['Arecanut', 'Arhar/Tur', 'Castor seed', 'Coconut ', 'Cotton(lint)',
       'Dry chillies', 'Gram', 'Jute', 'Linseed', 'Maize', 'Mesta',
       'Niger seed', 'Onion', 'Potato', 'Rapeseed &Mustard', 'Rice',
       'Sesamum', 'Small millets', 'Sugarcane', 'Sweet potato', 'Tapioca',
       'Tobacco', 'Turmeric', 'Wheat', 'Bajra', 'Black pepper',
       'Cardamom', 'Coriander', 'Garlic', 'Ginger', 'Groundnut',
       'Horse-gram', 'Jowar', 'Ragi', 'Cashewnut', 'Banana', 'Soyabean',
       'Barley', 'Khesari', 'Masoor', 'Moong(Green Gram)', 'Safflower',
       'Sannhamp', 'Sunflower', 'Urad', 'Peas & beans (Pulses)',
       'other oilseeds', 'Other Cereals', 'Cowpea(Lobia)',
       'Oilseeds total', 'Guar seed', 'Moth'], dtype=object)

In [10]:
# `size` is the total number of cells (rows x columns), not the row count;
# use `df.shape` to see the two dimensions separately.
df.size

189420

In [11]:
# Summary statistics for the numeric columns. In the captured output Yield is heavily
# right-skewed (median about 1.07, maximum 21105).
df.describe()

Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,18942.0,18942.0,18942.0,18942.0,18942.0,18942.0,18942.0
mean,2009.117992,185794.2,17083490.0,1432.762949,24908210.0,50455.87,83.077903
std,6.498985,746263.0,268173300.0,816.626293,96688370.0,217234.8,895.314504
min,1997.0,0.5,0.0,301.3,54.17,0.09,0.0
25%,2004.0,1418.25,1465.0,935.6,191685.3,363.73,0.606104
50%,2010.0,9838.0,15399.0,1246.2,1301196.0,2541.22,1.065486
75%,2015.0,79377.25,132471.5,1635.9,10533920.0,20954.53,2.5
max,2020.0,50808100.0,6326000000.0,6552.7,4835407000.0,15750510.0,21105.0


In [12]:
# Integer-encode the three categorical columns.
#
# `labels[0]`, `labels[1]` and `labels[2]` map each distinct Crop, Season and State value
# to an integer code assigned in order of first appearance in `df`.
# The codes come from `enumerate`, so the mapping always covers every distinct value.
# The previous version zipped against a fixed `range(100)`, which would silently stop
# after 100 categories and leave NaN codes for the rest.
#
# NOTE: the dictionary keys are the raw strings, including any padding
# (e.g. trailing spaces in Season values).
# NOTE: this cell is meant to run once on the raw string columns; running it again on the
# already-encoded frame rebuilds `labels` as an int -> int mapping and loses the names.
labels = [
    {value: code for code, value in enumerate(df[column].unique())}
    for column in ("Crop", "Season", "State")
]

df["Crop"] = df["Crop"].map(labels[0])
df["Season"] = df["Season"].map(labels[1])
df["State"] = df["State"].map(labels[2])

df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,0,1997,0,0,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,1,1997,1,0,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,2,1997,1,0,796.0,22,2051.4,75755.32,246.76,0.238333
3,3,1997,0,0,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,4,1997,1,0,1739.0,794,2051.4,165500.63,539.09,0.420909


In [13]:
# Min-max scale the numeric feature columns and pass the remaining columns through unchanged.
# The positions refer to the feature frame with 'Yield' dropped:
#   1=Crop_Year, 4=Area, 5=Production, 6=Annual_Rainfall, 7=Fertilizer, 8=Pesticide.
# Columns 0 (Crop), 2 (Season) and 3 (State) are the integer-coded categoricals and are not scaled.
# Positional indices mean the transformer depends on this exact column order.
scaled_positions = [1, 4, 5, 6, 7, 8]
minMax = MinMaxScaler()
ct = ColumnTransformer(
    transformers=[('MMS', minMax, scaled_positions)],
    remainder='passthrough',
)

In [14]:
# Features are every column except the target; the transformer is fitted on the full frame here.
# NOTE(review): the scaler is fitted before the train/test split, so test rows contribute to
# the min/max statistics - confirm this is acceptable.
# NOTE(review): 'Production' stays in the features, and Production / Area is close to Yield in
# the sample rows (e.g. 56708 / 73814 vs. 0.796) - possible target leakage; confirm that
# Production is known at prediction time.
feature_frame = df.drop(columns='Yield')
X = ct.fit_transform(feature_frame)
y = df['Yield']

In [21]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so the split, and therefore
# every metric reported below, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [22]:
# Fit a random forest with default hyper-parameters on the training split.
# The estimator is seeded so that repeated runs build the same forest.
# `fit` returns the estimator itself, so `model` and `rf` refer to the same fitted object.
rf = RandomForestRegressor(random_state=42)
model = rf.fit(X_train, y_train)

In [23]:
# Predict on both splits; `y_pred` holds the test predictions, `y_pred_train` the training ones.
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# The first MSE / r2 pair printed is for the held-out test split, the second for the
# training split (the printed labels do not say which is which).
for actual, predicted in ((y_test, y_pred), (y_train, y_pred_train)):
    print("MSE: ", mean_squared_error(actual, predicted))
    print("r2 score: ", r2_score(actual, predicted))

MSE:  16784.633776456376
r2 score:  0.98271312503144
MSE:  4370.756448697075
r2 score:  0.9942421718677432


In [25]:
import joblib

# Persist everything needed to score new rows:
#   - the fitted forest,
#   - the category -> integer code mappings for Crop, Season and State (in that order), which
#     were previously not saved even though the model was trained on those integer codes,
#   - the fitted column transformer.
# The transformer is dumped last so the cell's displayed result is unchanged.
joblib.dump(model, "model.jbl.lzma")
joblib.dump(labels, "labels.jbl.lzma")
joblib.dump(ct, "colTrans.jbl.lzma")

['colTrans.jbl.lzma']