# Crop yield Prediction

### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### load the dataset

In [2]:
data = pd.read_excel('crop_csv_file.xlsx')

In [3]:
data.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Temperature,humidity,soil moisture,area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,36,35,45,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,37,40,46,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,36,41,50,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,37,42,55,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,36,40,54,720.0,165.0


In [4]:
data.head()
data = data[:500]

In [5]:
data

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Temperature,humidity,soil moisture,area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,36,35,45,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,37,40,46,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,36,41,50,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,37,42,55,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,36,40,54,720.0,165.0
...,...,...,...,...,...,...,...,...,...,...
495,Andhra Pradesh,ANANTAPUR,2004,Kharif,Castor seed,36,35,45,1578.0,923.0
496,Andhra Pradesh,ANANTAPUR,2004,Kharif,Cotton(lint),37,40,46,8933.0,7041.0
497,Andhra Pradesh,ANANTAPUR,2004,Kharif,Dry chillies,36,41,50,3402.0,11288.0
498,Andhra Pradesh,ANANTAPUR,2004,Kharif,Groundnut,37,42,55,857823.0,684543.0


### Data Exploration

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   State_Name     500 non-null    object 
 1   District_Name  500 non-null    object 
 2   Crop_Year      500 non-null    int64  
 3   Season         500 non-null    object 
 4   Crop           500 non-null    object 
 5   Temperature    500 non-null    int64  
 6   humidity       500 non-null    int64  
 7   soil moisture  500 non-null    int64  
 8    area          500 non-null    float64
 9   Production     498 non-null    float64
dtypes: float64(2), int64(4), object(4)
memory usage: 39.2+ KB


In [7]:
#handling missing data
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 0 to 499
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   State_Name     498 non-null    object 
 1   District_Name  498 non-null    object 
 2   Crop_Year      498 non-null    int64  
 3   Season         498 non-null    object 
 4   Crop           498 non-null    object 
 5   Temperature    498 non-null    int64  
 6   humidity       498 non-null    int64  
 7   soil moisture  498 non-null    int64  
 8    area          498 non-null    float64
 9   Production     498 non-null    float64
dtypes: float64(2), int64(4), object(4)
memory usage: 42.8+ KB


In [8]:
data.describe()

Unnamed: 0,Crop_Year,Temperature,humidity,soil moisture,area,Production
count,498.0,498.0,498.0,498.0,498.0,498.0
mean,2002.405622,34.459839,44.714859,53.076305,16368.695582,1484588.0
std,3.43437,3.49639,6.654677,5.248187,93281.097184,8502200.0
min,1997.0,25.0,35.0,45.0,0.2,0.0
25%,2000.0,34.0,40.0,50.0,85.5,100.0
50%,2002.0,36.0,42.0,54.0,652.725,826.41
75%,2004.0,36.0,50.0,55.0,3391.0,10238.35
max,2010.0,37.0,55.0,62.0,857823.0,71300000.0


# Categorical data handling

In [9]:
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
le = LabelEncoder()

State_Name = le.fit_transform(data.State_Name)
District_Name = le.fit_transform(data.District_Name)
#Crop_Year = le.fit_transform(data.Crop_Year)
crop = le.fit_transform(data.Crop)
Season = le.fit_transform(data.Season)
data['State_Name'] = State_Name
data['District_Name'] = District_Name
data['Crop'] = crop
data['Season']  = Season

In [10]:
data.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Temperature,humidity,soil moisture,area,Production
0,0,1,2000,1,0,36,35,45,1254.0,2000.0
1,0,1,2000,1,32,37,40,46,2.0,1.0
2,0,1,2000,1,38,36,41,50,102.0,321.0
3,0,1,2000,3,3,37,42,55,176.0,641.0
4,0,1,2000,3,8,36,40,54,720.0,165.0


# Splitting the dataset for train and test

In [11]:
from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split

In [12]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [13]:
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.2,random_state=100)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score , classification_report, mean_squared_error, r2_score
forest = RandomForestRegressor(n_estimators=1000, 
                               criterion='mse', 
                               random_state=1, 
                               n_jobs=-1)
forest.fit(X_train, Y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(Y_train, y_train_pred),
        mean_squared_error(Y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(Y_train, y_train_pred),
        r2_score(Y_test, y_test_pred)))



MSE train: 728970984646.353, test: 26682633827305.480
R^2 train: 0.991, test: 0.498


In [15]:
print(forest.score(X_test,Y_test))

0.49778082627763265


In [16]:
forest.predict(X_test)

array([8.11403020e+02, 1.00810826e+04, 2.36741366e+03, 1.92515450e+04,
       1.13247600e+02, 1.14762250e+02, 3.07825429e+03, 3.62676330e+03,
       1.27188416e+05, 8.01591147e+05, 2.82478978e+03, 2.60175629e+05,
       6.55914774e+03, 9.93564116e+06, 1.98508238e+03, 2.24197826e+06,
       2.74462002e+03, 1.62484618e+03, 1.33776520e+04, 1.77714950e+02,
       1.28973246e+05, 8.93879130e+02, 2.46759142e+04, 2.46065080e+02,
       2.56263490e+04, 4.57280517e+04, 9.46904140e+02, 4.72281236e+03,
       3.51523075e+07, 2.94407400e+01, 1.67409200e+03, 1.77140640e+02,
       2.03993064e+03, 8.30137120e+02, 1.17031725e+04, 8.29772080e+02,
       6.92223147e+03, 4.29473832e+03, 6.40813701e+03, 1.60681532e+06,
       1.85955550e+03, 5.20850396e+03, 1.77522433e+04, 1.53728599e+04,
       8.45081880e+03, 2.72898316e+03, 5.55906910e+02, 5.46273850e+04,
       3.72148780e+07, 2.49019810e+03, 4.59277400e+01, 1.19240610e+03,
       2.83849763e+05, 8.63127760e+02, 4.99875180e+02, 1.30259960e+02,
      

In [17]:
X_test

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Temperature,humidity,soil moisture,area
105,0,2,2006,3,16,34,55,62,98.66
138,0,3,2002,3,44,37,42,55,455.00
142,0,3,2003,3,6,35,50,59,497.00
461,1,0,2003,2,19,36,41,50,15060.00
355,1,0,2001,2,1,36,40,54,5.00
...,...,...,...,...,...,...,...,...,...
268,1,0,1999,1,38,35,50,59,37991.00
307,1,0,2000,1,42,37,40,46,1647.00
58,0,1,2006,3,15,36,40,54,3.00
392,1,0,2002,1,28,34,45,52,776.00


In [18]:
forest.predict([[1,5,5,3,40,37,40,46,1359.0]])

array([79537.7254])

In [19]:
state = input('enter state:')
district = input('enter district:')
year = input('enter year:')
season = input('enter season:')
crop = input('enter crop:')
Temperature = input('enter Temperature')
humidity= input('enter humidity')
soilmoisture= input('enter soilmoisture')
area = input('enter area:13')

out_1 = forest.predict([[float(state),
       float(district),
       float(year),
       float(season),
       float(crop),
       float(Temperature),
       float(humidity),
       float(soilmoisture),
       float(area)]])
print(out_1)
print('crop yield Production:',out_1)


enter state:4
enter district:4
enter year:4
enter season:4
enter crop:4
enter Temperature4
enter humidity4
enter soilmoisture4
enter area:134
[10138.23862]
crop yield Production: [10138.23862]


# Fertilizer prediction

In [20]:
data_1 = pd.read_csv('ferlizer.csv')

In [21]:
data_1

Unnamed: 0,n,p,k,amt of n,amt of p,amt of k
0,3,5,6,64,50,60
1,1,4,2,40,46,30
2,5,1,5,93,16,32
3,3,1,3,63,20,39
4,4,6,6,87,37,39
...,...,...,...,...,...,...
3235,1,4,2,40,46,30
3236,5,1,5,93,16,32
3237,3,1,3,63,20,39
3238,4,6,6,87,37,39


In [22]:
n = data_1.iloc[:,0:1]
p = data_1.iloc[:,1:2]
k = data_1.iloc[:,2:3]
amt_n = data_1.iloc[:,3:4]
amt_p = data_1.iloc[:,4:5]
amt_k = data_1.iloc[:,5:6]

In [23]:
X_n_train , X_n_test , y_n_train , y_n_test = train_test_split(n,amt_n,test_size=0.2,random_state=100)

In [24]:
X_p_train , X_p_test , y_p_train , y_p_test = train_test_split(p,amt_p,test_size=0.2,random_state=100)

In [25]:
X_k_train , X_k_test , y_k_train , y_k_test = train_test_split(k,amt_k,test_size=0.2,random_state=100)

In [26]:
from sklearn.neural_network import MLPRegressor
clf_n = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf_n.fit(X_n_train, y_n_train)
y_n_pred = clf_n.predict(y_n_test)
clf_n.score(y_n_test,y_n_pred)

  y = column_or_1d(y, warn=True)
Feature names unseen at fit time:
- amt of n
Feature names seen at fit time, yet now missing:
- n

Feature names unseen at fit time:
- amt of n
Feature names seen at fit time, yet now missing:
- n



1.0

In [27]:
clf_p = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf_p.fit(X_p_train, y_p_train)
y_p_pred = clf_n.predict(y_p_test)
clf_p.score(y_p_test,y_p_pred)

  y = column_or_1d(y, warn=True)
Feature names unseen at fit time:
- amt of p
Feature names seen at fit time, yet now missing:
- n

Feature names unseen at fit time:
- amt of p
Feature names seen at fit time, yet now missing:
- p



-1.3101496439657598

In [29]:
clf_k = MLPRegressor(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf_k.fit(X_p_train, y_k_train)
y_k_pred = clf_n.predict(y_k_test)
clf_k.score(y_k_test,y_k_pred)

  y = column_or_1d(y, warn=True)
Feature names unseen at fit time:
- amt of k
Feature names seen at fit time, yet now missing:
- n

Feature names unseen at fit time:
- amt of k
Feature names seen at fit time, yet now missing:
- p



-8.612062745927059

In [31]:
n_i = float(input('enter nitrogen:'))
p_i = float(input('enter posporus:'))
k_i = float(input('enter pottasium:'))
p_n = clf_n.predict([[n_i]])
p_p = clf_p.predict([[p_i]])
p_k = clf_k.predict([[k_i]])
print('Amount of nitrogen Fertizer:',p_n)
print('Amount of posporus Fertizer:',p_p)
print('Amount of nitrogen Fertizer:',p_k)

enter nitrogen:45
enter posporus:45
enter pottasium:45
Amount of nitrogen Fertizer: [604.32031581]
Amount of posporus Fertizer: [268.33727992]
Amount of nitrogen Fertizer: [134.79519645]




In [32]:
state = {"Andaman and Nicobar Islands":0,}

# GUI

In [33]:
from tkinter import *
from tkinter import ttk

root = Tk()
root.title('Crop Yield and fertilizer Prediction System')
root.geometry('850x650')
root.configure(background="purple2")
var = StringVar()
label = Label( root, textvariable = var,font=('arial',20,'bold'),bd=20,background="purple2")
var.set('Crop Yield and fertilizer Prediction System')
label.grid(row=0,columnspan=6)



label_1 = ttk.Label(root, text ='state',font=("Helvetica", 16),background="Purple3")
label_1.grid(row=11,column=0)
    
Entry_1= Entry(root)
Entry_1.grid(row=11,column=1)

label_2 = ttk.Label(root, text ='district',font=("Helvetica", 16),background="Purple3")
label_2.grid(row=12,column=0)
    
Entry_2 = Entry(root)
Entry_2.grid(row=12,column=1)
    
    
label_3 = ttk.Label(root, text ='year',font=("Helvetica", 16),background="Purple3")
label_3.grid(row=13,column=0)
    
Entry_3 = Entry(root)
Entry_3.grid(row=13,column=1)

label_4 = ttk.Label(root, text ='season',font=("Helvetica", 16),background="Purple3")
label_4.grid(row=14,column=0)
    
Entry_4= Entry(root)
Entry_4.grid(row=14,column=1)

label_5 = ttk.Label(root, text ='crop',font=("Helvetica", 16),background="Purple3")
label_5.grid(row=15,column=0)
    
Entry_5 = Entry(root)
Entry_5.grid(row=15,column=1)
    
    
label_6 = ttk.Label(root, text ='Temperature',font=("Helvetica", 16),background="Purple3")
label_6.grid(row=16,column=0)
    
Entry_6 = Entry(root)
Entry_6.grid(row=16,column=1)

label_7 = ttk.Label(root, text ='humidity',font=("Helvetica", 16),background="Purple3")
label_7.grid(row=17,column=0)
    
Entry_7= Entry(root)
Entry_7.grid(row=17,column=1)

label_8 = ttk.Label(root, text ='soilmoisture',font=("Helvetica", 16),background="Purple3")
label_8.grid(row=18,column=0)

Entry_8 = Entry(root)
Entry_8.grid(row=18,column=1)
    
    
label_9 = ttk.Label(root, text ='area',font=("Helvetica", 16),background="Purple3")
label_9.grid(row=19,column=0)
    
Entry_9 = Entry(root)
Entry_9.grid(row=19,column=1)


def predict():
    state = Entry_1.get()
    district = Entry_2.get()
    year = Entry_3.get()
    season = Entry_4.get()
    crop = Entry_5.get()
    Temperature = Entry_6.get()
    humidity = Entry_7.get()
    soilmoisture = Entry_8.get()
    area = Entry_9.get()
    out = forest.predict([[float(state),
       float(district),
       float(year),
       float(season),
       float(crop),
       float(Temperature),
       float(humidity),
       float(soilmoisture),
       float(area)]])
    
    output.delete(0,END)
    output.insert(0,out[0])
   
        

b1 = Button(root, text = 'predict',font=("Helvetica", 16),background="Purple3",command = predict)
b1.grid(row=20,column=0)
    

output = Entry(root)
output.grid(row=20,column=1)
    
root.mainloop()



In [32]:
# from tkinter import *
from tkinter import ttk

root = Tk()
root.title('Crop Yield and fertilizer Prediction System')
root.geometry('850x650')
root.configure(background="purple2")
var = StringVar()
label = Label( root, textvariable = var,font=('arial',20,'bold'),bd=20,background="purple2")
var.set('Crop Yield and fertilizer Prediction System')
label.grid(row=0,columnspan=6)



label_1 = ttk.Label(root, text ='nitrogen',font=("Helvetica", 16),background="Purple3")
label_1.grid(row=11,column=0)
    
Entry_1= Entry(root)
Entry_1.grid(row=11,column=1)

label_2 = ttk.Label(root, text ='posporus',font=("Helvetica", 16),background="Purple3")
label_2.grid(row=12,column=0)
    
Entry_2 = Entry(root)
Entry_2.grid(row=12,column=1)
    
    
label_3 = ttk.Label(root, text ='pottasium',font=("Helvetica", 16),background="Purple3")
label_3.grid(row=13,column=0)
    
Entry_3 = Entry(root)
Entry_3.grid(row=13,column=1)




def predict():
    n_i = Entry_1.get()
    p_i = Entry_2.get()
    k_i = Entry_3.get()

    p_n = clf_n.predict([[float(n_i)]])
    p_p = clf_p.predict([[float(p_i)]])
    p_k = clf_k.predict([[float(k_i)]])
    print('Amount of nitrogen Fertizer required:',p_n)
    print('Amount of posporus Fertizer required:',p_p)
    print('Amount of nitrogen Fertizer required:',p_k)
    
    s = """Amount of nitrogen Fertizer required:{}\nAmount of posporus Fertizer required:{}\nAmount of nitrogen Fertizer required:{}""".format(p_n[0],p_p[0],p_k[0])
    
    output.delete('1.0',END)
    output.insert('1.0',s)
   
        

b1 = Button(root, text = 'predict',font=("Helvetica", 16),background="Purple3",command = predict)
b1.grid(row=20,column=0)
    

output = Text(root)
output.grid(row=20,column=1)
    
root.mainloop()

Amount of nitrogen Fertizer required: [94.22616724]
Amount of posporus Fertizer required: [364.29006435]
Amount of nitrogen Fertizer required: [194.35954069]
