# Model Building

We are going to use ensemble learning algorithms to build the model. Before building and training the model, we need to separate the dependent variable and split the dataset into training and testing set.

In [1]:
##Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt 

In [2]:
## For ignoring warnings to view clean output
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
## Importing the dataset
df = pd.read_csv('cleaned.csv',header=0)

In [4]:
df.sample(10)

Unnamed: 0,Year,Location,Soil Type,Max_Temperature,Min_Teperature,Rainfall,Humidity,Yield Kg/ha
73,2019,Chefe Donsa,Black Soil,22.5,14.0,811.2,70.0,380.0
5365,2019,DZ-BS,Black Soil,30.0,13.0,811.2,46.0,1100.0
1910,2018,DZ-BS,Black Soil,26.5,12.0,543.8,70.0,2186.0
1490,2018,DZ-LS,Light Soil,29.5,12.5,543.8,62.0,2185.0
397,2019,Akaki,Black Soil,32.0,13.0,811.2,50.0,110.0
632,2019,DZ-BS,Black Soil,25.0,14.5,811.2,50.0,1840.0
4576,2019,Akaki,Black Soil,25.5,14.0,811.2,47.0,290.0
6330,2014,Chefe Donsa,Black Soil,25.0,12.5,559.8,72.0,254.0
4708,2019,DZ-BS,Black Soil,26.0,15.5,811.2,44.0,750.0
6765,2014,Chefe Donsa,Black Soil,23.5,12.0,559.8,53.0,655.0


In [5]:
df.drop(['Year'],axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,Location,Soil Type,Max_Temperature,Min_Teperature,Rainfall,Humidity,Yield Kg/ha
0,Chefe Donsa,Black Soil,31.0,13.0,811.2,68.0,980.0
1,Chefe Donsa,Black Soil,30.5,16.0,811.2,72.0,670.0
2,Chefe Donsa,Black Soil,29.5,14.0,811.2,68.0,730.0
3,Chefe Donsa,Black Soil,30.0,13.0,811.2,54.0,710.0
4,Chefe Donsa,Black Soil,31.0,12.0,811.2,70.0,780.0


In [7]:
df.shape

(8077, 7)

In [8]:
### Separating Independent and Dependent feature
X = df.iloc[:,:-1]
y = df.iloc[:, 6]

In [9]:
X.head()

Unnamed: 0,Location,Soil Type,Max_Temperature,Min_Teperature,Rainfall,Humidity
0,Chefe Donsa,Black Soil,31.0,13.0,811.2,68.0
1,Chefe Donsa,Black Soil,30.5,16.0,811.2,72.0
2,Chefe Donsa,Black Soil,29.5,14.0,811.2,68.0
3,Chefe Donsa,Black Soil,30.0,13.0,811.2,54.0
4,Chefe Donsa,Black Soil,31.0,12.0,811.2,70.0


In [10]:
X.shape

(8077, 6)

In [11]:
y.head()

0    980.0
1    670.0
2    730.0
3    710.0
4    780.0
Name: Yield Kg/ha, dtype: float64

### Data Transformation
#### Handling Categorical Variables - Creating Dummy Variables( Changing the categorical value to numeric(0,1))

In [12]:
# Shows the number of columns after creating dummy variables
pd.get_dummies(df,drop_first=True).shape

(8077, 12)

In [13]:
X = pd.get_dummies(X, drop_first=True)

In [14]:
X.sample(4)

Unnamed: 0,Max_Temperature,Min_Teperature,Rainfall,Humidity,Location_Alemtena,Location_Chefe Donsa,Location_DZ-BS,Location_DZ-LS,Location_Denkaka,Location_Minjar,Soil Type_Light Soil
4420,25.5,14.5,811.2,53.0,0,0,1,0,0,0,0
6401,26.5,13.0,559.8,45.0,0,0,1,0,0,0,0
2954,24.5,15.0,543.8,54.0,0,0,0,0,1,0,0
841,23.5,10.0,811.2,53.0,0,0,1,0,0,0,0


In [15]:
X.shape

(8077, 11)

### Standardization 
Before building the model we need to standardize the features

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [27]:
## shows the wheather the features are in standardized format or not
print("Mean of the dataset:", np.mean(X).round(8))
print("Standard deviation of the dataset: ", np.std(X).round(8))

Mean of the dataset: -0.0
Standard deviation of the dataset:  1.0


### Splitting the dataset into training and testing set

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
# print the shapes of our training and test set
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6461, 11)
(1616, 11)
(6461,)
(1616,)


### Model Building with Random Forest Regressor

In [30]:
from sklearn.ensemble import RandomForestRegressor
import time
start_time = time.time()
RF=RandomForestRegressor()
# feeding the training data into the model
RF.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 1.6235687732696533 sec


In [31]:
# predicting the values for x-test
predictions=RF.predict(X_test)

In [32]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MAE: 463.93614231199905
MSE: 448882.81443564955
RMSE: 669.9871748292272
