## Importing the libraries

In [283]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder


## Importing the dataset

In [284]:
# Read the raw soil table from disk; every later cell works on `df`.
soil_csv_path = 'data/Soil_data.csv'
df = pd.read_csv(soil_csv_path)

# Preview the first five rows to sanity-check the parse.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,SiteInfor,Country,Latitude,Longitude,Elevation,MAT,MAP,SamplingDepth,SamplingThickness,...,CoverCrop,CoverCropGroup,GrainCrop,GrainCropGroup,Fertilization_C,Fertilization_T,Conservation_Type,ControlDescription,Yield_C,Yield_T
0,0,New York,USA,40.71,-74.01,3.5,,,Not-available,Not-available,...,White_clover,Legume,Bean/Beet/Corn/Bean,MTT,,,CC,Fallow,,
1,1,New York,USA,40.71,-74.01,3.5,,,Not-available,Not-available,...,Vetch,Legume,Bean/Beet/Corn/Bean,MTT,,,CC,Fallow,,
2,2,New York,USA,40.71,-74.01,3.5,,,Not-available,Not-available,...,White_mustard,Brassica,Bean/Beet/Corn/Bean,MTT,,,CC,Fallow,4862.0,3990.0
3,3,New York,USA,40.71,-74.01,3.5,,,Not-available,Not-available,...,Alfalfa,Legume,Bean/Beet/Corn/Bean,MTT,,,CC,Fallow,,
4,4,New York,USA,40.71,-74.01,3.5,,,Not-available,Not-available,...,Ryegrass,Rye,Bean/Beet/Corn/Bean,MTT,,,CC,Fallow,,


In [285]:
# Summarise column names, non-null counts and dtypes to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4457 entries, 0 to 4456
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          4457 non-null   int64  
 1   SiteInfor           4457 non-null   object 
 2   Country             4457 non-null   object 
 3   Latitude            4457 non-null   float64
 4   Longitude           4457 non-null   float64
 5   Elevation           4457 non-null   float64
 6   MAT                 582 non-null    float64
 7   MAP                 2070 non-null   float64
 8   SamplingDepth       4038 non-null   object 
 9   SamplingThickness   3950 non-null   object 
 10  SandPerc            2194 non-null   float64
 11  SiltPerc            2194 non-null   float64
 12  Texture             3989 non-null   object 
 13  SoilpH              2223 non-null   float64
 14  CoverCrop           4457 non-null   object 
 15  CoverCropGroup      4457 non-null   object 
 16  GrainC

In [286]:
# The last two columns are the ones later sliced off as the regression targets;
# discard every row in which either of them is missing.
target_columns = list(df.columns[-2:])
df = df.dropna(subset=target_columns)

# Rows x columns remaining after the filter.
df.shape

(1958, 24)

In [287]:
# Columns removed from the modelling table (dropped in place on `df`).
unused_columns = [
    'MAT',
    'SiteInfor',
    'Country',
    'Fertilization_C',
    'Fertilization_T',
    'Unnamed: 0',
]
df.drop(columns=unused_columns, inplace=True)

# Re-check the remaining columns and their non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1958 entries, 2 to 4456
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Latitude            1958 non-null   float64
 1   Longitude           1958 non-null   float64
 2   Elevation           1958 non-null   float64
 3   MAP                 1124 non-null   float64
 4   SamplingDepth       1631 non-null   object 
 5   SamplingThickness   1623 non-null   object 
 6   SandPerc            1066 non-null   float64
 7   SiltPerc            1066 non-null   float64
 8   Texture             1663 non-null   object 
 9   SoilpH              1190 non-null   float64
 10  CoverCrop           1958 non-null   object 
 11  CoverCropGroup      1958 non-null   object 
 12  GrainCrop           1958 non-null   object 
 13  GrainCropGroup      1958 non-null   object 
 14  Conservation_Type   1958 non-null   object 
 15  ControlDescription  1958 non-null   object 
 16  Yield_C    

In [288]:
# Numeric columns whose gaps are filled with the column mean.
columns_to_fill = ['MAP','SandPerc','SiltPerc','SoilpH']
# Text columns that spell the "missing" marker as 'Not-available'.
columns_to_replace = ['SamplingDepth', 'SamplingThickness']

# Normalise the different spellings of the "missing" marker to 'Not_available'.
# The results are assigned back to `df` instead of calling
# `df[col].replace(..., inplace=True)`: that chained form operates on an
# intermediate Series, emits a FutureWarning in pandas 2.x and does not
# modify `df` at all once Copy-on-Write is enabled.
df[columns_to_replace] = df[columns_to_replace].replace('Not-available', 'Not_available')
df['Texture'] = df['Texture'].replace('NotAvailable', 'Not_available')

# Mean-impute the numeric columns; `fillna` aligns the Series of means by column name.
# NOTE: the means are computed over all rows, i.e. before the train/test split
# performed further down in this notebook.
df[columns_to_fill] = df[columns_to_fill].fillna(df[columns_to_fill].mean())

# Remaining gaps in the text columns get the same 'Not_available' marker.
text_columns = columns_to_replace + ['Texture']
df[text_columns] = df[text_columns].fillna('Not_available')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1958 entries, 2 to 4456
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Latitude            1958 non-null   float64
 1   Longitude           1958 non-null   float64
 2   Elevation           1958 non-null   float64
 3   MAP                 1958 non-null   float64
 4   SamplingDepth       1958 non-null   object 
 5   SamplingThickness   1958 non-null   object 
 6   SandPerc            1958 non-null   float64
 7   SiltPerc            1958 non-null   float64
 8   Texture             1958 non-null   object 
 9   SoilpH              1958 non-null   float64
 10  CoverCrop           1958 non-null   object 
 11  CoverCropGroup      1958 non-null   object 
 12  GrainCrop           1958 non-null   object 
 13  GrainCropGroup      1958 non-null   object 
 14  Conservation_Type   1958 non-null   object 
 15  ControlDescription  1958 non-null   object 
 16  Yield_C    

In [289]:
# Positional split of the frame: the trailing columns are the targets,
# everything before them is a feature.
n_targets = 2
X = df.iloc[:, :-n_targets].values
Y = df.iloc[:, -n_targets:].values

# Feature matrix shape (rows, feature columns).
print(X.shape)

(1958, 16)


## Encoding categorical data and scaling numeric features

In [290]:
# Positions, within the feature columns of `df`, of the text-valued features
# and of the numeric features.
columns_to_encode = [4,5,8,10,11,12,13,14,15]
columns_to_scale = [0,1,2,3,6,7,9]

numeric_transformer = MinMaxScaler()
# OrdinalEncoder is the encoder intended for feature columns (LabelEncoder is
# meant for target labels); it turns each column's categories into integer codes
# and can be used directly inside the ColumnTransformer.
categorical_transformer = OrdinalEncoder()

# Output column order: the scaled numeric columns first, then the encoded
# categorical columns (the same order the previous passthrough layout produced).
ct = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, columns_to_scale),
        ('cat', categorical_transformer, columns_to_encode),
    ],
    remainder='passthrough',
)

# Always transform the untouched feature columns taken from `df`, never the
# current `X`: the transformer reorders columns, so feeding its own output back
# in on a re-run of this cell would apply the indices above to the wrong columns.
# NOTE: the scaler and encoder are fitted on all rows, before the train/test split.
X = np.array(ct.fit_transform(df.iloc[:, :-2].values))
Y = np.array(Y)



In [291]:
# Shapes of the feature matrix and the target matrix, one per line.
print(X.shape, Y.shape, sep='\n')

(1958, 16)
(1958, 2)


## Splitting the dataset into training and test sets

In [292]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## Training the model on the training set

In [293]:
# Seed the forest so that Restart & Run All reproduces the same model and the
# same metrics; without a random_state every run trains a different forest.
base_regressor = RandomForestRegressor(random_state=0)

# MultiOutputRegressor fits one independent copy of the base regressor per target column.
multi_output_regressor = MultiOutputRegressor(base_regressor)
multi_output_regressor.fit(X_train, y_train)


## Predicting the test results

In [294]:
# Show two decimals when printing arrays (this changes NumPy's global print settings).
np.set_printoptions(precision=2)

# Predict both targets for the held-out rows.
y_pred = multi_output_regressor.predict(X_test)

# Side-by-side view: the two predicted columns followed by the two actual columns.
comparison = np.hstack((y_pred, y_test))
print(comparison)

[[2596.28 2777.68 2710.   3360.  ]
 [7997.   7797.4  7997.   7827.  ]
 [3483.43 5814.74 4700.   5400.  ]
 ...
 [1548.52 2257.77 1093.16 1600.  ]
 [ 208.64  689.46  221.    639.  ]
 [ 225.33  261.68  445.    551.  ]]


In [295]:
# Column 0 of the target arrays is Yield_C, column 1 is Yield_T.
actual_c, actual_t = y_test[:, 0], y_test[:, 1]
predicted_c, predicted_t = y_pred[:, 0], y_pred[:, 1]

# Mean squared error, computed separately for each target.
mse_yield_c = mean_squared_error(actual_c, predicted_c)
mse_yield_t = mean_squared_error(actual_t, predicted_t)

# Unweighted average of the two per-target errors.
overall_mse = (mse_yield_c + mse_yield_t) / 2

print(
    f"Yield_C - MSE: {mse_yield_c}",
    f"Yield_T - MSE: {mse_yield_t}",
    f"Overall MSE: {overall_mse}",
    sep="\n",
)

Yield_C - MSE: 23611937.992251813
Yield_T - MSE: 23337045.039268892
Overall MSE: 23474491.515760355


In [296]:
# Column 0 of the target arrays is Yield_C, column 1 is Yield_T.
observed_c, observed_t = y_test[:, 0], y_test[:, 1]
estimated_c, estimated_t = y_pred[:, 0], y_pred[:, 1]

# Coefficient of determination, computed separately for each target.
r2_yield_c = r2_score(observed_c, estimated_c)
r2_yield_t = r2_score(observed_t, estimated_t)

# Unweighted average of the two per-target scores.
overall_r2 = (r2_yield_c + r2_yield_t) / 2

print(
    f"Yield_C - R2: {r2_yield_c}",
    f"Yield_T - R2: {r2_yield_t}",
    f"Overall R2: {overall_r2}",
    sep="\n",
)

Yield_C - R2: 0.7917702388090418
Yield_T - R2: 0.8003593237196066
Overall R2: 0.7960647812643242
