<center><h1> Automatic Soil Testing Using AI </h1></center>
<br>
<br>

# Implementing Models 

### Reading processed data

In [565]:
import pandas as pd
import numpy as np
import pickle

pd.set_option('display.max_rows',200)


data = pd.read_csv('processed_soil_data.csv')


X, Y = data[data.columns[1:]], data['Vegetation Cover']


print(X[:10])
# Normalizing data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X, Y = scaler.fit_transform(X.values), scaler.fit_transform(Y.values.reshape(-1,1))


     NO3   NH4    P   K  SO4     B  Organic Matter   pH   Zn   Cu  Fe   Ca  \
0  23.00  2.25  101  32   12  0.71            0.96  7.8  1.7  0.3   4  6.6   
1   6.00  0.75   58  10    7  0.27            0.30  8.2  0.8  0.1   1  5.0   
2   7.25  1.00  115   8   10  0.56            0.62  7.9  1.4  0.2   2  5.2   
3  21.00  1.25  130  30   13  0.78            1.04  7.9  1.8  0.3   2  6.1   
4   1.75  0.50   17   4    6  0.25            0.10  8.8  0.3  0.1   1  4.7   
5   2.00  0.75   40   8    6  0.24            0.22  8.3  0.9  0.1   1  4.9   
6   1.50  0.25   21  12    5  0.20            0.08  8.7  0.3  0.1   1  5.7   
7  14.75  0.75   53  12    8  0.18            0.41  8.3  0.8  0.1   1  5.6   
8  12.75  2.00   72  15   10  0.64            0.45  8.0  1.2  0.2   2  5.6   
9  18.50  1.75   47  21    7  0.29            0.28  8.3  0.6  0.1   1  5.9   

    Mg    Na  
0  0.8  0.12  
1  0.5  0.07  
2  0.4  0.04  
3  0.7  0.10  
4  0.5  0.10  
5  0.5  0.09  
6  0.4  0.08  
7  0.4  0.07  
8  0.5

## Implementing ML models

### Useful functions to implement a general model

In [429]:
from sklearn.metrics import r2_score

def train(model, X, Y):
    model.fit(X,Y)
    return model

def print_metrics(model, X, Y):
    
    # predicted test data
    y_pred = model.predict(X)
    
    # Mean square Error
    mean_square_error = np.sum((y_pred-Y)**2)/len(y_pred)
    print('Mean Square Error : ', mean_square_error)
    
    # Root Mean Square Error
    print('Root Mean Square Error : ', mean_square_error**0.5)
    
    # Mean absolute Error
    print('Mean Absolute Error : ', np.abs(Y - y_pred).sum()/len(y_pred))
    
    # R2 Error
    print('R2 Score : ', r2_score(Y, y_pred))
    

### Spliting data into train and test set

In [430]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.10, random_state=43)


## Implement Linear Model

In [558]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

linearModel = LinearRegression()

# training
linearModel = train(linearModel,X_train,Y_train)

#print metrics
print_metrics(linearModel, X_test, Y_test)

#save model
pickle.dump(linearModel, open('Models/Linear Model', 'wb'))


Mean Square Error :  0.1080737308531992
Root Mean Square Error :  0.3287456932846409
Mean Absolute Error :  0.30683537447564435
R2 Score :  0.119984517017235


### Implement Lasso and Ridge

In [559]:
#Lasso
lasso = Lasso(alpha=0.6)

#training 
lasso = train(lasso,X_train,Y_train)

#print metrics
print('Lasso')
print_metrics(lasso, X_test, Y_test)

#save model
pickle.dump(Lasso, open('Models/Lasso', 'wb'))

print('\n')
#Ridge
ridge = Ridge(alpha=1)

#training
ridge = train(ridge,X_train,Y_train)

#print_metrics
print('Ridge')
print_metrics(ridge,X_test,Y_test)

#save model
pickle.dump(ridge, open('Models/Ridge', 'wb'))


Lasso
Mean Square Error :  1.309367160894724
Root Mean Square Error :  1.1442758237832014
Mean Absolute Error :  3.3318239869964
R2 Score :  -0.06618265641417254


Ridge
Mean Square Error :  0.10308474975935131
Root Mean Square Error :  0.3210681388106757
Mean Absolute Error :  0.2941749944200712
R2 Score :  0.16060845562131687


#### Here, Ridge seems to work better than Lasso and Simple linear model

## Implement SVR

In [560]:
from sklearn.svm import SVR


# linear Model
svr_linear = SVR(kernel='linear', C=200, epsilon=0.3)

# RBF kernel
svr_rbf = SVR(kernel='rbf', C=10, gamma=0.2)

# non-linear model
svr_poly = SVR(kernel='poly', degree=2, C=650, epsilon=0.3)


print('Kernel : Linear')
svr_linear = train(svr_linear,X_train,Y_train)
print_metrics(svr_linear, X_test, Y_test)

print('\n')
print('Kernel : RBF')
svr_rbf = train(svr_rbf,X_train,Y_train)
print_metrics(svr_rbf, X_test, Y_test)

print("\n")
print('Kernel : Poly')
svr_poly = train(svr_poly,X_train,Y_train)
print_metrics(svr_poly, X_test, Y_test)

#save model
pickle.dump(svr_linear, open('Models/SVR (Linear)', 'wb'))


#save model
pickle.dump(svr_poly, open('Models/SVR (poly)', 'wb'))


Kernel : Linear
Mean Square Error :  1.3703070747399122
Root Mean Square Error :  1.170601159550046
Mean Absolute Error :  3.2688004338889898
R2 Score :  0.317595250440739


Kernel : RBF
Mean Square Error :  2.265151800673897
Root Mean Square Error :  1.5050421258801685
Mean Absolute Error :  4.046706519001624
R2 Score :  -0.1549667925623004


Kernel : Poly
Mean Square Error :  1.3757893146053637
Root Mean Square Error :  1.1729404565472894
Mean Absolute Error :  3.211327987361842
R2 Score :  0.25895631954388965


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


#### Here, SVR with kernel as linear seems to work better than rbf kernel and poly kernel 

## Implement Decision Tree Regressor

In [561]:
from sklearn.tree import DecisionTreeRegressor

treeRegressor = DecisionTreeRegressor(criterion='friedman_mse', max_depth=5, min_samples_leaf=5)

#training
treeRegressor = train(treeRegressor, X_train, Y_train)

#print metrics
print_metrics(treeRegressor, X_test, Y_test)

#save model
pickle.dump(treeRegressor, open('Models/Decision Tree Regressor', 'wb'))


Mean Square Error :  1.9807459017032563
Root Mean Square Error :  1.4073897476190653
Mean Absolute Error :  3.6865421895724935
R2 Score :  0.5758089618179321


### Till now, best model that worked here is Decision Tree Regressor 

## Implement Random Forest Regressor

In [562]:
from sklearn.ensemble import RandomForestRegressor

forestRegressor = RandomForestRegressor(criterion='mae', max_depth=8, n_estimators=7, random_state=0)

#training 
forestRegressor = train(forestRegressor, X_train, Y_train.flatten())

#print metrics
print_metrics(forestRegressor, X_test, Y_test)

#save model
pickle.dump(forestRegressor, open('Models/Random Forest Regressor', 'wb'))


Mean Square Error :  1.5802489958334118
Root Mean Square Error :  1.2570795503202699
Mean Absolute Error :  3.529004329004329
R2 Score :  0.4116568905633121


### Random forest Regressor is betten than SVR and other linear models but works slightly worse than Decision tree regressor may be due to overfitting

## Implementing Neural Network

In [563]:
from sklearn.neural_network import MLPRegressor 

nnRegressor = MLPRegressor(hidden_layer_sizes=(20,10,4), #hidden layers
                           activation='relu',#activation function after each layer
                           learning_rate_init = 0.01,    #learning rate for optimization
                           max_iter=500,            #number of iteration for training loop
                           random_state=0)

#training 
nnRegressor = train(nnRegressor, X_train, Y_train.flatten())

#print metrics
print_metrics(nnRegressor, X_test, Y_test)

#save model
pickle.dump(nnRegressor, open('Models/Neural Network', 'wb'))


Mean Square Error :  1.7050807758649544
Root Mean Square Error :  1.3057874160310148
Mean Absolute Error :  3.526882915381562
R2 Score :  0.4101752509438138


### Neural Network is also betten than SVR and linear models but works slightly worse than Random forest regressor and Decision Tree Regressor

## Results

#### The given list of models worked differently, the order of their performance is given below (top-to-bottom is high-to-low)

<ol>
    <li> Decision Tree Regressor </li> <br>
    <li> Random Forest Regressor </li> <br>
    <li> Neural Network </li> <br>
    <li> SVR (Linear) </li> <br>
    <li> SVR (Poly) </li> <br>
    <li> Ridge </li> <br>
    <li> Linear Model </li> <br>
    <li> Lasso </li> <br>
</ol>