# Soy disease evaluation
#### By Wagner Brito

In [74]:
# All necessary imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

Getting the dataset

In [2]:
soybean = pd.read_csv('soybean.csv')

In [3]:
soybean.head()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker


In [4]:
soybean.describe()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
count,683,683,683,683,683,683,683,683,683,683,...,683,683,683,683,683,683,683,683,683,683
unique,8,3,4,4,3,5,5,4,4,4,...,3,5,5,3,3,3,3,3,4,19
top,september,normal,gt-norm,norm,yes,same-lst-two-yrs,low-areas,pot-severe,none,80-89,...,absent,norm,absent,norm,absent,absent,norm,absent,norm,brown-spot
freq,149,354,459,374,435,219,227,322,305,213,...,625,407,345,476,524,513,532,539,551,92


In [15]:
# Check all dataset columns if have missing values
for c in soybean.columns:
    print(c,'- null total is',soybean[c].isnull().sum())
    #print()

date - null total is 0
plant-stand - null total is 0
precip - null total is 0
temp - null total is 0
hail - null total is 0
crop-hist - null total is 0
area-damaged - null total is 0
severity - null total is 0
seed-tmt - null total is 0
germination - null total is 0
plant-growth - null total is 0
leaves - null total is 0
leafspots-halo - null total is 0
leafspots-marg - null total is 0
leafspot-size - null total is 0
leaf-shread - null total is 0
leaf-malf - null total is 0
leaf-mild - null total is 0
stem - null total is 0
lodging - null total is 0
stem-cankers - null total is 0
canker-lesion - null total is 0
fruiting-bodies - null total is 0
external-decay - null total is 0
mycelium - null total is 0
int-discolor - null total is 0
sclerotia - null total is 0
fruit-pods - null total is 0
fruit-spots - null total is 0
seed - null total is 0
mold-growth - null total is 0
seed-discolor - null total is 0
seed-size - null total is 0
shriveling - null total is 0
roots - null total is 0
cla

According the specification that I receive the class have the name of disases

In [18]:
# Looking the disease count 
soybean['class'].value_counts()

brown-spot                     92
alternarialeaf-spot            91
frog-eye-leaf-spot             91
phytophthora-rot               88
anthracnose                    44
brown-stem-rot                 44
rhizoctonia-root-rot           20
powdery-mildew                 20
downy-mildew                   20
diaporthe-stem-canker          20
bacterial-pustule              20
purple-seed-stain              20
phyllosticta-leaf-spot         20
charcoal-rot                   20
bacterial-blight               20
2-4-d-injury                   16
diaporthe-pod-&-stem-blight    15
cyst-nematode                  14
herbicide-injury                8
Name: class, dtype: int64

In [25]:
# Change the attributes to numeric pattern 
previsores = soybean.iloc[:,0:35].values
classe = soybean.iloc[:,35].values

In [29]:
previsores[34]

array(['june', 'lt-normal', 'gt-norm', 'gt-norm', '?', 'same-lst-two-yrs',
       'low-areas', '?', '?', '?', 'abnorm', 'abnorm', '?', '?', '?', '?',
       '?', '?', 'abnorm', '?', 'above-soil', 'dk-brown-blk', '?',
       'absent', 'absent', 'none', 'absent', '?', '?', '?', '?', '?', '?',
       '?', 'rotted'], dtype=object)

Apparently has missing data in this dataset!

In [31]:
# Looking the values more closely
for c_name in soybean.columns:
    print(c_name,": ",soybean[c_name].unique())

date :  ['october' 'august' 'july' 'september' 'may' 'april' 'june' '?']
plant-stand :  ['normal' 'lt-normal' '?']
precip :  ['gt-norm' 'lt-norm' 'norm' '?']
temp :  ['norm' 'gt-norm' 'lt-norm' '?']
hail :  ['yes' 'no' '?']
crop-hist :  ['same-lst-yr' 'same-lst-two-yrs' 'same-lst-sev-yrs' 'diff-lst-year' '?']
area-damaged :  ['low-areas' 'scattered' 'whole-field' 'upper-areas' '?']
severity :  ['pot-severe' 'severe' '?' 'minor']
seed-tmt :  ['none' 'fungicide' '?' 'other']
germination :  ['90-100' '80-89' 'lt-80' '?']
plant-growth :  ['abnorm' 'norm' '?']
leaves :  ['abnorm' 'norm']
leafspots-halo :  ['absent' '?' 'no-yellow-halos' 'yellow-halos']
leafspots-marg :  ['dna' '?' 'w-s-marg' 'no-w-s-marg']
leafspot-size :  ['dna' '?' 'gt-1/8' 'lt-1/8']
leaf-shread :  ['absent' '?' 'present']
leaf-malf :  ['absent' '?' 'present']
leaf-mild :  ['absent' '?' 'upper-surf' 'lower-surf']
stem :  ['abnorm' 'norm' '?']
lodging :  ['no' 'yes' '?']
stem-cankers :  ['above-sec-nde' 'absent' 'below-soi

I will consider this '?' like unknown values just it

In [38]:
# Now change the attributes to numeric pattern using LabelEncoder
labelencoder = LabelEncoder()
for i in range(previsores.shape[1]):
    previsores[:,i] = labelencoder.fit_transform(previsores[:,i])

In [40]:
# Divisão da base de dados entre treinamento e teste (30% para testar e 70% para treinar)
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(previsores,
                                                                  classe,
                                                                  test_size = 0.3,
                                                                  random_state = 1)

#### Run attribute selector ExtraTreesClassifier

In [42]:
forest = ExtraTreesClassifier()
forest.fit(X_treinamento, y_treinamento)
importancias = forest.feature_importances_
importancias

array([0.04031678, 0.01714287, 0.03417743, 0.02591519, 0.02286531,
       0.0193398 , 0.0236472 , 0.02447941, 0.01956565, 0.01176654,
       0.0275621 , 0.02178092, 0.0380159 , 0.05529389, 0.05902085,
       0.02788785, 0.00993334, 0.026936  , 0.03683324, 0.01023657,
       0.03821308, 0.04491269, 0.03752514, 0.03382248, 0.00666818,
       0.04653053, 0.02056925, 0.0562368 , 0.04335014, 0.03052857,
       0.02385599, 0.0181879 , 0.01400766, 0.00894168, 0.02393303])

In [47]:
importancias[importancias > 0.035]

array([0.04031678, 0.0380159 , 0.05529389, 0.05902085, 0.03683324,
       0.03821308, 0.04491269, 0.03752514, 0.04653053, 0.0562368 ,
       0.04335014])

In [69]:
indices = np.where(importancias > 0.035)

In [61]:
indices[0]

array([ 0, 12, 13, 14, 18, 20, 21, 22, 25, 27, 28])

In [62]:
X_treinamento[:,indices[0]]

array([[3, 2, 3, ..., 3, 1, 3],
       [7, 2, 3, ..., 3, 1, 3],
       [6, 1, 1, ..., 3, 1, 2],
       ...,
       [3, 1, 1, ..., 2, 4, 4],
       [6, 2, 3, ..., 3, 4, 1],
       [3, 1, 1, ..., 3, 2, 4]], dtype=object)

#### Creating the first model using SVM

In [66]:
svm = SVC()
svm.fit(X_treinamento[:,indices[0]], y_treinamento)
previsoes = svm.predict(X_teste[:,indices[0]])
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_acerto

0.7853658536585366

In [67]:
# change the attribute revelvance to test
indices = np.where(importancias > 0.045)

In [68]:
svm = SVC()
svm.fit(X_treinamento[:,indices[0]], y_treinamento)
previsoes = svm.predict(X_teste[:,indices[0]])
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_acerto

0.6097560975609756

In [70]:
# Keep the first relevance rate
indices = np.where(importancias > 0.035)

In [78]:
svm = SVC()
svm.fit(X_treinamento, y_treinamento)
previsoes = svm.predict(X_teste)
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_acerto

0.8439024390243902

#### Creting the second model using Naive Bayes

In [72]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_treinamento[:,indices[0]], y_treinamento)
previsoes = naive_bayes.predict(X_teste[:,indices[0]])
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_acerto

0.7804878048780488

In [77]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_treinamento, y_treinamento)
previsoes = naive_bayes.predict(X_teste)
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_acerto

0.8439024390243902

#### Creting the third model using Ranom Forest 

In [75]:
floresta = RandomForestClassifier(n_estimators = 100)
floresta.fit(X_treinamento[:,indices[0]], y_treinamento)
previsoes = floresta.predict(X_teste[:,indices[0]])
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_acerto

0.8975609756097561

In [76]:
# Using all atributes
floresta = RandomForestClassifier(n_estimators = 100)
floresta.fit(X_treinamento, y_treinamento)
previsoes = floresta.predict(X_teste)
taxa_acerto = accuracy_score(y_teste, previsoes)
taxa_acerto

0.9414634146341463

### The best result was with Random Forest and all attributes