In [37]:
import pandas as pd
import numpy as np
import os
from pprint import pprint

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, cross_val_score
import matplotlib.pyplot as plt


In [62]:
BASE_PATH = '/media/Education/BTech/S8/Projects/Project/Dataset'
DATA_FILE = 'dataset.csv'

FILE_PATH = os.path.join(BASE_PATH, DATA_FILE)

In [65]:
leaf_data = pd.read_csv(FILE_PATH, index_col=0)

train_set, test_set = train_test_split(leaf_data, test_size=0.2, random_state=42)
leaf_data.head()

Unnamed: 0,leafid,area,perimeter,physiological_length,physiological_width,aspect_ratio,rectangularity,circularity,mean_r,mean_g,mean_b,stddev_r,stddev_g,stddev_b,contrast,correlation,inverse_difference_moments,entropy,label
0,IMG_20200602_180156.jpg,4476798.0,17821.159711,2340,4160,0.5625,2.174411,70.942163,134.70294,115.104634,108.986437,26.107629,23.28682,23.583072,0.512578,0.999557,0.87279,7.514961,Tulsi
0,IMG_20200602_182942.jpg,4212655.5,16891.073488,4160,2340,1.777778,2.310751,67.726488,124.878271,107.684257,102.018447,31.813694,27.871602,27.094209,0.605756,0.999637,0.851295,7.954006,Tulsi
0,IMG_20200602_181446.jpg,408361.5,2896.833816,531,1161,0.457364,1.50967,20.549553,131.702823,112.889681,107.191281,24.42212,21.293975,22.341319,0.673855,0.999317,0.871159,7.414461,Tulsi
0,IMG_20200602_175538.jpg,4593147.5,17851.544487,2340,4160,0.5625,2.119331,69.381103,134.650933,114.61455,108.711329,24.456428,22.039913,21.664648,0.61249,0.999403,0.876848,7.425606,Tulsi
0,IMG_20200602_175626.jpg,4596812.0,18057.586118,2340,4160,0.5625,2.117642,70.935339,132.459199,113.308073,107.696607,23.681647,21.342995,21.012053,0.779036,0.999189,0.855932,7.547321,Tulsi


In [66]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        scaler = StandardScaler()
        return scaler.fit_transform(X[self.attribute_names])

In [67]:
to_plot, _ = train_test_split(train_set,test_size=0.7, random_state=42)

In [68]:
preprocess_pipeline = Pipeline([
        ("select_features", DataFrameSelector(["area","perimeter","physiological_width","physiological_length","rectangularity","aspect_ratio","circularity","contrast","correlation","mean_r","mean_g","mean_b","stddev_r","stddev_g","stddev_b","entropy","inverse_difference_moments"])),
])

In [69]:
X_train = preprocess_pipeline.fit_transform(train_set)
y_train = np.array(train_set["label"])

In [126]:
param_grid = [{'C': [0.1,1,10,100,1000], 'kernel': ['rbf'], 'gamma':[1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5]}, 
              {'C': [0.1,1,10,100,1000], 'kernel': ['rbf']}]
clf = GridSearchCV(SVC(), param_grid, cv=5)
clf = clf.fit(X_train, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Best estimator found by grid search:
SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [127]:
svm_clf = clf

In [128]:
X_test = preprocess_pipeline.transform(test_set)
y_test = np.array(test_set["label"])
y_pred = svm_clf.predict(X_test)
print(y_pred)

['Tulsi' 'Chilli' 'Mango' 'Acasia' 'Acasia' 'Hibiscus' 'Acasia' 'Tulsi'
 'Guava' 'Guava' 'Acasia' 'Guava' 'Guava' 'Chilli' 'Hibiscus' 'Guava'
 'Guava' 'Hibiscus' 'Chilli' 'Mango' 'Guava' 'Acasia' 'Tulsi' 'Tulsi'
 'Guava' 'Tulsi' 'Tulsi' 'Guava' 'Chilli' 'Tulsi' 'Chilli' 'Acasia'
 'Hibiscus' 'Guava' 'Acasia' 'Tulsi' 'Guava' 'Mango' 'Chilli' 'Chilli'
 'Tulsi' 'Chilli' 'Guava' 'Tulsi' 'Mango' 'Hibiscus' 'Acasia' 'Hibiscus'
 'Chilli' 'Hibiscus' 'Tulsi' 'Hibiscus' 'Guava' 'Guava' 'Chilli']


In [129]:
print(X_test[0])

[-1.57246642 -1.38044218 -1.34559352 -1.35842686 -0.66955039 -0.60840023
 -0.62991673 -0.92350688 -0.86730013  0.14123855  0.2499897   0.46491468
 -1.24313361 -1.13465481 -1.1157185  -1.30042881  0.11221583]


In [130]:
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.9541125541125541

In [75]:
accuracy = svm_clf.score(X_test,y_test)
cm = confusion_matrix(y_test, y_pred)
cm


array([[ 6,  0,  0,  0,  0,  0],
       [ 1, 10,  0,  0,  0,  0],
       [ 0,  0, 14,  0,  0,  0],
       [ 0,  0,  0,  8,  0,  0],
       [ 1,  0,  0,  0,  4,  0],
       [ 0,  0,  0,  0,  0, 11]])

In [76]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Acasia       0.75      1.00      0.86         6
      Chilli       1.00      0.91      0.95        11
       Guava       1.00      1.00      1.00        14
    Hibiscus       1.00      1.00      1.00         8
       Mango       1.00      0.80      0.89         5
       Tulsi       1.00      1.00      1.00        11

    accuracy                           0.96        55
   macro avg       0.96      0.95      0.95        55
weighted avg       0.97      0.96      0.96        55



In [77]:
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)

forest_pred = forest_clf.predict(X_test)
forest_scores.mean()

0.9497835497835497

In [78]:
cm = confusion_matrix(y_test, forest_pred)
cm

array([[ 6,  0,  0,  0,  0,  0],
       [ 0, 10,  0,  0,  0,  1],
       [ 0,  0, 13,  1,  0,  0],
       [ 0,  0,  0,  8,  0,  0],
       [ 2,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0, 11]])

In [79]:
kmeans = KMeans(n_clusters=15, random_state=0).fit(X_train)
k_pred = kmeans.predict(X_test)
print('kprred')
k_pred


kprred


array([14,  6, 12,  4,  5,  8,  9, 13,  1,  0,  9, 11, 11,  6,  8,  1, 11,
        8,  2,  5, 11,  4, 14, 14, 11,  3,  3,  0,  2, 13,  7,  9,  1,  1,
       12, 13,  1,  5,  6,  2,  7,  2,  1,  7,  0,  0,  2, 11, 10,  8, 14,
        8, 11,  1,  2], dtype=int32)

In [80]:
score = metrics.accuracy_score(y_test,kmeans.predict(X_test))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.000000


In [81]:
forest_pred = forest_clf.predict(X_test)
forest_pred

array(['Tulsi', 'Chilli', 'Acasia', 'Acasia', 'Acasia', 'Hibiscus',
       'Acasia', 'Tulsi', 'Guava', 'Guava', 'Acasia', 'Guava', 'Hibiscus',
       'Chilli', 'Hibiscus', 'Guava', 'Guava', 'Hibiscus', 'Chilli',
       'Mango', 'Guava', 'Acasia', 'Tulsi', 'Tulsi', 'Guava', 'Tulsi',
       'Tulsi', 'Guava', 'Chilli', 'Tulsi', 'Tulsi', 'Acasia', 'Hibiscus',
       'Guava', 'Acasia', 'Tulsi', 'Guava', 'Mango', 'Chilli', 'Chilli',
       'Tulsi', 'Chilli', 'Guava', 'Tulsi', 'Mango', 'Hibiscus', 'Chilli',
       'Hibiscus', 'Chilli', 'Hibiscus', 'Tulsi', 'Hibiscus', 'Guava',
       'Guava', 'Chilli'], dtype=object)

In [82]:
print('y_test')
pprint(y_test)

y_test
array(['Tulsi', 'Chilli', 'Mango', 'Acasia', 'Acasia', 'Hibiscus',
       'Acasia', 'Tulsi', 'Guava', 'Guava', 'Acasia', 'Guava', 'Guava',
       'Chilli', 'Hibiscus', 'Guava', 'Guava', 'Hibiscus', 'Chilli',
       'Mango', 'Guava', 'Acasia', 'Tulsi', 'Tulsi', 'Guava', 'Tulsi',
       'Tulsi', 'Guava', 'Chilli', 'Tulsi', 'Chilli', 'Mango', 'Hibiscus',
       'Guava', 'Acasia', 'Tulsi', 'Guava', 'Mango', 'Chilli', 'Chilli',
       'Tulsi', 'Chilli', 'Guava', 'Tulsi', 'Mango', 'Hibiscus', 'Chilli',
       'Hibiscus', 'Chilli', 'Hibiscus', 'Tulsi', 'Hibiscus', 'Guava',
       'Guava', 'Chilli'], dtype=object)


In [83]:
from sklearn.neighbors import KNeighborsClassifier 

In [119]:
knn = KNeighborsClassifier(n_neighbors = 1).fit(X_train, y_train) 

In [120]:
accuracy = knn.score(X_test, y_test) 
print (accuracy)

0.9454545454545454


In [86]:
knn_predictions = knn.predict(X_test)  
cm = confusion_matrix(y_test, knn_predictions)
cm

array([[ 4,  0,  0,  0,  1,  1],
       [ 0, 10,  0,  0,  1,  0],
       [ 0,  0, 13,  1,  0,  0],
       [ 0,  0,  0,  8,  0,  0],
       [ 2,  0,  0,  0,  3,  0],
       [ 2,  3,  0,  0,  0,  6]])

In [87]:
from sklearn.naive_bayes import GaussianNB

In [88]:
gnb = GaussianNB().fit(X_train, y_train) 
gnb_predictions = gnb.predict(X_test)

In [89]:
accuracy = gnb.score(X_test, y_test) 
print(accuracy)

0.8545454545454545


In [90]:
cm = confusion_matrix(y_test, gnb_predictions) 
cm

array([[ 5,  0,  0,  0,  1,  0],
       [ 3,  6,  0,  0,  0,  2],
       [ 0,  0, 14,  0,  0,  0],
       [ 0,  0,  0,  8,  0,  0],
       [ 2,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0, 11]])

In [131]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [100, 200, 300]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [132]:
rf_random = RandomizedSearchCV(estimator = forest_clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [93]:
best_forest = rf_random.best_estimator_

In [94]:
best_forest.fit(X_train, y_train)
forest_scores = cross_val_score(best_forest, X_train, y_train, cv=10)

forest_pred = best_forest.predict(X_test)
forest_scores.mean()


0.9497835497835497

In [None]:
from joblib import dump
dump(best_forest, 'model.joblib')

cm = confusion_matrix(y_test, forest_pred)
cm

In [95]:
from sklearn.metrics import f1_score, precision_score, recall_score
f1_score(y_test, forest_pred, average="macro")

0.8880018934366759

In [96]:
precision_score(y_test, forest_pred, average="macro")

0.9111111111111111

In [97]:
recall_score(y_test, forest_pred, average="macro")

0.8943722943722944