In [1]:
import pandas as pd
import numpy as np
import os
from pprint import pprint

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [4]:
BASE_PATH = '/media/Education/BTech/S8/Projects/Project/Data'
DATA_FILE = 'LeafFeaturesFull.csv'

FILE_PATH = os.path.join(BASE_PATH, DATA_FILE)

In [5]:
leaf_data = pd.read_csv(FILE_PATH)

train_set, test_set = train_test_split(leaf_data, test_size=0.2, random_state=42)
leaf_data.head()

Unnamed: 0,leafid,area,perimeter,physiological_length,physiological_width,aspect_ratio,rectangularity,circularity,mean_r,mean_g,mean_b,stddev_r,stddev_g,stddev_b,contrast,correlation,inverse_difference_moments,entropy,label
0,l13nr041.tif,212434.0,2147.630716,507,787,0.644219,1.878273,21.711768,205.203949,198.449554,182.10636,75.035519,80.454618,107.95523,48.014415,0.996364,0.440747,8.533301,leaf13
1,l13nr039.tif,588228.5,3411.558769,938,1111,0.844284,1.771621,19.786075,178.481571,199.651422,149.900627,98.256304,80.485369,117.653037,41.486664,0.996943,0.456699,8.548801,leaf13
2,l13nr018.tif,273327.5,2322.324149,615,798,0.770677,1.795538,19.731602,194.291568,204.696105,154.159374,89.705615,76.966208,118.547042,40.70251,0.996787,0.464026,8.22802,leaf13
3,l13nr044.tif,217065.0,1986.625667,536,703,0.762447,1.735922,18.182026,189.931495,205.16995,141.331804,92.54496,76.927751,119.668945,44.818226,0.996362,0.464579,8.185278,leaf13
4,l13nr072.tif,456021.0,2956.723209,729,1061,0.687088,1.696126,19.170635,185.608041,202.900465,157.068775,93.326375,74.288455,115.498335,39.854965,0.996732,0.449347,8.695268,leaf13


In [6]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [8]:
to_plot, _ = train_test_split(train_set,test_size=0.7, random_state=42)
to_plot.to_csv('small_set.csv', index=False)

In [9]:
preprocess_pipeline = Pipeline([
        ("select_features", DataFrameSelector(["area","perimeter","physiological_width","physiological_length","rectangularity","aspect_ratio","circularity","contrast","correlation","mean_r","mean_g","mean_b","stddev_r","stddev_g","stddev_b","entropy","inverse_difference_moments"])),
])

In [10]:
X_train = preprocess_pipeline.fit_transform(train_set)
y_train = train_set["label"]

In [11]:
svm_clf = SVC(gamma=0.0001, C=100, decision_function_shape='ovo')
svm_clf.fit(X_train, y_train)

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
X_test = preprocess_pipeline.transform(test_set)
y_test = test_set["label"]
y_pred = svm_clf.predict(X_test)

In [27]:
print(X_test)

           area     perimeter  physiological_width  physiological_length  \
1089   694761.0   3932.266483                 1541                   824   
1103   470203.0   2783.316137                 1059                   669   
739   1985580.5  21782.323728                 2799                  1650   
140   1104910.0   6323.438049                 1808                   997   
1018   403509.0   2848.851059                  848                   752   
...         ...           ...                  ...                   ...   
67     593073.0   3347.825794                 1087                   946   
336    642351.5   5185.504073                 1770                   893   
429    831887.5   4194.872478                 1569                   840   
12     328377.0   2432.591172                  766                   720   
1034   529824.0   3103.904883                  941                   858   

      rectangularity  aspect_ratio  circularity   contrast  correlation  \
1089        

In [13]:
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.09222222222222223

In [12]:
accuracy = svm_clf.score(X_test,y_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 19],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0, 12],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  9],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0, 16],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 17],
       [ 0,  0,  0,  0,  0,  0,  0

In [14]:
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)

forest_pred = forest_clf.predict(X_test)
forest_scores.mean()

0.9177777777777779

In [15]:
cm = confusion_matrix(y_test, forest_pred)
cm

array([[16,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0],
       [ 0, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  0,  3, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 13,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 13,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0,  0,  0,  0,  0,  0,  0,  7,  0,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 16,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0, 16,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13,  0,  0],
       [ 0,  0,  0,  4,  0,  0,  0,  0,  0,  1,  0,  0,  0, 13,  0],
       [ 1,  0,  0,  0,  0,  0,  1

In [15]:
kmeans = KMeans(n_clusters=15, random_state=0).fit(X_train)
k_pred = kmeans.predict(X_test)
print('kprred')
k_pred


kprred


array([ 2,  4,  9,  7, 10, 10,  4, 10,  4,  5, 14, 10, 13, 10, 10,  1, 12,
       12,  2,  6,  4,  4,  4,  4,  8, 12,  4,  2,  2,  2,  6, 14, 10,  6,
       13,  0,  1, 14,  4,  6, 10,  0,  8, 14, 14,  8,  8, 12,  0, 14, 13,
       10,  0,  0,  0,  4,  6, 10, 13,  2,  4,  0,  0, 12,  7, 12,  4,  8,
        4,  8,  8,  8,  8,  1,  6,  6,  6, 12, 10, 14, 12, 14, 10, 12,  2,
        6, 14,  2, 12,  6,  8, 12,  1,  0, 14,  2,  2,  4,  2, 12, 13,  0,
       12,  7,  0, 14, 10,  5,  5,  4, 10, 13,  4,  4,  2,  0, 10, 14,  0,
        0,  4,  8,  0,  8, 12, 14,  2,  6,  7,  5,  4,  7, 13,  6,  8,  2,
       13,  0, 10, 12,  3, 14, 10,  1, 10,  4,  2,  8, 13,  2,  6, 14,  2,
        0,  7, 14, 12,  2,  4,  4,  2,  7,  9,  0,  6,  2, 12, 10, 10,  4,
        6,  8,  0,  6,  3, 14, 13,  2, 10, 13, 14,  0,  7,  4, 14,  4, 14,
       11,  0, 14,  2,  8,  9,  0, 14, 12,  2,  0, 10,  7, 14, 12, 12,  2,
        2, 12, 14, 10,  5,  0, 11,  8, 10,  8,  6,  8,  6,  2,  0,  8, 14,
       14,  8, 10,  4], d

In [16]:
score = metrics.accuracy_score(y_test,kmeans.predict(X_test))
print('Accuracy:{0:f}'.format(score))

Accuracy:0.000000


In [17]:
forest_pred = forest_clf.predict(X_test)
forest_pred

array(['leaf15', 'leaf15', 'leaf10', 'leaf14', 'leaf12', 'leaf6',
       'leaf11', 'leaf4', 'leaf15', 'leaf2', 'leaf1', 'leaf11', 'leaf2',
       'leaf4', 'leaf13', 'leaf1', 'leaf7', 'leaf5', 'leaf3', 'leaf12',
       'leaf1', 'leaf11', 'leaf11', 'leaf4', 'leaf1', 'leaf8', 'leaf12',
       'leaf13', 'leaf14', 'leaf8', 'leaf1', 'leaf15', 'leaf4', 'leaf14',
       'leaf10', 'leaf7', 'leaf1', 'leaf14', 'leaf11', 'leaf14', 'leaf11',
       'leaf4', 'leaf12', 'leaf12', 'leaf12', 'leaf2', 'leaf3', 'leaf5',
       'leaf13', 'leaf4', 'leaf10', 'leaf1', 'leaf11', 'leaf7', 'leaf15',
       'leaf4', 'leaf6', 'leaf6', 'leaf10', 'leaf6', 'leaf13', 'leaf4',
       'leaf7', 'leaf7', 'leaf9', 'leaf7', 'leaf11', 'leaf3', 'leaf14',
       'leaf3', 'leaf6', 'leaf14', 'leaf10', 'leaf2', 'leaf9', 'leaf9',
       'leaf12', 'leaf5', 'leaf12', 'leaf14', 'leaf5', 'leaf12', 'leaf1',
       'leaf5', 'leaf12', 'leaf12', 'leaf11', 'leaf6', 'leaf7', 'leaf9',
       'leaf1', 'leaf5', 'leaf1', 'leaf8', 'leaf8', 'leaf

In [18]:
print('y_test')
pprint(y_test)

y_test
1089    leaf15
1103    leaf15
739     leaf10
140     leaf14
1018    leaf12
         ...  
67      leaf13
336      leaf8
429      leaf1
12      leaf13
1034    leaf12
Name: label, Length: 225, dtype: object


In [19]:
from sklearn.neighbors import KNeighborsClassifier 

In [20]:
knn = KNeighborsClassifier(n_neighbors = 20).fit(X_train, y_train) 

In [21]:
accuracy = knn.score(X_test, y_test) 
print (accuracy)

0.26666666666666666


In [22]:
knn_predictions = knn.predict(X_test)  
cm = confusion_matrix(y_test, knn_predictions)
cm

array([[0, 1, 0, 1, 1, 4, 0, 1, 4, 0, 0, 0, 0, 1, 4],
       [0, 8, 0, 0, 0, 0, 0, 5, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 5, 1, 2, 2, 3, 0, 0, 3, 0, 0, 2, 1, 0],
       [0, 0, 3, 0, 0, 0, 1, 0, 9, 1, 0, 2, 0, 0, 1],
       [0, 0, 1, 0, 4, 1, 1, 0, 2, 3, 0, 0, 5, 1, 0],
       [0, 1, 2, 2, 0, 0, 1, 0, 4, 0, 0, 3, 0, 1, 3],
       [0, 0, 0, 2, 1, 3, 4, 0, 3, 0, 0, 0, 0, 0, 0],
       [0, 2, 0, 0, 0, 0, 1, 5, 2, 0, 0, 1, 0, 0, 2],
       [0, 0, 0, 1, 0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0],
       [0, 0, 6, 1, 0, 0, 1, 0, 1, 5, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 9, 0, 5, 0, 0],
       [1, 0, 0, 0, 1, 1, 2, 0, 6, 2, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 9, 0, 0],
       [0, 0, 1, 2, 0, 0, 2, 0, 3, 1, 1, 4, 1, 2, 1],
       [0, 0, 0, 0, 0, 3, 1, 1, 3, 0, 0, 0, 0, 0, 1]])

In [23]:
from sklearn.naive_bayes import GaussianNB

In [24]:
gnb = GaussianNB().fit(X_train, y_train) 
gnb_predictions = gnb.predict(X_test)

In [25]:
accuracy = gnb.score(X_test, y_test) 
print(accuracy)

0.5822222222222222


In [26]:
cm = confusion_matrix(y_test, gnb_predictions) 
cm

array([[ 0,  0,  2,  0,  0,  0,  1,  0,  7,  0,  0,  0,  0,  0,  7],
       [ 0, 13,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 13,  0,  0,  0,  3,  0,  0,  0,  3,  0,  0,  0,  0],
       [ 0,  0,  0,  8,  1,  0,  1,  0,  1,  0,  0,  0,  0,  5,  1],
       [ 0,  0,  0,  2, 11,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0],
       [ 0,  1,  0,  0,  0, 11,  1,  0,  2,  0,  0,  1,  0,  0,  1],
       [ 0,  0,  1,  0,  0,  0,  8,  0,  4,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 13,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  6,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  4,  0,  2,  0,  0,  0,  3,  7,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0,  0,  0,  0,  0,  0, 14,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 10,  0,  1,  0,  0,  3,  0,  0,  3],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0, 12,  0,  0],
       [ 1,  0,  0,  3,  2,  0,  1,  0,  1,  0,  1,  0,  0,  8,  1],
       [ 0,  0,  0,  0,  0,  0,  1

In [16]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [100, 200, 300]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [17]:
rf_random = RandomizedSearchCV(estimator = forest_clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.4min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [18]:
best_forest = rf_random.best_estimator_

In [20]:
best_forest.fit(X_train, y_train)
forest_scores = cross_val_score(best_forest, X_train, y_train, cv=10)

forest_pred = best_forest.predict(X_test)
forest_scores.mean()


0.9222222222222223

In [22]:
from joblib import dump
dump(best_forest, 'model.joblib')

['model.joblib']

cm = confusion_matrix(y_test, forest_pred)
cm

In [62]:
from sklearn.metrics import f1_score, precision_score, recall_score
f1_score(y_test, forest_pred, average="macro")

0.9190010681677347

In [61]:
precision_score(y_test, forest_pred, average="macro")

0.927696122944474

In [60]:
recall_score(y_test, forest_pred, average="macro")

0.9180598555211558