<a href="https://colab.research.google.com/github/vizhnu/PlantId/blob/master/preprocess_extract_dataset_flavia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess and Feature Extraction - Flavia dataset

Extracted features are saved to the file "LeafFeatures.csv" (written to `/content` and then downloaded at the end of the notebook).

In [0]:
pip install mahotas

Collecting mahotas
[?25l  Downloading https://files.pythonhosted.org/packages/84/74/bd38163462eb346519f36dc205f0a52a01fb372c7bbcc87392c9b21cfe26/mahotas-1.4.9.tar.gz (1.5MB)
[K     |▏                               | 10kB 17.3MB/s eta 0:00:01[K     |▍                               | 20kB 1.7MB/s eta 0:00:01[K     |▋                               | 30kB 2.2MB/s eta 0:00:01[K     |▉                               | 40kB 1.6MB/s eta 0:00:01[K     |█                               | 51kB 1.8MB/s eta 0:00:01[K     |█▎                              | 61kB 2.2MB/s eta 0:00:01[K     |█▌                              | 71kB 2.3MB/s eta 0:00:01[K     |█▊                              | 81kB 2.5MB/s eta 0:00:01[K     |██                              | 92kB 2.8MB/s eta 0:00:01[K     |██▏                             | 102kB 2.6MB/s eta 0:00:01[K     |██▍                             | 112kB 2.6MB/s eta 0:00:01[K     |██▋                             | 122kB 2.6MB/s eta 0:00:01[K 

In [0]:
import os
import cv2
import numpy as np
import pandas as pd
import mahotas as mt
from matplotlib import pyplot as plt
%matplotlib inline
import zipfile
from google.colab import files

In [0]:
from google.colab import drive
# Mount Google Drive at /gdrive (prompts for an authorization code when run).
drive.mount('/gdrive')
# Make the mount point the working directory so that relative paths in later
# cells resolve against the mounted drive.
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [0]:
# Unpack the leaf image archive from the mounted Drive into /content.
# The archive path is absolute so this cell does not depend on the kernel's
# current working directory (i.e. on `%cd /gdrive` having been executed first).
with zipfile.ZipFile('/gdrive/My Drive/LeafData.zip', 'r') as archive:
    archive.extractall('/content')

In [0]:
# Root folder of the extracted dataset; each sub-folder holds the images of one
# class and its name is used as the label by create_dataset().
ds_path = "/content/LeafData"
# Keep only directories (stray files in the dataset root would make the
# per-folder os.listdir() call fail) and sort them so the row order of the
# extracted features is reproducible across runs.
img_files = sorted(
    entry for entry in os.listdir(ds_path)
    if os.path.isdir(os.path.join(ds_path, entry))
)

In [0]:
def _extract_leaf_features(main_img):
    """Compute the shape, color and texture features of a single leaf image.

    Parameters
    ----------
    main_img : numpy.ndarray
        Image as returned by ``cv2.imread`` (BGR channel order).

    Returns
    -------
    list or None
        ``[area, perimeter, w, h, aspect_ratio, rectangularity, circularity,
        red_mean, green_mean, blue_mean, red_std, green_std, blue_std,
        contrast, correlation, inverse_diff_moments, entropy]``, or ``None``
        when the thresholded image contains no contour at all.
    """
    # Preprocessing: grayscale -> Gaussian blur -> inverted Otsu threshold
    # -> morphological closing to fill small holes in the leaf mask.
    img = cv2.cvtColor(main_img, cv2.COLOR_BGR2RGB)
    gs = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    blur = cv2.GaussianBlur(gs, (25, 25), 0)
    _, im_bw_otsu = cv2.threshold(
        blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )
    kernel = np.ones((50, 50), np.uint8)
    closing = cv2.morphologyEx(im_bw_otsu, cv2.MORPH_CLOSE, kernel)

    # Shape features.
    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list in both.
    contours = cv2.findContours(
        closing, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )[-2]
    if len(contours) == 0:
        return None
    # Pick the biggest contour by enclosed area to avoid stem contours that
    # appear sometimes. (Point count is not a reliable size measure because
    # CHAIN_APPROX_SIMPLE compresses straight segments.)
    cnt = max(contours, key=cv2.contourArea)
    area = cv2.contourArea(cnt)
    perimeter = cv2.arcLength(cnt, True)
    x, y, w, h = cv2.boundingRect(cnt)
    aspect_ratio = float(w) / h
    if area > 0:
        rectangularity = w * h / area
        circularity = (perimeter ** 2) / area
    else:
        # Degenerate contour (a point or a line): the area-normalized ratios
        # are undefined, so record NaN instead of raising ZeroDivisionError.
        rectangularity = circularity = np.nan

    # Color features: per-channel mean / standard deviation over the whole
    # image, after zeroing every channel value equal to 255.
    # The channel slices are views into `img`, which is not used afterwards.
    red_channel = img[:, :, 0]
    green_channel = img[:, :, 1]
    blue_channel = img[:, :, 2]
    for channel in (red_channel, green_channel, blue_channel):
        channel[channel == 255] = 0

    red_mean = np.mean(red_channel)
    green_mean = np.mean(green_channel)
    blue_mean = np.mean(blue_channel)

    red_std = np.std(red_channel)
    green_std = np.std(green_channel)
    blue_std = np.std(blue_channel)

    # Texture features: Haralick features of the grayscale image, averaged
    # over the directions returned by mahotas (axis 0).
    ht_mean = mt.features.haralick(gs).mean(axis=0)
    contrast = ht_mean[1]
    correlation = ht_mean[2]
    inverse_diff_moments = ht_mean[4]
    entropy = ht_mean[8]

    return [area, perimeter, w, h, aspect_ratio, rectangularity, circularity,
            red_mean, green_mean, blue_mean, red_std, green_std, blue_std,
            contrast, correlation, inverse_diff_moments, entropy]


def create_dataset():
    """Extract one feature row per image found under ``ds_path``.

    Iterates over the module-level ``img_files`` (sub-folder names of
    ``ds_path``); the folder name is stored as ``label`` and the file name as
    ``leafid``. Folders and files are visited in sorted order so the result is
    reproducible. Entries that are not directories, files OpenCV cannot
    decode, and images without any contour are skipped (the latter two with a
    printed message) instead of aborting the whole run.

    Returns
    -------
    pandas.DataFrame
        One row per processed image with the columns listed in ``names``
        below and a 0..n-1 index. Note that ``physiological_length`` holds
        the bounding-box width ``w`` and ``physiological_width`` the
        bounding-box height ``h``, as in the original extraction.
    """
    names = ['leafid', 'area', 'perimeter', 'physiological_length',
             'physiological_width', 'aspect_ratio', 'rectangularity',
             'circularity',
             'mean_r', 'mean_g', 'mean_b', 'stddev_r', 'stddev_g', 'stddev_b',
             'contrast', 'correlation', 'inverse_difference_moments',
             'entropy', 'label']

    # Collect plain rows and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and appending per row is
    # quadratic in the number of images.
    records = []
    for folder in sorted(img_files):
        base_folder = os.path.join(ds_path, folder)
        if not os.path.isdir(base_folder):
            continue
        for file in sorted(os.listdir(base_folder)):
            imgpath = os.path.join(base_folder, file)
            main_img = cv2.imread(imgpath)
            if main_img is None:
                # cv2.imread signals an unreadable / non-image file with None.
                print("Skipping unreadable image: " + imgpath)
                continue

            features = _extract_leaf_features(main_img)
            if features is None:
                print("Skipping image without contours: " + imgpath)
                continue

            records.append([file] + features + [folder])

    return pd.DataFrame(records, columns=names)

In [0]:
# Run the feature extraction over every image folder listed in img_files.
dataset = create_dataset()

In [0]:
# Sanity check: (rows, columns) of the extracted feature table.
dataset.shape

(300, 19)

In [0]:
# Confirm that create_dataset() returned a pandas DataFrame.
type(dataset)

pandas.core.frame.DataFrame

In [0]:
# Write the feature table to a CSV file under /content.
dataset.to_csv("/content/LeafFeatures.csv")

In [0]:
# List /content to confirm that LeafFeatures.csv was written.
%ls /content

[0m[01;34mLeafData[0m/  LeafFeatures.csv  [01;34msample_data[0m/


In [0]:
# Download the feature CSV through the google.colab `files` helper.
files.download('/content/LeafFeatures.csv')