# Preprocess and Feature Extraction - Flavia dataset

Extracted features are saved in file named "Flavia_features.csv"

In [1]:
# !pip install mahotas
import os
import cv2
import numpy as np
import pandas as pd
import mahotas as mt
from matplotlib import pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [8]:


ds_path = "dataset\Leaves"
img_files = os.listdir(ds_path)
print(ds_path)
print('file number :',len(img_files))

dataset\Leaves
file number : 1908


In [20]:
def create_dataset():
    names = ['area','perimeter','physiological_length','physiological_width','aspect_ratio','rectangularity','circularity', \
             'mean_r','mean_g','mean_b','stddev_r','stddev_g','stddev_b', \
             'contrast','correlation','inverse_difference_moments','entropy'
            ]
    df = pd.DataFrame([], columns=names)
    for file in tqdm(img_files):
        imgpath = ds_path +'\\'+ file

        main_img = cv2.imread(imgpath)
        
        #Preprocessing
        img = cv2.cvtColor(main_img, cv2.COLOR_BGR2RGB)
        gs = cv2.cvtColor(img,cv2.COLOR_RGB2GRAY)
        

        blur = cv2.GaussianBlur(gs, (25,25),0)
        ret_otsu,im_bw_otsu = cv2.threshold(blur,0,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
        kernel = np.ones((50,50),np.uint8)
        closing = cv2.morphologyEx(im_bw_otsu, cv2.MORPH_CLOSE, kernel)
        
        #Shape features
        contours, _ = cv2.findContours(closing,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
        cnt = contours[0]
        M = cv2.moments(cnt)
        area = cv2.contourArea(cnt)
        perimeter = cv2.arcLength(cnt,True)
        x,y,w,h = cv2.boundingRect(cnt)
        aspect_ratio = float(w)/h
        if area ==0:
            continue
        rectangularity = w*h/area
        circularity = ((perimeter)**2)/area
        
        #Color features
        red_channel = img[:,:,0]
        green_channel = img[:,:,1]
        blue_channel = img[:,:,2]
        blue_channel[blue_channel == 255] = 0
        green_channel[green_channel == 255] = 0
        red_channel[red_channel == 255] = 0
        
        red_mean = np.mean(red_channel)
        green_mean = np.mean(green_channel)
        blue_mean = np.mean(blue_channel)
        
        red_std = np.std(red_channel)
        green_std = np.std(green_channel)
        blue_std = np.std(blue_channel)
        
        #Texture features
        textures = mt.features.haralick(gs)
        ht_mean = textures.mean(axis=0)
        contrast = ht_mean[ 1]
        correlation = ht_mean[2]
        inverse_diff_moments = ht_mean[4]
        entropy = ht_mean[8]
        
        vector = [area,perimeter,w,h,aspect_ratio,rectangularity,circularity,\
                  red_mean,green_mean,blue_mean,red_std,green_std,blue_std,\
                  contrast,correlation,inverse_diff_moments,entropy
                 ]
        
        df_temp = pd.DataFrame([vector],columns=names)
        df = pd.concat([df,df_temp])
 
    return df

In [21]:
dataset = create_dataset()
dataset.to_csv("Flavia_features.csv")

  df = pd.concat([df,df_temp])
  6%|████▉                                                                          | 120/1908 [00:11<02:47, 10.69it/s]


KeyboardInterrupt: 

# Preprocess and Feature Extraction - Flavia dataset

Extracted features are saved in file named "leaf_snap_features.csv"

In [22]:
ds_path = ""
# ds_path =''
# img_files = os.listdir(ds_path)
dat = pd.read_csv('leafsnap-dataset-train-images_224_svm.csv')
img_files = dat['image_paths'].to_list()
targets = dat['species'].to_list()
dat.head()

Unnamed: 0.1,Unnamed: 0,image_paths,species
0,0,dataset/train_224_svm/ptelea_trifoliata\1.jpg,Ptelea trifoliata
1,1,dataset/train_224_svm/pinus_virginiana\2.jpg,Pinus virginiana
2,2,dataset/train_224_svm/magnolia_grandiflora\3.jpg,Magnolia grandiflora
3,3,dataset/train_224_svm/albizia_julibrissin\4.jpg,Albizia julibrissin
4,4,dataset/train_224_svm/salix_matsudana\5.jpg,Salix matsudana


In [23]:
def create_dataset():
    names = ['area','perimeter','physiological_length','physiological_width','aspect_ratio','rectangularity','circularity', \
             'mean_r','mean_g','mean_b','stddev_r','stddev_g','stddev_b', \
             'contrast','correlation','inverse_difference_moments','entropy'
            ]
    df = pd.DataFrame([], columns=names)
    for file in img_files:
        imgpath = ds_path + file

        main_img = cv2.imread(imgpath)
        
        #Preprocessing
        img = cv2.cvtColor(main_img, cv2.COLOR_BGR2RGB)
        gs = cv2.cvtColor(img,cv2.COLOR_RGB2GRAY)
        blur = cv2.GaussianBlur(gs, (25,25),0)
        ret_otsu,im_bw_otsu = cv2.threshold(blur,0,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
        kernel = np.ones((50,50),np.uint8)
        closing = cv2.morphologyEx(im_bw_otsu, cv2.MORPH_CLOSE, kernel)
        
        #Shape features
        contours, _ = cv2.findContours(closing,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
        cnt = contours[0]
        M = cv2.moments(cnt)
        area = cv2.contourArea(cnt)
        perimeter = cv2.arcLength(cnt,True)
        x,y,w,h = cv2.boundingRect(cnt)
        aspect_ratio = float(w)/h
        if area ==0:
            continue
        rectangularity = w*h/area
        circularity = ((perimeter)**2)/area
        
        #Color features
        red_channel = img[:,:,0]
        green_channel = img[:,:,1]
        blue_channel = img[:,:,2]
        blue_channel[blue_channel == 255] = 0
        green_channel[green_channel == 255] = 0
        red_channel[red_channel == 255] = 0
        
        red_mean = np.mean(red_channel)
        green_mean = np.mean(green_channel)
        blue_mean = np.mean(blue_channel)
        
        red_std = np.std(red_channel)
        green_std = np.std(green_channel)
        blue_std = np.std(blue_channel)
        
        #Texture features
        textures = mt.features.haralick(gs)
        ht_mean = textures.mean(axis=0)
        contrast = ht_mean[1]
        correlation = ht_mean[2]
        inverse_diff_moments = ht_mean[4]
        entropy = ht_mean[8]
        
        vector = [area,perimeter,w,h,aspect_ratio,rectangularity,circularity,\
                  red_mean,green_mean,blue_mean,red_std,green_std,blue_std,\
                  contrast,correlation,inverse_diff_moments,entropy
                 ]
        
        df_temp = pd.DataFrame([vector],columns=names)
        df = pd.concat([df,df_temp])
        print(file)
    return df

In [24]:
from tqdm import tqdm

# dataset = create_dataset()
names = ['area','perimeter','physiological_length','physiological_width','aspect_ratio','rectangularity','circularity', \
         'mean_r','mean_g','mean_b','stddev_r','stddev_g','stddev_b', \
         'contrast','correlation','inverse_difference_moments','entropy', 'target'
        ]
df = pd.DataFrame([], columns=names)
for file,target in tqdm(zip(img_files,targets)):
    imgpath = ds_path + file

    main_img = cv2.imread(imgpath)

    #Preprocessing
    img = cv2.cvtColor(main_img, cv2.COLOR_BGR2RGB)
    gs = cv2.cvtColor(img,cv2.COLOR_RGB2GRAY)
    blur = cv2.GaussianBlur(gs, (25,25),0)
    ret_otsu,im_bw_otsu = cv2.threshold(blur,0,255,cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)
    kernel = np.ones((50,50),np.uint8)
    closing = cv2.morphologyEx(im_bw_otsu, cv2.MORPH_CLOSE, kernel)

    #Shape features
    contours, _ = cv2.findContours(closing,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
    try:
        cnt = contours[0]
    except:
        continue
    M = cv2.moments(cnt)
    area = cv2.contourArea(cnt)
    perimeter = cv2.arcLength(cnt,True)
    x,y,w,h = cv2.boundingRect(cnt)
    aspect_ratio = float(w)/h
    if area ==0:
        continue
    rectangularity = w*h/area
    circularity = ((perimeter)**2)/area

    #Color features
    red_channel = img[:,:,0]
    green_channel = img[:,:,1]
    blue_channel = img[:,:,2]
    blue_channel[blue_channel == 255] = 0
    green_channel[green_channel == 255] = 0
    red_channel[red_channel == 255] = 0

    red_mean = np.mean(red_channel)
    green_mean = np.mean(green_channel)
    blue_mean = np.mean(blue_channel)

    red_std = np.std(red_channel)
    green_std = np.std(green_channel)
    blue_std = np.std(blue_channel)

    #Texture features
    textures = mt.features.haralick(gs)
    ht_mean = textures.mean(axis=0)
    contrast = ht_mean[1]
    correlation = ht_mean[2]
    inverse_diff_moments = ht_mean[4]
    entropy = ht_mean[8]

    vector = [area,perimeter,w,h,aspect_ratio,rectangularity,circularity,\
              red_mean,green_mean,blue_mean,red_std,green_std,blue_std,\
              contrast,correlation,inverse_diff_moments,entropy,target
             ]

    df_temp = pd.DataFrame([vector],columns=names)
    df = pd.concat([df,df_temp])
    # print(file)

  df = pd.concat([df,df_temp])
334it [00:03, 103.98it/s]


KeyboardInterrupt: 

In [36]:

dataset = df

dataset.to_csv("leaf_snap_features.csv")

In [37]:
dataset.shape

(23585, 18)

pandas.core.frame.DataFrame

485129.5

Unnamed: 0.1,Unnamed: 0,area,perimeter,physiological_length,physiological_width,aspect_ratio,rectangularity,circularity,mean_r,mean_g,mean_b,stddev_r,stddev_g,stddev_b,contrast,correlation,inverse_difference_moments,entropy
0,0,485129.5,3822.131487,1459,905,1.612155,2.721737,30.112968,21.548321,38.921378,10.541783,41.316764,67.891852,26.448052,55.779676,0.992673,0.774745,4.046708
1,0,489890.0,3809.143686,1422,945,1.504762,2.743044,29.618028,11.758990,50.206020,2.000756,25.948078,84.915973,17.924460,12.757222,0.998075,0.810892,3.549434
2,0,307903.5,2569.731221,1114,394,2.827411,1.425499,21.446715,10.248443,19.695162,16.331825,45.721887,48.443413,60.486853,8.076081,0.999279,0.860258,2.823292
3,0,245870.5,2813.824586,1264,327,3.865443,1.681080,32.202354,14.705938,20.883349,13.306845,40.661030,52.649291,38.531062,4.589367,0.999007,0.899268,2.009211
4,0,69526.5,3704.076776,1448,964,1.502075,20.076834,197.337487,7.405359,9.433330,5.798306,34.457877,43.523617,30.145214,1.927087,0.997009,0.960461,0.865603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,0,461140.0,6378.520442,1274,1101,1.157130,3.041753,88.228137,23.783565,36.293105,21.694625,45.075537,64.668263,41.827096,28.238168,0.995964,0.795875,3.599582
1903,0,637829.5,4121.739572,1191,1055,1.128910,1.969970,26.635233,28.881916,46.498822,23.016589,44.130072,65.783191,37.395361,26.604019,0.997254,0.716242,4.674605
1904,0,371950.5,3847.376212,1460,967,1.509824,3.795720,39.796435,18.641599,34.809681,9.775415,40.488711,69.770100,26.659364,21.478996,0.995827,0.825369,3.049712
1905,0,764912.5,4271.960790,1141,1014,1.125247,1.512557,23.858479,34.912162,54.138055,26.549748,45.961112,66.759222,37.250674,23.082022,0.997802,0.678819,5.431301


1907