### Data munging
Builds a CSV file of per-image summary features (mean RGB, mean HSV, height, width and pixel count) for the pizza / not-pizza image dataset.

In [1]:
# general imports
import os, glob, fnmatch
import pandas as pd
import numpy as np

# image processing imports
import cv2

---
## Create dataframe
Builds a dataframe with one row per image: the mean RGB and mean HSV channel values (computed with OpenCV) plus the image height and width (read from each image array's `.shape`).

In [2]:
# Recursively collect the path of every .jpg file under the dataset root.
# The result is sorted because os.walk yields directory entries in arbitrary,
# filesystem-dependent order; sorting keeps the dataframe row order (and the
# CSV written from it) reproducible across runs and machines.
# NOTE(review): '*.jpg' is matched case-sensitively on POSIX, so '.JPG' or
# '.jpeg' files would be skipped -- confirm the dataset contains none.
images = sorted(
    os.path.join(root, filename)
    for root, _dirnames, filenames in os.walk('/Users/VanessaG/Desktop/pizza_class_data/')
    for filename in fnmatch.filter(filenames, '*.jpg')
)

In [None]:
# Collect per-image summary features: mean RGB, mean HSV, height and width.
# Each image is read, measured and discarded inside a single loop, so only one
# decoded image is held in memory at a time (keeping every normalized float32
# image in a list costs several GB for a dataset of a few thousand photos).
rgb_means = []
hsv_means = []
img_height = []
img_width = []

for img in images:
    image = cv2.imread(img)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None
        # instead of raising. Fail loudly with the path: silently skipping the
        # file would leave these lists shorter than `images` and misalign rows.
        raise IOError('cv2.imread could not read image: {}'.format(img))

    # Min-max scale this image to float32 in the range [0, 1]. `dst` is passed
    # explicitly as None because the documented OpenCV 3+ Python signature
    # lists it as a required argument.
    scaled = cv2.normalize(image, None, alpha=0, beta=1,
                           norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

    # cv2.mean returns one value per channel slot; [:3] keeps the three
    # colour channels of the converted image.
    rgb_means.append(cv2.mean(cv2.cvtColor(scaled, cv2.COLOR_BGR2RGB))[:3])
    hsv_means.append(cv2.mean(cv2.cvtColor(scaled, cv2.COLOR_BGR2HSV))[:3])

    # shape is (rows, cols, channels): rows -> height, cols -> width.
    img_height.append(scaled.shape[0])
    img_width.append(scaled.shape[1])

In [None]:
# Assemble the colour features into one frame: the per-image RGB means and the
# per-image HSV means each become a three-column frame, placed side by side.
rgb_columns = ['red', 'green', 'blue']
hsv_columns = ['hue', 'sat', 'val']
df1 = pd.DataFrame(data=rgb_means, columns=rgb_columns)
df2 = pd.DataFrame(data=hsv_means, columns=hsv_columns)
df = pd.concat(objs=[df1, df2], axis='columns')

In [5]:
# Keep each image's full file path next to its features. Not a feature itself;
# it is only there so images can be loaded and displayed during EDA.
df = df.assign(full_path=images)

In [6]:
# Binary target derived from the directory in each path:
# 0 = not pizza (path contains '/not_pizza/'), 1 = pizza (every other path).
df['label'] = [0 if '/not_pizza/' in path else 1 for path in df['full_path']]

In [7]:
# Path with the dataset root stripped off, leaving the part that starts at the
# class directory -- handy for eyeballing that each label matches its folder.
_root_pattern = '/Users/VanessaG/Desktop/pizza_class_data/'
df['short_path'] = df['full_path'].replace(to_replace=_root_pattern, value='', regex=True)

In [8]:
# Attach the image dimensions, derive the pixel count from them, then reorder
# the columns so the label comes first and the two path columns come last.
df['img_height'] = img_height
df['img_width'] = img_width
df['total_px'] = df['img_height'] * df['img_width']

column_order = [
    'label',
    'red', 'green', 'blue',
    'hue', 'sat', 'val',
    'img_height', 'img_width', 'total_px',
    'short_path', 'full_path',
]
df = df[column_order]

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns -- a quick sanity check on value ranges and on the label balance.
df.describe()

Unnamed: 0,label,red,green,blue,hue,sat,val,img_height,img_width,total_px
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,0.5,0.553316,0.427399,0.319187,77.235804,0.494648,0.567891,446.7985,491.44775,218847.316
std,0.500063,0.124527,0.107539,0.120633,43.050021,0.15731,0.120962,75.907809,47.64875,40589.591391
min,0.0,0.12003,0.062316,0.007203,14.988055,0.109173,0.120529,195.0,280.0,58500.0
25%,0.0,0.470093,0.360606,0.236348,44.37904,0.37947,0.489349,375.0,500.0,187500.0
50%,0.5,0.55229,0.428084,0.317322,67.014925,0.479005,0.56774,512.0,512.0,196608.0
75%,1.0,0.638385,0.494061,0.398177,100.075006,0.59908,0.648782,512.0,512.0,262144.0
max,1.0,0.940668,0.816286,0.758213,312.255124,0.988027,0.940784,639.0,800.0,480000.0


In [10]:
# Write the feature table to disk. The dataframe index is written as well
# (the to_csv default), so it appears as an unnamed first column in the file.
output_csv = '../data/image_info.csv'
df.to_csv(output_csv)