In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import os
from pathlib import Path

In [None]:
# Root of the fruits-360 dataset, relative to the notebook's working directory.
DATA_FOLDER = Path("../input/fruits/fruits-360_dataset/fruits-360")
# Training split; its child entries are iterated as one folder per class.
TRAIN = DATA_FOLDER.joinpath('Training')

In [None]:
# Preview a handful of class names (the stems of the entries under TRAIN).
for i, fruit_class in enumerate(TRAIN.iterdir()):
    print(fruit_class.stem)
    if i >= 6:  # seven names have been printed by now
        break

In [None]:
from tqdm import tqdm
from PIL import Image
import random

def load_data(path, sample=30, shuffle=True, get_class=None):
    """Load up to ``sample`` images from every class directory under ``path``.

    Parameters
    ----------
    path : pathlib.Path
        Directory whose sub-directories are named after the classes.
    sample : int or None
        Maximum number of images read per class; ``None`` reads every image.
    shuffle : bool
        When true, the collected records are shuffled with ``random.shuffle``.
    get_class : str or None
        When given, only the directory whose stem equals this value is read.

    Returns
    -------
    list of (numpy.ndarray, str, str)
        ``(pixels, class_name, file_stem)`` triples.
    """
    res = []
    for fruit_class in tqdm(path.iterdir()):
        # Stray files next to the class folders cannot be iterated; skip them.
        if not fruit_class.is_dir():
            continue
        if get_class and fruit_class.stem != get_class:
            continue
        for i, image in enumerate(fruit_class.iterdir()):
            # Check the cap *before* reading so exactly `sample` images are
            # kept (checking `i > sample` after the append kept sample + 2).
            if sample is not None and i >= sample:
                break
            # The context manager closes the file once the pixels are copied.
            with Image.open(image) as handle:
                pixels = np.array(handle)
            res.append((pixels, fruit_class.stem, image.stem))
    if shuffle:
        random.shuffle(res)

    return res

# Load the training records using load_data's defaults (sample=30, shuffle=True).
train = load_data(TRAIN)
# load_data(TRAIN,get_class='Eggplant')

In [None]:
fig, axes = plt.subplots(4, 4, figsize=(15, 15))
# Split the (pixels, class name, file stem) records into three parallel tuples.
imgs, labels, fnames = zip(*train)

# zip() stops at the 16 axes, so only the first 16 records are drawn.
# NOTE: `img`, `imgs` and `labels` stay in the notebook namespace and are read
# again by later cells, so their names must not change.
for img, label, fname, ax in zip(imgs, labels, fnames, axes.flat):
    ax.imshow(img)
    ax.axis('off')
    ax.set_title(label + " : " + fname)
plt.show()

In [None]:
import copy
# Pick 20 distinct class names at random.
# The labels are immutable strings, so de-duplicating them through a set needs
# no deep copy first.
labels_copy = list(set(labels))
random.shuffle(labels_copy)
selected_labels = labels_copy[:20]

In [None]:
# Peek at the first ten labels.
labels[:10]

In [None]:
from collections import defaultdict
# Collect, per selected class, the mean value of each colour channel of every
# image; elements equal to 255 are left out of the mean.
acc = defaultdict(list)
for image, label in zip(imgs, labels):
    if label not in selected_labels:
        continue
    acc[label].append(image.mean(axis=(0, 1), where=image != 255))

# Freeze the lists into one (n_images, n_channels) array per class.
acc = {name: np.array(channel_means) for name, channel_means in acc.items()}

In [None]:
# Show which class names ended up in `acc`.
acc.keys()

In [None]:
import matplotlib.pyplot as plt
import random

def unstack(a, axis = 0):
    """Split ``a`` along ``axis`` into a list of arrays with that axis removed."""
    slabs = np.split(a, a.shape[axis], axis=axis)
    pieces = []
    for slab in slabs:
        # Each slab still has length 1 along `axis`; drop that dimension.
        pieces.append(np.squeeze(slab, axis))
    return pieces

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(projection='3d')

# One scatter series per class; every point is one image's mean (R, G, B),
# coloured with that same value scaled by 1/255.
for name, rgb_vals in acc.items():
    r, g, b = unstack(rgb_vals, 1)
    c = np.stack([r, g, b], axis=1) / 255.
    ax.scatter(r, g, b, c=c, label=name)
ax.set_title('3D projection of mean RGB values')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import random

def unstack(a, axis = 0):
    """Return the slices of ``a`` along ``axis`` as a list, with ``axis`` dropped.

    Same helper as the one defined in the previous plotting cell; it is
    repeated here so that this cell also runs by itself.
    """
    slabs = np.split(a, a.shape[axis], axis=axis)
    return list(map(lambda slab: np.squeeze(slab, axis), slabs))

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot()

# One scatter series per class: mean red against the average of mean green and
# mean blue, each point coloured with the image's mean RGB scaled by 1/255.
for name, rgb_vals in acc.items():
    r, g, b = unstack(rgb_vals, 1)
    c = np.array([r, g, b]).T / 255.
    ax.scatter(
        r, (g + b) / 2,
        c=c,
        label=name
    )
# The axis labels do not depend on the class, so they are set once, outside
# the loop (and are therefore present even when `acc` is empty).
ax.set_xlabel('Mean Red Channel')
ax.set_ylabel('Mean Avg(Green,Blue) Channel')
ax.legend()
ax.set_title('Mean separation quality by channel value')
plt.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# Sort the class names: iterating a set of strings has no stable order between
# interpreter runs, which would make the index <-> name mapping unreproducible.
CLASSES = sorted(set(labels))
NUM_CLASSES = len(CLASSES)
def get_trainable_arrays(train, classes=None):
    """Turn loaded records into a stacked image array and one-hot labels.

    Parameters
    ----------
    train : sequence of (image, label, file_stem)
        Records to convert; the third element of each record is ignored.
    classes : sequence of str, optional
        Ordered class names defining the label -> index mapping. Defaults to
        the sorted distinct labels found in ``train``.

    Returns
    -------
    (numpy.ndarray, tensor)
        The images stacked along a new first axis, and the ``tf.one_hot``
        encoding of the label indices. The one-hot depth is the module-level
        ``NUM_CLASSES`` unless ``classes`` is supplied, in which case it is
        ``len(classes)``.
    """
    images, labels, _ = zip(*train)
    if classes is None:
        # sorted() makes the mapping reproducible; bare set order is arbitrary.
        classes = sorted(set(labels))
        depth = NUM_CLASSES
    else:
        depth = len(classes)
    # One dict lookup per label instead of a linear list.index() scan.
    index_of = {name: position for position, name in enumerate(classes)}
    label_ids = [index_of[label] for label in labels]
    labels_one_hot = tf.one_hot(label_ids, depth)
    images = np.array(images)
    return images, labels_one_hot

# Convert the loaded records into an image array and one-hot label tensor.
x_train, y_train = get_trainable_arrays(train)

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

def data_flatten(x_train):
    """Flatten every sample to a single row and divide all values by 255."""
    n_samples = x_train.shape[0]
    flat = x_train.reshape(n_samples, -1)
    return flat / 255.

def get_pcas(n_cats=4):
    """Project the images of ``n_cats`` classes onto two principal components.

    The PCA is fitted on the first 500 rows of the global ``x_train``; each
    chosen class is then reloaded from ``TRAIN`` and transformed with it.

    Returns a DataFrame with columns ``Category``, ``x`` and ``y`` holding one
    row per projected image.
    """
    # NOTE: the classes are whichever come first when iterating the label set,
    # so the selection is arbitrary.
    chosen = list(set(labels))[:n_cats]

    projector = PCA(2)
    projector.fit(data_flatten(x_train[:500]))

    rows = []
    for cat in chosen:
        cat_images, _ = get_trainable_arrays(load_data(TRAIN, get_class=cat))
        projected = projector.transform(data_flatten(cat_images))
        rows.extend([cat, first, second] for first, second in projected)
    return pd.DataFrame(rows, columns=['Category', 'x', 'y'])

# Build the 2-D PCA projection table for four classes.
df = get_pcas(4)

In [None]:
# Filled 2-D density of the PCA projection, one colour per category.
fig, ax = plt.subplots(figsize=(10, 10))
sns.kdeplot(data=df, x="x", y="y", hue="Category", fill=True, ax=ax)

ax.set_title('PCA Separation Quality')
# Hide the tick marks on both axes.
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('PCA Dim #1')
ax.set_ylabel('PCA Dim #2')
plt.show()

In [None]:


# x = pd.Series((x_train[:500].reshape(-1))).value_counts().sort_index()
# x = x[x.index != 255]
# plt.plot(x.index,x.values)

In [None]:
# Per-image mean of each colour channel over the loaded training arrays,
# ignoring values above 253 exactly as the per-class cells do. Computing it
# here stops the plot from silently reusing whatever r, g, b an earlier cell's
# loop happened to leave behind (which covered a single class only).
r, g, b = x_train.mean((1, 2), where=x_train <= 253).T

sns.kdeplot(r, color='red')
sns.kdeplot(g, color='green')
sns.kdeplot(b, color='blue')
# The channel value is on the x axis; a second ylabel() call would overwrite it.
plt.xlabel('RGB Pixel Value')
plt.ylabel('Frequency')
plt.title('Pixel frequencies for entire dataset')
plt.show()

In [None]:
# Reload only the Blueberry class and take, per image, the mean of each colour
# channel over the values that are at most 253.
x, _ = get_trainable_arrays(load_data(TRAIN, get_class='Blueberry'))
r, g, b = x.mean((1, 2), where=x <= 253).T

sns.kdeplot(r, color='red')
sns.kdeplot(g, color='green')
sns.kdeplot(b, color='blue')
# The channel value is on the x axis; calling ylabel() twice dropped this label.
plt.xlabel('RGB Pixel Value')
plt.ylabel('Frequency')
plt.title('Pixel frequencies for Blueberry')
plt.show()



In [None]:
# Reload only the Nectarine class and take, per image, the mean of each colour
# channel over the values that are at most 253.
x, _ = get_trainable_arrays(load_data(TRAIN, get_class='Nectarine'))
r, g, b = x.mean((1, 2), where=x <= 253).T

sns.kdeplot(r, color='red')
sns.kdeplot(g, color='green')
sns.kdeplot(b, color='blue')
# The channel value is on the x axis; calling ylabel() twice dropped this label.
plt.xlabel('RGB Pixel Value')
plt.ylabel('Frequency')
plt.title('Pixel frequencies for Nectarine')
plt.show()

We can infer from these plots that the pixel-frequency distributions of individual classes are much "smoother" than that of the entire dataset. This is expected, since the entire dataset contains many fruits with a wide variety of colors.

## Sample TF pipeline

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers


# Baseline CNN: divide inputs by 255, one convolution + ReLU, global average
# pooling, then a softmax layer with one unit per class.
bl_model = keras.Sequential([
    layers.Lambda(lambda x: x / 255.),
    layers.Conv2D(filters=16, kernel_size=4),
    layers.ReLU(),
    layers.GlobalAveragePooling2D(),
    layers.Dense(units=NUM_CLASSES, activation='softmax'),
])
# Call the model once on an all-zero image shaped like `img` (the last image
# left over from the preview-grid loop) - presumably so it is built before
# summary() is printed.
bl_model(tf.zeros_like(img, dtype=tf.float32)[tf.newaxis, ...])
bl_model.summary()

In [None]:
# Configure training for the baseline: categorical cross-entropy on the
# one-hot labels, Adam with its default settings, accuracy reported as 'acc'.
bl_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Only a cell's last expression is displayed, so show both shapes as one tuple.
y_train.shape, img.shape

In [None]:
# Train the baseline for 10 epochs, with 30% of the data held out for validation.
bl_model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_split=0.3,
)

## Pretrained EfficientNet

In [None]:
from tensorflow.keras.applications import efficientnet

# EfficientNetB0 without its classification top, for 100x100 three-channel inputs.
pt_model = efficientnet.EfficientNetB0(include_top=False, input_shape=(100, 100, 3))

In [None]:
# A layer's weights only count as trainable while the enclosing model is
# trainable too. Freezing the whole base and then flipping a few children back
# on therefore trains nothing, so keep the base trainable and freeze the lower
# layers one by one instead.
N_TRAINABLE_TOP_LAYERS = 10
pt_model.trainable = True
print(len(pt_model.layers),'layers in total')
# `[:-10]` leaves exactly the last 10 layers trainable (`[:-10:-1]` selected 9).
for layer in pt_model.layers[:-N_TRAINABLE_TOP_LAYERS]:
    layer.trainable = False # Freeze everything below the top 10 layers

In [None]:
# Smoke-test the network on a single training image.
pt_model(x_train[:1])

In [None]:
# Wrap the EfficientNet base with its input preprocessing and a new head:
# global average pooling, light dropout and a softmax layer over the classes.
# NOTE: this rebinds `pt_model` to the wrapper, so re-running the cell nests
# the previous wrapper inside a new one.
pt_model = keras.Sequential([
    layers.Lambda(efficientnet.preprocess_input),
    pt_model,
    layers.GlobalAveragePooling2D(),
    layers.Dropout(rate=0.1),
    layers.Dense(units=NUM_CLASSES, activation='softmax'),
])
# One forward pass on a single image, then print the layer summary.
pt_model(x_train[:1])
pt_model.summary()

In [None]:
# Configure fine-tuning: Adam at a learning rate of 1e-4, categorical
# cross-entropy on the one-hot labels, accuracy reported as 'acc'.
pt_model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['acc'],
)

In [None]:
# Reload the training images (sample=30 per class) and rebuild the arrays;
# this overwrites the x_train / y_train created earlier in the notebook.
x_train, y_train = get_trainable_arrays(load_data(TRAIN,sample=30))

In [None]:
# `learning_rate` is the documented optimizer attribute; `lr` is a legacy alias.
pt_model.optimizer.learning_rate = 1e-4
pt_model.fit(x_train, y_train, 
             epochs=5, 
             validation_split=.3)
# Lower the rate afterwards; it applies to any later fit() call on this model.
pt_model.optimizer.learning_rate = 1e-5