Place de marché
==============

![logo](../reports/figures/logo.png)


### Votre mission
Votre mission est de **réaliser une première étude de faisabilité d'un moteur de classification** d'articles basé sur une image et une description pour l'automatisation de l'attribution de la catégorie de l'article.

Pour ce faire, vous allez **évaluer la possibilité d'extraire des données depuis l'API Amazon** en **prenant connaissance de la documentation** et en **écrivant la requête** qui vous permettrait d'extraire des données supplémentaires. Vous vous assurerez ainsi que vous pourrez bien disposer de plus de données et diversifier les sources de données pour éviter les biais pour votre moteur de classification.

Ensuite, vous **analyserez le jeu de données** déjà constitué en **réalisant un prétraitement** des images et des descriptions des produits, une **réduction de dimension**, puis un **clustering**. Les résultats du clustering seront présentés sous la forme d’une représentation en deux dimensions à déterminer, qui illustrera le fait que les caractéristiques extraites permettent de regrouper des produits de même catégorie.

La représentation graphique vous aidera à convaincre Linda que cette approche de modélisation permettra bien de regrouper des produits de même catégorie.

### Contraintes

Linda vous a communiqué les contraintes suivantes :

   * Limiter le nombre d’articles pris par l’API (par exemple : 1000 lignes) et filtrer sur un unique type d’article (par exemple un type d’article peu présent dans votre échantillon de données actuelles).
   * Afin d’extraire les features, mettre en œuvre a minima un algorithme de type SIFT / ORB / SURF.
   * Un algorithme de type CNN Transfer Learning peut éventuellement être utilisé en complément, s’il peut apporter un éclairage supplémentaire à la démonstration.

In [None]:
import os
import random

import numpy as np
import cv2
from PIL import Image, ImageOps, ImageFilter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans, MiniBatchKMeans
from sklearn import metrics
    
import plotly.express as px


# Scale up seaborn's default font size for every figure in the notebook.
sns.set(font_scale=1.6)

# Set Pillow's MAX_IMAGE_PIXELS limit (used by its decompression-bomb check)
# to 93,680,329 pixels.
Image.MAX_IMAGE_PIXELS = 93680329


In [None]:
def histogram(image, figsize=(12, 8), kde=False):
    """Plot the pixel-intensity distribution of an image.

    Arrays with more than two dimensions are treated as RGB: the first three
    channels are drawn in red, green and blue on three stacked subplots of
    size ``figsize``. Two-dimensional (grayscale) arrays are drawn as a single
    distribution, without an explicit ``ax``.

    ``kde`` is forwarded unchanged to ``sns.distplot``.
    """
    pixels = np.array(image)

    if pixels.ndim <= 2:
        # Gray
        sns.distplot(pixels.flatten(), kde=kde)
        return

    # RGB mode: one subplot per channel, tinted with the channel's colour.
    _, axes = plt.subplots(3, 1, figsize=figsize)
    for channel, (color, ax) in enumerate(zip(['r', 'g', 'b'], axes)):
        sns.distplot(pixels[:, :, channel].flatten(),
                     kde=kde, color=color, ax=ax)

### Chargement des descriptions

In [None]:
# Walk the raw-data tree and read, with pd.read_csv, the file of any directory
# that holds exactly one file. If several directories qualify, the last one
# visited wins.
for dirname, _, filenames in os.walk('../data/raw/'):
    if len(filenames) != 1:
        continue
    df = pd.read_csv(os.path.join(dirname, filenames[0]))

In [None]:
# Build each row's image path by joining the raw image directory with the
# file name stored in the `image` column.
df['path'] = df['image'].apply(lambda x: os.path.join('../data/raw/Images/', x))

In [None]:
# Columns removed from the DataFrame in place; they are not referenced by the
# rest of this notebook.
col_to_drop = [
    'uniq_id',
    'crawl_timestamp',
    'product_url',
    'pid',
    'discounted_price',
    'is_FK_Advantage_product',
    'product_rating',
    'overall_rating',
    'product_specifications',
    'brand',
    
]
df.drop(columns=col_to_drop, inplace=True)

In [None]:
def extract_level(tree_str, level=-1, strict=False):
    """Return one level of a ``product_category_tree`` value.

    ``tree_str`` is the textual form of a Python list literal whose first
    element is a ``'>>'``-separated category path, e.g.
    ``'["Watches >> Wrist Watches"]'``.

    Args:
        tree_str: String holding the list literal to parse.
        level: Zero-based index of the level to return; negative values count
            from the deepest level (``-1`` is the last one).
        strict: When true, an out-of-range ``level`` raises ``IndexError``
            instead of returning ``None``.

    Returns:
        The requested level with surrounding whitespace stripped, or ``None``
        when ``level`` is out of range and ``strict`` is false.

    Raises:
        IndexError: ``level`` is out of range and ``strict`` is true.
        ValueError, SyntaxError: ``tree_str`` is not a valid Python literal.
    """
    # Local import so the function does not rely on a module-level import.
    import ast

    # literal_eval only accepts literals, so text read from the CSV cannot
    # execute arbitrary code the way eval() would.
    path = ast.literal_eval(tree_str)[0]
    levels = [part.strip() for part in path.split('>>')]
    try:
        return levels[level]
    except IndexError:
        if strict:
            raise
        return None

On récupère le niveau d'indice 1 de l'arbre des catégories (c'est-à-dire le deuxième niveau, les indices commençant à 0) comme label

In [None]:
# Label = level at index 1 of each category tree (None when the tree has
# fewer than two levels, since strict defaults to False).
df['label'] = df['product_category_tree'].apply(extract_level, level=1)

In [None]:
df['label']

## ORB

In [None]:
def scale_down(image, factor=5):
    """Return ``image`` shrunk by an integer ``factor`` in both dimensions.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (e.g. a PIL image).
        factor: Positive integer divisor applied to width and height.

    Returns:
        Whatever ``image.resize`` returns for the reduced size.

    Raises:
        ValueError: ``factor`` is not strictly positive.
    """
    if factor <= 0:
        raise ValueError(
            "factor must be a positive integer, got %r" % (factor,))
    width, height = image.size
    # Clamp to 1 px so an image smaller than `factor` does not collapse to a
    # zero-sized target, which resize() rejects.
    target_size = (max(1, width // factor), max(1, height // factor))
    return image.resize(target_size)

In [None]:
# Open every image listed in df['path'] and downscale it by a factor of 5.
images = [scale_down(Image.open(x), factor=5) for x in df['path']]

In [None]:
# size = 10
# f, axes = plt.subplots(size, size, figsize=(12, 12))
# for ax, im in zip(axes.flatten(), random.sample(images, size ** 2)):
#     ax.imshow(im, cmap='gray', aspect='auto')
#     ax.set_xticks([])
#     ax.set_yticks([])  # to hide tick values on X and Y axis

In [None]:
# ORB keypoint detector / descriptor created with OpenCV's default parameters;
# shared by every feature-extraction cell below.
extractor = cv2.ORB_create()

def features(image, extractor):
    """Detect keypoints and compute their descriptors on ``image``.

    Args:
        image: Image as a ``numpy.ndarray``.
        extractor: Object exposing ``detectAndCompute(image, mask)``, e.g.
            the result of ``cv2.ORB_create()``.

    Returns:
        The ``(keypoints, descriptors)`` pair produced by
        ``extractor.detectAndCompute(image, None)``. Callers in this notebook
        skip images whose descriptors are ``None``.

    Raises:
        TypeError: ``image`` is not a ``numpy.ndarray``.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(image, np.ndarray):
        raise TypeError(
            "image must be a numpy.ndarray, got %s" % type(image).__name__)
    keypoints, descriptors = extractor.detectAndCompute(image, None)
    return keypoints, descriptors

In [None]:
images[1]

In [None]:
histogram(images[1], kde=True)

In [None]:
kp, desc = features(np.array(images[1]), extractor)

In [None]:
Image.fromarray(cv2.drawKeypoints(np.array(images[1]), kp, None))

In [None]:
ImageOps.equalize(images[1])

In [None]:
histogram(ImageOps.equalize(images[1]), kde=True)

In [None]:
kp, desc = features(np.array(ImageOps.equalize(images[1])), extractor)

In [None]:
Image.fromarray(cv2.drawKeypoints(np.array(images[1]), kp, None))

In [None]:
kp, desc = features(np.array(images[1].filter(ImageFilter.BoxBlur(1))), extractor)

In [None]:
Image.fromarray(cv2.drawKeypoints(np.array(images[1]), kp, None))

In [None]:
def preprocess(image):
    """Equalize the histogram of ``image``, then apply a radius-1 box blur.

    Returns the blurred, equalized image.
    """
    equalized = ImageOps.equalize(image)
    return equalized.filter(ImageFilter.BoxBlur(1))

In [None]:
preprocess(images[30])

In [None]:
kp, desc = features(np.array(preprocess(images[1])), extractor)

In [None]:
Image.fromarray(cv2.drawKeypoints(np.array(preprocess(images[1])), kp, None))

In [None]:
index = 5

# Keypoints / descriptors of image 5, extracted from the raw (downscaled but
# not preprocessed) image and drawn on that same image.
kp1, desc1 = features(np.array(images[index]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(images[index]), kp1, None))

In [None]:
index = 11

# Keypoints / descriptors of image 11, extracted from the preprocessed
# (equalized + blurred) image.
# NOTE(review): image 5 is processed raw while this one goes through
# preprocess() — confirm the asymmetry is intentional.
kp2, desc2 = features(np.array(preprocess(images[index])), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(preprocess(images[index])), kp2, None))

In [None]:
# Brute-force matcher using Hamming distance, with cross-checking enabled.
bf = cv2.BFMatcher_create(cv2.NORM_HAMMING, crossCheck=True)

# Sort by ascending distance so the slice below really keeps the 10 closest
# matches (the sorted list used to be stored under a misspelled name and was
# never used, so an unsorted slice was drawn).
matches = sorted(bf.match(desc1, desc2), key=lambda x: x.distance)

Image.fromarray(cv2.drawMatches(np.array(images[5]), kp1,
                np.array(images[11]), kp2, matches[:10], flags=2, outImg=None))

### Premier essai : peut-on séparer les montres des tasses à café?

In [None]:
# Downscaled images (default factor) of every product labelled 'Coffee Mugs'.
coffee_mugs = [
    scale_down(Image.open(path))
    for path in df.loc[df['label'] == 'Coffee Mugs', 'path'].to_list()
]

In [None]:
# Downscaled images (default factor) of every product labelled 'Wrist Watches'.
watches = [
    scale_down(Image.open(path))
    for path in df.loc[df['label'] == 'Wrist Watches', 'path'].to_list()
]

In [None]:
coffee_mugs[0]

In [None]:
watches[0]

In [None]:
kp1, desc1 = features(np.array(watches[0]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(watches[0]), kp1, None))

In [None]:
# Collect the descriptors of every watch and mug image, each converted to
# grayscale ('L') first. Images for which no descriptors are returned
# (desc is None) are skipped, so the list may be shorter than the image list.
descriptor_list = list()
for im in watches + coffee_mugs:
    im = im.convert('L')
    kp, desc = features(np.array(im), extractor)
    if (desc is not None):
        descriptor_list.append(desc)

In [None]:
len(descriptor_list)

In [None]:
len(watches)

In [None]:
# Stack the per-image descriptor arrays into a single array.
# NOTE(review): this rebinds `descriptor_list` from a list to an ndarray, so
# re-running this cell alone would concatenate the already-stacked array.
descriptor_list = np.concatenate(descriptor_list)

In [None]:
descriptor_list.shape

In [None]:
descriptor_list

In [None]:
# Cluster all descriptors into 800 groups; build_histogram() later uses these
# clusters as the visual vocabulary.
# NOTE(review): no random_state is passed, so cluster assignments may differ
# between runs — confirm whether reproducibility matters here.
kmeans = MiniBatchKMeans(n_clusters=800, init_size=3000)
kmeans.fit(descriptor_list)

In [None]:
# Project the descriptors onto 3 principal components and attach each
# descriptor's k-means cluster label.
pca = PCA(n_components=3)
pca_res = pca.fit_transform(descriptor_list)
pca_res = pd.DataFrame(pca_res)
pca_res['kmeans'] = kmeans.labels_

# Scatter the first two components, coloured by cluster; the legend is
# removed from the axes after plotting.
fig, ax = plt.subplots(1, figsize=(12, 8))
# px.scatter_3d(data_frame=pca_res, x=0, y=1, z=2, color='kmeans')
sns.scatterplot(data=pca_res, x=0, y=1, hue='kmeans', ax=ax)
ax.legend_.remove()
plt.show()


In [None]:
hist, bin_edges = np.histogram(descriptor_list, bins=800)

In [None]:
len(hist)

In [None]:
plt.bar(bin_edges[:-1], hist)

In [None]:
from collections import Counter


def build_histogram(descriptor, kmeans):
    """Count how many descriptors fall into each k-means cluster.

    ``kmeans.predict(descriptor)`` yields one cluster label per descriptor;
    the result is a ``Counter`` mapping each label to its number of
    occurrences.
    """
    return Counter(kmeans.predict(descriptor))

In [None]:
# Encode each watch / mug image as a bag of visual words: the count of its
# descriptors per k-means cluster. Images without descriptors are skipped.
preprocessed_images = []
for image in watches + coffee_mugs:
    image = image.convert('L')
    _, desc = features(np.array(image), extractor)
    if desc is not None:
        # Named `word_counts` rather than `histogram` so the plotting helper
        # histogram() defined earlier in the notebook is not overwritten.
        word_counts = build_histogram(desc, kmeans)
        preprocessed_images.append(word_counts)

In [None]:
bofvw = pd.DataFrame.from_records(preprocessed_images)

In [None]:
bofvw.fillna(0, inplace=True)

In [None]:
bofvw

In [None]:
# Reduce the bag-of-visual-words matrix to 20 principal components.
# NOTE(review): the result is named `pca_50` although n_components=20, and
# the t-SNE cell further down is fitted on `bofvw` rather than on this
# reduced matrix — confirm which input is intended.
pca = PCA(n_components=20)
pca_50 = pca.fit_transform(bofvw)

In [None]:
pca_50.shape

In [None]:
# Two-dimensional t-SNE embedding fitted directly on the bag-of-visual-words
# counts.
# NOTE(review): no random_state is passed, so the layout may change between
# runs.
tsne = TSNE(n_components=2)
tsne_res = tsne.fit_transform(bofvw)

In [None]:
tsne_res = pd.DataFrame(tsne_res)
# Label every row 'watch', then relabel rows from index 149 onwards as 'Mugs'.
# NOTE(review): 149 is hard-coded and presumably equals the number of watch
# images that produced descriptors — verify, because images without
# descriptors are dropped when the bag-of-words rows are built, which would
# shift this boundary.
tsne_res['label']= 'watch'
tsne_res.loc[149:, 'label'] = 'Mugs'

# Scatter the two t-SNE dimensions, coloured by category.
fig, ax = plt.subplots(1, figsize=(12, 8))
# px.scatter_3d(data_frame=pca_res, x=0, y=1, z=2, color='kmeans')
sns.scatterplot(data=tsne_res, x=0, y=1, hue='label', ax=ax)
# ax.legend_.remove()
plt.show()

**warning** A ce stade rien ne va plus...

Les features extraites sont communes aux deux catégories....

D'où vient l'erreur? 

   * Algorithme très bon mais pas adapté à ce genre de tâches:
       * Bon pour créer des photos panoramiques
       * Bon pour détecter le même objet dans des conditions différentes

In [None]:
# Load the example images, grouped by the sub-directory that holds them, and
# halve each image's dimensions.
root = '../data/external/example_lafayette/'
items = dict()
for dirname, _, filenames in os.walk(root):
    if filenames:
        # Key = directory path relative to the root, with path separators
        # turned into '-' (e.g. 'watches-1'). relpath + os.sep produce the
        # same keys on every OS; splitting on '/' and replacing '\\' only
        # gave 'watches-1' with Windows-style separators.
        relative = os.path.relpath(dirname, root)
        key = '' if relative == os.curdir else relative.replace(os.sep, '-')
        items[key] = [
            scale_down(Image.open(os.path.join(dirname, x)), factor=2)
            for x in filenames
        ]

In [None]:
items['watches-1'][1]

In [None]:
kp1, desc1 = features(np.array(items['watches-1'][0]), extractor)
kp2, desc2 = features(np.array(items['watches-1'][1]), extractor)

In [None]:
Image.fromarray(cv2.drawKeypoints(np.array(items['watches-1'][0]), kp1, None))

In [None]:
Image.fromarray(cv2.drawKeypoints(np.array(items['watches-1'][1]), kp2, None))

In [None]:
# Brute-force matcher using Hamming distance, with cross-checking enabled.
bf = cv2.BFMatcher_create(cv2.NORM_HAMMING, crossCheck=True)

# Sort by ascending distance so the slice below really keeps the 30 closest
# matches (the sorted list used to be stored under a misspelled name and was
# never used, so an unsorted slice was drawn).
matches = sorted(bf.match(desc1, desc2), key=lambda x: x.distance)

Image.fromarray(cv2.drawMatches(np.array(items['watches-1'][0]), kp1,
                                np.array(items['watches-1'][1]), kp2, matches[:30], flags=2, outImg=None))

In [None]:
# FLANN parameters
# NOTE(review): a KD-tree index on float-cast descriptors compares them with
# Euclidean distance; OpenCV's documentation suggests an LSH index for binary
# descriptors such as ORB — consider switching.
FLANN_INDEX_KDTREE = 0
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)   # or pass empty dictionary

flann = cv2.FlannBasedMatcher(index_params, search_params)

# The ratio test only needs the two nearest neighbours; k=2 also keeps each
# row of matches the same length as its [0, 0] mask row below (k=3 produced
# rows longer than their mask).
matches = flann.knnMatch(np.float32(desc1), np.float32(desc2), k=2)

# Need to draw only good matches, so create a mask
matchesMask = [[0, 0] for _ in matches]

# ratio test as per Lowe's paper
for i, pair in enumerate(matches):
    if len(pair) < 2:
        # Fewer than two neighbours returned: the ratio is undefined.
        continue
    m, n = pair
    if m.distance < 0.7 * n.distance:
        matchesMask[i] = [1, 0]

draw_params = dict(matchColor=(0, 255, 0),
                   singlePointColor=(255, 0, 0),
                   matchesMask=matchesMask,
                   flags=0)

Image.fromarray(cv2.drawMatchesKnn(np.array(items['watches-1'][0]), kp1,
                                   np.array(items['watches-1'][1]), kp2,
                                   matches, None, **draw_params))



In [None]:
kp1, desc1 = features(np.array(watches[0]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(watches[0]), kp1, None))

In [None]:
kp2, desc2 = features(np.array(watches[1]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(watches[1]), kp2, None))

In [None]:
# FLANN parameters
# NOTE(review): a KD-tree index on float-cast descriptors compares them with
# Euclidean distance; OpenCV's documentation suggests an LSH index for binary
# descriptors such as ORB — consider switching.
FLANN_INDEX_KDTREE = 0
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)   # or pass empty dictionary

flann = cv2.FlannBasedMatcher(index_params, search_params)

# The ratio test only needs the two nearest neighbours; k=2 also keeps each
# row of matches the same length as its [0, 0] mask row below (k=3 produced
# rows longer than their mask).
matches = flann.knnMatch(np.float32(desc1), np.float32(desc2), k=2)

# Need to draw only good matches, so create a mask
matchesMask = [[0, 0] for _ in matches]

# ratio test as per Lowe's paper
for i, pair in enumerate(matches):
    if len(pair) < 2:
        # Fewer than two neighbours returned: the ratio is undefined.
        continue
    m, n = pair
    if m.distance < 0.7 * n.distance:
        matchesMask[i] = [1, 0]

draw_params = dict(matchColor=(0, 255, 0),
                   singlePointColor=(255, 0, 0),
                   matchesMask=matchesMask,
                   flags=0)

Image.fromarray(cv2.drawMatchesKnn(np.array(watches[0]), kp1,
                                   np.array(watches[1]), kp2,
                                   matches, None, **draw_params))



In [None]:
kp1, desc1 = features(np.array(watches[0]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(watches[0]), kp1, None))

In [None]:
kp2, desc2 = features(np.array(coffee_mugs[0]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(coffee_mugs[0]), kp2, None))

In [None]:
# FLANN parameters
# NOTE(review): a KD-tree index on float-cast descriptors compares them with
# Euclidean distance; OpenCV's documentation suggests an LSH index for binary
# descriptors such as ORB — consider switching.
FLANN_INDEX_KDTREE = 0
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)   # or pass empty dictionary

flann = cv2.FlannBasedMatcher(index_params, search_params)

# The ratio test only needs the two nearest neighbours; k=2 also keeps each
# row of matches the same length as its [0, 0] mask row below (k=3 produced
# rows longer than their mask).
matches = flann.knnMatch(np.float32(desc1), np.float32(desc2), k=2)

# Need to draw only good matches, so create a mask
matchesMask = [[0, 0] for _ in matches]

# ratio test as per Lowe's paper
for i, pair in enumerate(matches):
    if len(pair) < 2:
        # Fewer than two neighbours returned: the ratio is undefined.
        continue
    m, n = pair
    if m.distance < 0.7 * n.distance:
        matchesMask[i] = [1, 0]

draw_params = dict(matchColor=(0, 255, 0),
                   singlePointColor=(255, 0, 0),
                   matchesMask=matchesMask,
                   flags=0)

Image.fromarray(cv2.drawMatchesKnn(np.array(watches[0]), kp1,
                                   np.array(coffee_mugs[0]), kp2,
                                   matches, None, **draw_params))



In [None]:
kp1, desc1 = features(np.array(coffee_mugs[0]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(coffee_mugs[0]), kp1, None))

In [None]:
kp2, desc2 = features(np.array(coffee_mugs[1]), extractor)
Image.fromarray(cv2.drawKeypoints(np.array(coffee_mugs[1]), kp2, None))

In [None]:
# FLANN parameters
# NOTE(review): a KD-tree index on float-cast descriptors compares them with
# Euclidean distance; OpenCV's documentation suggests an LSH index for binary
# descriptors such as ORB — consider switching.
FLANN_INDEX_KDTREE = 0
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)   # or pass empty dictionary

flann = cv2.FlannBasedMatcher(index_params, search_params)

# The ratio test only needs the two nearest neighbours; k=2 also keeps each
# row of matches the same length as its [0, 0] mask row below (k=3 produced
# rows longer than their mask).
matches = flann.knnMatch(np.float32(desc1), np.float32(desc2), k=2)

# Need to draw only good matches, so create a mask
matchesMask = [[0, 0] for _ in matches]

# ratio test as per Lowe's paper
for i, pair in enumerate(matches):
    if len(pair) < 2:
        # Fewer than two neighbours returned: the ratio is undefined.
        continue
    m, n = pair
    if m.distance < 0.7 * n.distance:
        matchesMask[i] = [1, 0]

draw_params = dict(matchColor=(0, 255, 0),
                   singlePointColor=(255, 0, 0),
                   matchesMask=matchesMask,
                   flags=0)

Image.fromarray(cv2.drawMatchesKnn(np.array(coffee_mugs[0]), kp1,
                                   np.array(coffee_mugs[1]), kp2,
                                   matches, None, **draw_params))



In [None]:
Image.fromarray(np.array(coffee_mugs[1]) * 3)

In [None]:
Image.fromarray(np.array(coffee_mugs[0]) * 3)

In [None]:
np.array(watches[0]).mean()

In [None]:
np.array(coffee_mugs[0]).mean()

In [None]:
np.array(images[1]).mean()

In [None]:
(np.array(images[1]) * 3).mean()

In [None]:
Image.fromarray(np.array(images[1]) * 3)