In [None]:
import ast
from collections import Counter
import os
import random
import re

import cv2
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import scikitplot as skplt
import seaborn as sns
from PIL import Image, ImageOps, ImageFilter
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler


# Apply seaborn's default theme with an enlarged font scale to the figures below.
sns.set(font_scale=1.6)

# Override Pillow's MAX_IMAGE_PIXELS threshold (used by its decompression-bomb
# check) — presumably so the largest raw product image opens cleanly; TODO confirm.
Image.MAX_IMAGE_PIXELS = 93680329


In [None]:
# Walk the raw-data tree and load the product table: every directory that
# holds exactly one file has that file read as CSV into `df` (if several
# directories qualify, the last one visited wins).
for folder, _, files_here in os.walk('../data/raw/'):
    if len(files_here) != 1:
        continue
    csv_path = os.path.join(folder, files_here[0])
    df = pd.read_csv(csv_path)

In [None]:
# Build the on-disk location of every product picture from its file name.
df['path'] = df['image'].map(
    lambda image_name: os.path.join('../data/raw/Images/', image_name)
)

In [None]:
# Columns removed from `df` before feature extraction; none of them is read by
# the cells visible in this notebook.
col_to_drop = [
    'uniq_id', 'crawl_timestamp', 'product_url', 'pid',
    'discounted_price', 'is_FK_Advantage_product',
    'product_rating', 'overall_rating',
    'product_specifications', 'brand',
]
# In-place removal: `df` itself is modified.
df.drop(labels=col_to_drop, axis='columns', inplace=True)

In [None]:
def extract_level(tree_str, level=-1, strict=False):
    """Return one level of a ``product_category_tree`` value.

    ``tree_str`` is the textual form of a Python list literal whose first
    element is a category path with levels separated by ``'>>'``, e.g.
    ``'["Home >> Kitchen >> Tools"]'``.

    Parameters
    ----------
    tree_str : str
        String holding the list literal to parse.
    level : int, default -1
        Index of the level to return. Negative indices count from the end,
        so the default returns the deepest level.
    strict : bool, default False
        When True, an out-of-range ``level`` raises ``IndexError``.
        When False, ``None`` is returned instead.

    Returns
    -------
    str or None
        The requested level with surrounding whitespace stripped, or ``None``
        when the level does not exist and ``strict`` is False.

    Raises
    ------
    IndexError
        If ``strict`` is True and ``level`` is out of range.
    ValueError, SyntaxError
        If ``tree_str`` is not a valid Python literal.
    """
    # literal_eval only accepts Python literals, so text coming from the CSV
    # can never execute code (which a plain eval() would allow).
    category_path = ast.literal_eval(tree_str)[0]
    levels = [part.strip() for part in category_path.split('>>')]
    if strict:
        return levels[level]
    try:
        return levels[level]
    except IndexError:
        return None

In [None]:
# Use the level at index 1 of each category tree (None when the tree has no
# such level) as the label, then display the resulting column.
df['label'] = df['product_category_tree'].map(
    lambda tree: extract_level(tree, level=1)
)
df['label']

In [None]:
# Count how many products fall under each first-level (index 0) category.
root_categories = df['product_category_tree'].apply(extract_level, level=0)
root_categories.value_counts()

In [None]:
import keras
from keras.applications.resnet50 import ResNet50 

In [None]:
from keras.applications.resnet50 import preprocess_input

# ResNet50 with ImageNet weights, truncated at the 'avg_pool' layer so that
# predict() returns that layer's output instead of the class scores.
base_model = ResNet50(weights='imagenet')
model = keras.Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)


def _load_rgb_224(path):
    """Open one image, force 3-channel RGB and resize it to 224x224."""
    # The context manager closes the file handle as soon as the pixels are read.
    with Image.open(path) as img:
        # convert('RGB') guarantees a (224, 224, 3) array even for grayscale,
        # palette, CMYK or RGBA files, so the batch below stacks cleanly.
        return np.array(img.convert('RGB').resize((224, 224)))


image_224 = np.array([_load_rgb_224(x) for x in df['path']])

# The ImageNet weights expect inputs prepared by the matching preprocess_input.
# astype() hands it a float copy, so the uint8 batch in image_224 is left intact.
features = model.predict(preprocess_input(image_224.astype('float32')))

In [None]:
sentences = df['description'].to_list()

# Lower-case every description and replace each run of punctuation by a space.
punctuation = re.compile(r'[.,;!?:()/&-]+')
sentences = [punctuation.sub(' ', text.lower()) for text in sentences]

# remove numeric data
numeric = re.compile(r'\d+')
sentences = [numeric.sub('', text) for text in sentences]
tokens = [nltk.word_tokenize(text) for text in sentences]

# Load the English stop-word list once, as a set: the corpus reader would
# otherwise be queried (and the list scanned linearly) for every single token.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
filtered = [
    [word for word in sentence if word not in english_stopwords]
    for sentence in tokens
]
tokens = filtered

# Reduce every remaining token to its Porter stem.
stemmer = nltk.stem.PorterStemmer()
token_stem = [[stemmer.stem(token) for token in desc] for desc in tokens]
tokens = token_stem

# One Counter per description -> one row per description, one column per stem.
bag_of_words = [Counter(desc) for desc in tokens]

df_bofw = pd.DataFrame.from_records(bag_of_words).fillna(0)

# as frequencies: each row is divided by its own total count
df_bofw = df_bofw.div(df_bofw.sum(axis=1), axis=0)

# Column totals of those per-description frequencies, computed once.
column_totals = df_bofw.sum(axis=0)
too_frequent = column_totals[column_totals > 10].index
too_rare = column_totals[column_totals < 2e-2].index

# Drop every out-of-range stem in a single call.
df_bofw = df_bofw.drop(columns=list(too_frequent) + list(too_rare))

print(df_bofw.shape)

In [None]:
# Put the image features and the bag-of-words frequencies side by side,
# one row per product.
full_features = np.hstack([features, df_bofw.to_numpy()])

In [None]:
# Sanity check: display the shape of the combined feature matrix.
full_features.shape

In [None]:
# Fit a PCA with every component kept and plot the explained variance per
# component.
pca = PCA().fit(full_features)
fig, ax = plt.subplots(figsize=(12, 8))
skplt.decomposition.plot_pca_component_variance(pca, ax=ax)
plt.show()

In [None]:
# Reduce to 1000 principal components, then embed in 2-D with t-SNE.
# t-SNE is stochastic: a fixed random_state makes the embedding (and every
# plot derived from it) identical after Restart & Run All.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to
# `max_iter` — confirm against the installed version.
pca_res = PCA(n_components=1000, random_state=42).fit_transform(full_features)
tsne_res = TSNE(n_components=2, perplexity=30,
                learning_rate=10, n_iter=2500,
                random_state=42).fit_transform(pca_res)

tsne_res = pd.DataFrame(tsne_res)
# Row i of the embedding belongs to row i of df, so attach the labels by
# position rather than by index alignment.
tsne_res['label'] = df['label'].to_numpy()


In [None]:
# Mean t-SNE position of each label, used to place its annotation.
centers = tsne_res.groupby('label').mean()
centers.reset_index(drop=False, inplace=True)

# Loop-invariant lookups, computed once instead of on every iteration.
label_sizes = tsne_res.groupby('label')[0].count()
center_xy = centers.set_index('label')

_, ax = plt.subplots(1, figsize=(24, 18))
palette = sns.color_palette(None, centers.shape[0])
for i, center in enumerate(centers['label']):
    # Only labels with more than 10 products are drawn.
    if label_sizes.loc[center] <= 10:
        continue
    members = tsne_res.loc[tsne_res['label'] == center]
    ax.scatter(x=members[0], y=members[1], color=palette[i])
    # Write the label name at the mean position of its points.
    ax.annotate(center, center_xy.loc[center, :].values, color=palette[i])

ax.set(title='t-SNE embedding of product features, coloured by label',
       xlabel='t-SNE dimension 0', ylabel='t-SNE dimension 1')
plt.show()

###  Exemple basique pour la présentation finale 

In [None]:
# Two sample texts — a single short sentence and a multi-line passage — that
# the following cells push through the text pipeline one step at a time.
test = "Ay, madam, it is common."
test_2 = """ Seems, madam ? Nay, it is. I know not ‘seems’.
   ‘Tis not alone my inky cloak, good mother, 
   Nor customary suits of solemn black,
   Nor windy suspiration of forc’d breath,
   No, nor the fruitful river in the eye,
   Nor the dejected haviour of the visage, 
   Together with all forms, moods, shapes of grief,
   That can denote me truly. These indeed seem,
   For they are actions that a man might play ;
   But I have that within which passes show,
   These but the trappings and the suits of woe."""

In [None]:
# Lower-case both samples and turn every line break into a space.
test = test.lower().replace('\n', ' ')
test_2 = test_2.lower().replace('\n', ' ')

In [None]:
# Preview the raw tokenisation of both samples. A cell only displays its last
# expression, so both results are returned together as a tuple.
nltk.tokenize.word_tokenize(test), nltk.tokenize.word_tokenize(test_2)

In [None]:
# Runs of punctuation to blank out, including the typographic quotes ‘ and ’.
# The hyphen is escaped so it is a literal '-': left unescaped between '&' and
# a backslash it forms a character range that also matches digits, upper-case
# letters and several other symbols.
punctuation = re.compile(r'[.,;!?:()/&\-‘’]+')

In [None]:
# Replace every punctuation run in both samples by a single space.
test = punctuation.sub(' ', test)
test_2 = punctuation.sub(' ', test_2)

In [None]:
# Display the passage after the punctuation substitution.
test_2

In [None]:
# Split each lower-cased sample into word tokens.
tokens, tokens_2 = (
    nltk.tokenize.word_tokenize(sample.lower()) for sample in (test, test_2)
)

In [None]:
# Display the tokens of the short sample sentence.
tokens

In [None]:
# Display the tokens of the longer passage.
tokens_2

In [None]:
# Keep only the tokens that are not English stop words. The stop-word list is
# loaded once, as a set, instead of being reloaded for every token.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
filtered = [tk for tk in tokens if tk not in english_stopwords]
tokens = filtered

In [None]:
# Keep only the tokens that are not English stop words. The stop-word list is
# loaded once, as a set, instead of being reloaded for every token.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
filtered_2 = [tk for tk in tokens_2 if tk not in english_stopwords]
tokens_2 = filtered_2

In [None]:
# Display the short sample's tokens once stop words have been removed.
tokens

In [None]:
# Display the passage's tokens once stop words have been removed.
tokens_2

In [None]:
# Stem every token of the passage with a single Porter stemmer instance
# (building a new stemmer for each token is wasted work).
porter = nltk.stem.PorterStemmer()
tokens_2 = [porter.stem(x) for x in tokens_2]

In [None]:
# Display the passage's tokens after stemming.
tokens_2

In [None]:
# One-row bag of words for the passage: one column per stem, holding its count.
stem_counts = Counter(tokens_2)
bofw = pd.DataFrame.from_records([stem_counts])

In [None]:
# Display the one-row bag-of-words table.
bofw

In [None]:
# Quick check: show what the Porter stemmer returns for a single word.
nltk.stem.PorterStemmer().stem("engineering")