In [None]:
import os
from collections import Counter
import re

from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import nltk
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA


# Enlarge seaborn's default font scale for all following figures.
sns.set(font_scale=1.7)

In [None]:
# Locate the dataset: the last directory under ../data/raw/ that contains
# exactly one file is taken to hold the CSV.  The path is remembered during
# the walk and the file is read once afterwards.
csv_path = None
for dirname, _, filenames in os.walk('../data/raw/'):
    if len(filenames) == 1:
        csv_path = os.path.join(dirname, filenames[0])

# Fail here with a clear message instead of a NameError on `df` further down.
if csv_path is None:
    raise FileNotFoundError(
        "no directory with exactly one file found under '../data/raw/'")

df = pd.read_csv(csv_path)

In [None]:
# Columns that are not used by the text analysis below.
col_to_drop = [
    'uniq_id',
    'crawl_timestamp',
    'product_url',
    'pid',
    'discounted_price',
    'is_FK_Advantage_product',
    'product_rating',
    'overall_rating',
    'product_specifications',
    'brand',
]
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=col_to_drop, errors='ignore')

In [None]:
# Inspect the frame after dropping the unused columns.
df

In [None]:
def extract_level(tree_str, level=-1, strict=False):
    """Return one level of a ``product_category_tree`` value.

    Parameters
    ----------
    tree_str : str
        Text of a Python list literal whose first element is a category
        path with levels separated by ``>>``, e.g. ``'["A >> B >> C"]'``.
    level : int, default -1
        Index of the level to return.  Negative indices count from the
        end of the path, so ``-1`` is the deepest level.
    strict : bool, default False
        When True, a missing level raises ``IndexError``.  When False,
        ``None`` is returned instead.

    Returns
    -------
    str or None
        The requested level with surrounding whitespace removed.

    Raises
    ------
    IndexError
        If ``strict`` is True and the level does not exist.
    ValueError, SyntaxError
        If ``tree_str`` is not a plain Python literal.
    """
    import ast  # local import: keeps this cell self-contained

    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute arbitrary code (which eval() would allow).
    path = ast.literal_eval(tree_str)[0]
    levels = [part.strip() for part in path.split('>>')]
    if strict:
        # Let the IndexError propagate when the level doesn't exist.
        return levels[level]
    try:
        return levels[level]
    except IndexError:
        # Non-strict mode: a missing level is reported as None.
        return None

In [None]:
# Level at index 1 of each category path; None when the path is shorter.
df['lev_1'] = df['product_category_tree'].apply(extract_level, level=1)

In [None]:
# Distinct values found at level 1.
df['lev_1'].unique()

In [None]:
# Distinct values one level deeper (index 2), for comparison.
df['product_category_tree'].apply(extract_level, level=2).unique()

In [None]:
# Same extraction as `lev_1` (level=1), stored under the name `label`.
df['label'] = df['product_category_tree'].apply(extract_level, level=1)

### Essais

In [None]:
# Tokenize the raw descriptions with NLTK and keep the result in a `word tk` column.
df['word tk'] = df['description'].apply(nltk.word_tokenize)

In [None]:
# Raw descriptions as a plain Python list, one string per row of `df`.
sentences = df['description'].to_list()

In [None]:
# Preview only the first few descriptions: displaying the whole list would
# print every description into the cell output.
sentences[:5]

###  normalisation

In [None]:
# Patterns used to clean the descriptions.
punctuation = re.compile(r'[.,;!?:()/&-]+')
numeric = re.compile(r'\d+')

# For every description: lower-case it, turn each run of punctuation into a
# single space, then strip all digits.
sentences = [
    numeric.sub('', punctuation.sub(' ', str.lower(text)))
    for text in sentences
]

### tokenisation

In [None]:
# One list of NLTK word tokens per normalized description.
tokens = [nltk.word_tokenize(sentence) for sentence in sentences]

In [None]:
# Preview only the first few token lists: displaying the whole nested list
# would print every token of every description.
tokens[:5]

### filter stop words

In [None]:
# Load the English stop-word list once and keep it as a set: the original
# re-read the list and scanned it linearly for every single token.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Same nesting as `tokens` (one list per description), stop words removed.
filtered = [
    [word for word in sentence if word not in stop_words]
    for sentence in tokens
]

In [None]:
# Continue the pipeline with the stop-word-filtered tokens.
tokens = filtered

### stemming

In [None]:
# Reduce every token to its Porter stem, keeping one list per description.
stemmer = nltk.stem.PorterStemmer()
token_stem = [[stemmer.stem(token) for token in desc] for desc in tokens]
tokens = token_stem

### bag of words

In [None]:
# One token -> occurrence-count mapping per description.
bag_of_words = [Counter(doc_tokens) for doc_tokens in tokens]


In [None]:
# Document-term matrix: one row per description, one column per token.
# Tokens absent from a description get a count of 0 instead of NaN.
df_bofw = pd.DataFrame.from_records(bag_of_words).fillna(0)
df_bofw

In [None]:
# Tokens whose total count over all descriptions is below 5.
token_totals = df_bofw.sum(axis=0)
token_totals[token_totals < 5]

In [None]:
# Distribution of the total count per token (column sums of the bag of words).
# NOTE(review): `distplot` is deprecated in recent seaborn releases — confirm the pinned version.
sns.distplot(df_bofw.sum(axis=0))

In [None]:
# Same distribution, with each token total divided by the grand total.
sns.distplot(df_bofw.sum(axis=0) / df_bofw.sum(axis=0).sum())

In [None]:
# Reduce the raw counts to 50 principal components, then embed them in 2-D
# with t-SNE.  Both steps are seeded so that Restart & Run All reproduces the
# same embedding (t-SNE is stochastic by default).
pca_res = PCA(n_components=50, random_state=0).fit_transform(df_bofw)

tsne = TSNE(n_components=2, perplexity=30, learning_rate=10, n_iter=2500,
            random_state=0)
tsne_res = tsne.fit_transform(pca_res)

# Attach the category and product name for colouring / hover text.  Rows are
# matched on the index of `df`.
tsne_res = pd.DataFrame(tsne_res)
tsne_res['label'] = df['lev_1']
tsne_res['product_name'] = df['product_name']


In [None]:
# Interactive scatter of the 2-D t-SNE embedding, coloured by level-1 category.
px.scatter(data_frame=tsne_res, x=0, y=1, color='label', hover_name='product_name')

À ce stade, certains tokens sont trop fréquents et d'autres trop rares.

In [None]:
# as frequencies: divide every row by its own total
# NOTE(review): a row whose total is 0 would turn into NaN — confirm that no
# description ends up with zero tokens after filtering.
df_bofw = df_bofw.div(df_bofw.sum(axis=1), axis=0)

In [None]:
# Inspect the normalized frame.
df_bofw

In [None]:
# Summary statistics of the per-token maximum frequency.
df_bofw.describe().loc['max', :].describe()

In [None]:
# Raw description of the first product, for reference.
df.iloc[0].description

In [None]:
# Sum of the per-description frequencies of each token.
df_bofw.sum(axis=0)

In [None]:
# Distribution of those per-token frequency sums.
sns.distplot(df_bofw.sum(axis=0))

In [None]:
# Per-token sum of frequencies, computed once and reused for both thresholds.
token_mass = df_bofw.sum(axis=0)
too_frequent = token_mass[token_mass > 10].index
too_rare = token_mass[token_mass < 2e-2].index

In [None]:
# Remove all too-frequent and too-rare token columns in a single call instead
# of one in-place drop per column.
df_bofw = df_bofw.drop(columns=list(too_frequent) + list(too_rare))


In [None]:
# Distribution of per-token sums after removing the too-frequent and too-rare tokens.
sns.distplot(df_bofw.sum(axis=0))

In [None]:
# Remaining (rows, token columns) dimensions.
df_bofw.shape

In [None]:
# Same pipeline as before on the filtered frequencies, this time keeping 200
# principal components.  Both steps are seeded so that the embedding is
# reproducible across kernel restarts.
pca_res = PCA(n_components=200, random_state=0).fit_transform(df_bofw)

tsne = TSNE(n_components=2, perplexity=30, learning_rate=10, n_iter=2500,
            random_state=0)
tsne_res = tsne.fit_transform(pca_res)

# Attach the category and product name for colouring / hover text.
tsne_res = pd.DataFrame(tsne_res)
tsne_res['label'] = df['lev_1']
tsne_res['product_name'] = df['product_name']

px.scatter(data_frame=tsne_res, x=0, y=1, color='label', hover_name='product_name')

In [None]:
# Centroid of every label in the 2-D embedding.  Only the two coordinate
# columns are averaged: `product_name` holds strings, and pandas >= 2.0 raises
# on mean() over non-numeric columns instead of silently dropping them.
centers = tsne_res.groupby('label')[[0, 1]].mean().reset_index()

# Computed once: neither the group sizes nor the label-indexed centroids
# change while plotting.
group_sizes = tsne_res.groupby('label')[0].count()
center_xy = centers.set_index('label')

_, ax = plt.subplots(1, figsize=(24, 18))
palette = sns.color_palette(None, centers.shape[0])
for i, center in enumerate(centers['label']):
    # Only labels with more than 10 points are drawn.
    if group_sizes.loc[center] > 10:
        in_group = tsne_res['label'] == center
        ax.scatter(x=tsne_res.loc[in_group, 0],
                   y=tsne_res.loc[in_group, 1],
                   color=palette[i])
        # Write the label name at the centroid, in the colour of its points.
        ax.annotate(center, center_xy.loc[center, :].values, color=palette[i])
plt.show()

In [None]:
# t-SNE applied directly to the filtered frequencies (no PCA pre-reduction).
# Seeded so that the embedding is reproducible across kernel restarts.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=10, n_iter=2500,
            random_state=0)
tsne_res = tsne.fit_transform(df_bofw)

tsne_res = pd.DataFrame(tsne_res)
tsne_res['label'] = df['lev_1']
tsne_res['product_name'] = df['product_name']

# Centroid of every label.  Only the two coordinate columns are averaged:
# `product_name` holds strings, and pandas >= 2.0 raises on mean() over
# non-numeric columns instead of silently dropping them.
centers = tsne_res.groupby('label')[[0, 1]].mean().reset_index()

# Computed once instead of once per label inside the loop.
group_sizes = tsne_res.groupby('label')[0].count()
center_xy = centers.set_index('label')

_, ax = plt.subplots(1, figsize=(24, 18))
palette = sns.color_palette(None, centers.shape[0])
for i, center in enumerate(centers['label']):
    # Only labels with more than 10 points are drawn.
    if group_sizes.loc[center] > 10:
        in_group = tsne_res['label'] == center
        ax.scatter(x=tsne_res.loc[in_group, 0],
                   y=tsne_res.loc[in_group, 1],
                   color=palette[i])
        # Write the label name at the centroid, in the colour of its points.
        ax.annotate(center, center_xy.loc[center, :].values, color=palette[i])

In [None]:
# Project the filtered frequencies on their first two principal components.
pca_res = PCA(n_components=2).fit_transform(df_bofw)

pca_res = pd.DataFrame(pca_res)
pca_res['label'] = df.label

# Centroid of every label in PCA space.
centers = pca_res.groupby('label')[[0, 1]].mean().reset_index()

# Computed once instead of once per label inside the loop.
group_sizes = pca_res.groupby('label')[0].count()
center_xy = centers.set_index('label')

_, ax = plt.subplots(1, figsize=(24, 18))
palette = sns.color_palette(None, centers.shape[0])
for i, center in enumerate(centers['label']):
    # Only labels with more than 10 points are drawn.
    if group_sizes.loc[center] > 10:
        # Both coordinates and the row mask come from `pca_res`.  The original
        # took x and the mask from `tsne_res`, which mixed t-SNE x values with
        # PCA y values and did not match the PCA centroids.
        in_group = pca_res['label'] == center
        ax.scatter(x=pca_res.loc[in_group, 0],
                   y=pca_res.loc[in_group, 1],
                   color=palette[i])
        # Write the label name at the centroid, in the colour of its points.
        ax.annotate(center, center_xy.loc[center, :].values, color=palette[i])

In [None]:
import scikitplot as skplt

In [None]:
# Fit a full PCA (all components kept) to look at the variance explained by
# each component.
pca = PCA()
pca.fit(df_bofw)

# `fig` (the original bound the figure to the misleading name `fit`).
fig, ax = plt.subplots(1, figsize=(12, 8))
skplt.decomposition.plot_pca_component_variance(pca, ax=ax)

plt.show()

In [None]:
# Density-based clustering of the 2-D t-SNE embedding.
db = DBSCAN(eps=.4, min_samples=5)
# Select the two coordinate columns by name.  The original `iloc[:, :-3]`
# only kept column 0 on a fresh run, because `tsne_res` has four columns
# (0, 1, label, product_name) until `db_group` is added in a later cell.
db.fit(tsne_res[[0, 1]])

labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

In [None]:
# Store the DBSCAN cluster ids as strings — presumably so that plotly colours
# them as discrete groups rather than on a continuous scale.
tsne_res['db_group'] = db.labels_.astype('str')

In [None]:
# t-SNE embedding coloured by DBSCAN cluster.
px.scatter(data_frame=tsne_res, x=0, y=1, color='db_group', hover_name='product_name')

### Visualisation dans TensorBoard

In [None]:
from PIL import Image
import cv2

# Relative path of each product image, built from the `image` column.
df['path'] = df['image'].apply(lambda x: os.path.join('../data/raw/Images/', x))


In [None]:
 def images_to_sprite(data):
        """Tile a batch of images into one square sprite image.

        Each image is rescaled independently to the full 0-255 range, then
        the images are laid out row by row on an n x n grid, where n is the
        smallest integer with n * n >= N.  Unused grid cells are black.

        Args:
          data: NxHxW[x3] array containing the images.  Grayscale input
            (3 dimensions) is replicated over three channels.

        Returns:
          data: (n*H)x(n*W)x3 uint8 image holding all thumbnails.
        """
        if len(data.shape) == 3:
            data = np.tile(data[..., np.newaxis], (1, 1, 1, 3))
        data = data.astype(np.float32)

        # Per-image min/max, shaped (N, 1, 1, 1) so they broadcast over H, W, C.
        per_image = (data.shape[0], 1, 1, 1)
        lowest = data.reshape((data.shape[0], -1)).min(axis=1).reshape(per_image)
        data = data - lowest
        highest = data.reshape((data.shape[0], -1)).max(axis=1).reshape(per_image)
        # A constant image has a zero range: divide by 1 so that it stays all
        # zeros instead of producing 0/0 = NaN.
        highest[highest == 0] = 1
        data = data / highest

        # Pad the batch with black images up to a full n x n grid.
        n = int(np.ceil(np.sqrt(data.shape[0])))
        padding = ((0, n ** 2 - data.shape[0]), (0, 0),
                (0, 0)) + ((0, 0),) * (data.ndim - 3)
        data = np.pad(data, padding, mode='constant',
                constant_values=0)
        # Tile the individual thumbnails into an image.
        data = data.reshape((n, n) + data.shape[1:]).transpose((0, 2, 1, 3)
                + tuple(range(4, data.ndim + 1)))
        data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
        data = (data * 255).astype(np.uint8)
        return data

In [None]:
from tensorboard.plugins import projector
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# transform bag of words in tensorflow's data
tf_data = tf.Variable(df_bofw.values, name='features')

LOG_DIR = '../reports/tf/sessions/text/'

# Make sure the log directory exists (on a fresh checkout it may not), then
# empty it so that files from a previous run cannot be picked up.
os.makedirs(LOG_DIR, exist_ok=True)
for dirname, _, filenames in os.walk(LOG_DIR):
    for filename in filenames:
        os.remove(os.path.join(dirname, filename))

metadata = 'df_labels.tsv'


# prepare sprites
# Every image is converted to RGB so that all thumbnails share the same
# (200, 200, 3) shape, and each file handle is closed right after reading.
thumbnails = []
for image_path in df['path']:
    with Image.open(image_path) as img:
        thumbnails.append(np.array(img.convert('RGB').resize((200, 200))))
images = np.array(thumbnails)
sprite = images_to_sprite(images)
# Saved with PIL because the thumbnails are in RGB channel order;
# cv2.imwrite expects BGR and would swap red and blue in the sprite.
Image.fromarray(sprite).save(os.path.join(LOG_DIR, 'sprite_4_classes.png'))

# save the metadata file
df['db_group'] = tsne_res['db_group']

# The metadata has several columns, so it is written tab-separated with a
# header row.
df[['product_name', 'label', 'db_group']].to_csv(os.path.join(LOG_DIR, metadata),
                                                 index=False, header=True, sep='\t')


In [None]:
# old style from tf <= 2
# fake a session to create checkpoint
# finally add projector to the session writer.
with tf.Session() as sess:
    # Initialise the `features` variable and save it as a checkpoint in LOG_DIR.
    saver = tf.train.Saver([tf_data])
    sess.run(tf_data.initializer)
    saver.save(sess, os.path.join(LOG_DIR, 'tf_data.ckpt'))
    config = projector.ProjectorConfig()
    
    # One embedding entry, pointing at the saved variable by name.
    embedding = config.embeddings.add()
    embedding.tensor_name = tf_data.name
    
    # Sprite and metadata paths are given relative to LOG_DIR.
    embedding.sprite.image_path = 'sprite_4_classes.png'
    # NOTE(review): both entries use images.shape[1]; this is only correct
    # because the thumbnails are square (200x200) — confirm the order the
    # projector expects if the thumbnail size ever changes.
    embedding.sprite.single_image_dim.extend([images.shape[1], images.shape[1]])

    embedding.metadata_path = metadata
    
    # Write the projector configuration next to the checkpoint.
    projector.visualize_embeddings(tf.summary.FileWriter(LOG_DIR), 
                                   config)

### word to vec 

In [None]:
# Skip-gram (sg=1) Word2Vec on the stemmed tokens: 50-dimensional vectors,
# context window of 3, every token kept (min_count=1).
# NOTE(review): `size=` is the gensim 3.x keyword (renamed `vector_size` in
# gensim 4) — confirm the pinned gensim version.
model = Word2Vec(tokens, min_count=1, size=50, workers=3, window=3, sg=1)

In [None]:
# Example of a processed (filtered and stemmed) description.
tokens[10]

In [None]:
# Similarity between the vectors of the stemmed tokens 'key' and 'featur'.
model.wv.similarity('key', 'featur')

In [None]:
# Similarity between 'watch' and 'analog'.
model.wv.similarity('watch', 'analog')

In [None]:
# Similarity between 'key' and 'watch'.
model.wv.similarity('key', 'watch')

In [None]:
# Similarity between 'key' and 'analog'.
model.wv.similarity('key', 'analog')

In [None]:
# Embedding vector learned for the token 'key'.
model.wv.get_vector('key')

### LDA et NMF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF

# Settings for the topic model: vocabulary size, number of topics and number
# of words shown per topic.
no_features = 1000
no_topics = 50
no_to_words = 10

# TF-IDF over the normalized descriptions: English stop words removed, terms
# present in more than 95% of the documents or in fewer than 2 documents
# discarded, vocabulary capped at `no_features` terms.
tfid_vectorizer = TfidfVectorizer(stop_words='english',
                                  max_features=no_features,
                                  min_df=2,
                                  max_df=0.95)
tfid = tfid_vectorizer.fit_transform(sentences)
tfid_feature_names = tfid_vectorizer.get_feature_names()

# Factorize the TF-IDF matrix into `no_topics` topics.
nmf = NMF(n_components=no_topics, init='nndsvd', alpha=.1, l1_ratio=.5)
nmf.fit(tfid)

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    For each row of ``model.components_`` a header line ``Topic <index>:`` is
    printed, followed by one line with the ``no_top_words`` terms of largest
    weight, in decreasing order of weight and separated by spaces.
    ``feature_names`` maps a column index of ``components_`` to its term.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Column indices sorted by decreasing weight, truncated to the top ones.
        strongest = topic.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in strongest]
        print("Topic %d:" % topic_idx)
        print(" ".join(top_terms))

In [None]:
# Print the top `no_to_words` terms of every NMF topic.
display_topics(nmf, tfid_feature_names, no_to_words)