In [1]:
from functools import lru_cache

import cv2
from wordcloud import WordCloud
import re
from collections import Counter
from bs4 import BeautifulSoup
import html
#import langid
import py3langid as langid
from spellchecker import SpellChecker
import spacy
import nltk
import os
import pandas as pd
import numpy as np

import unidecode
from transformers import BertTokenizer
import torch

from plotly.subplots import make_subplots
from plotly import graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import plotly.io as pio
# Select the "notebook_connected" renderer so Plotly figures are displayed inline in the notebook.
# (The bare `pio.renderers` expression that preceded this line was a no-op in the middle of a
# cell and has been removed.)
pio.renderers.default = "notebook_connected"

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
# Project-local preprocessing helpers, referred to as `rkt` in the rest of the notebook.
import Rakuten_preprocessing as rkt
from importlib import reload 
# Re-execute the module so that edits made to Rakuten_preprocessing.py are picked up
# without restarting the kernel.
reload(rkt)

<module 'Rakuten_preprocessing' from 'c:\\Users\\Julien Fournier\\Documents\\GitHub\\RakutenTeam\\notebook\\Rakuten_preprocessing.py'>

In [36]:
# Input folder holding the training images and output folder for the processed copies.
# NOTE(review): both paths are absolute and specific to one machine; consider deriving them
# from a configurable data directory so the notebook can run elsewhere.
folder_path = r"C:\Users\Julien Fournier\Documents\DST\RakutenProject\Data\images\image_train"
save_path = r"C:\Users\Julien Fournier\Documents\DST\RakutenProject\Data\images\image_train_cropped"
# img_resize is defined in Rakuten_preprocessing (not visible here); presumably it reads the
# images from folder_path and writes the results to save_path — confirm against the module.
rkt.img_resize(folder_path, save_path=save_path, padding=False, suffix='')

In [3]:
# Load the text dataset through the project helper. Later cells use `data` as a DataFrame
# with (at least) the columns 'designation', 'description', 'designation_translated',
# 'description_translated', 'prdtypecode' and 'language'.
data = rkt.Rakuten_txt_import('../data/translated/')

In [4]:
def _join_text_fields(row):
    """Join the string entries of a row with single spaces, skipping non-string values (e.g. NaN)."""
    return ' '.join([field for field in row if isinstance(field, str)])


# Merge the translated title and description into a single 'text' column.
translated_columns = ['designation_translated', 'description_translated']
data['text'] = data[translated_columns].apply(_join_text_fields, axis=1)

# Once 'text' exists, the original and translated source columns are dropped.
# NOTE: this cell is not idempotent — running it a second time fails because the
# translated columns have already been removed from `data`.
obsolete_columns = ['designation', 'description'] + translated_columns
data = data.drop(columns=obsolete_columns)
data.head(10)

Unnamed: 0,prdtypecode,productid,imageid,language,text
0,10,3804725264,1263597046,de,Olivia : Carnet personnalisé / 150 pages / gri...
1,2280,436067568,1008141237,fr,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,50,201115110,938777978,fr,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,1280,50418756,457047496,en,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,2705,278535884,1077757786,fr,La Guerre Des Tuques Luc a des idées de grande...
5,2280,5862738,393356830,fr,Afrique Contemporaine N° 212 Hiver 2004 - Doss...
6,10,91920807,907794536,de,Christof E : Sur les traces des processus éduc...
7,2522,344240059,999581347,fr,Conquérant Sept Cahier Couverture Polypro 240 ...
8,1280,4239126071,1325918866,fr,Puzzle Scooby-Doo Avec Poster 2x35 Pieces
9,2582,3793572222,1245644185,fr,Tente Pliante V3s5-Pro Pvc Blanc - 3 X 4m50 - ...


In [5]:
# Category lookup table indexed by 'prdtypecode'; its 'prdtypedesignation' column is used
# by the plotting cells to turn category codes into axis labels.
prdtype = pd.read_csv('../data/prdtype.csv', index_col='prdtypecode')

In [9]:
# Number of products per category, ordered from the rarest to the most frequent category.
# Codes are counted as strings and cast back to int to look up the category labels.
sorted_counts = data['prdtypecode'].astype(str).value_counts(ascending=True)
prdlabels = prdtype.loc[sorted_counts.index.astype(int), 'prdtypedesignation']

# Each bar is coloured by the share of the dataset that its category represents.
category_share = sorted_counts / sorted_counts.sum()
count_bars = go.Bar(
    x=prdlabels,
    y=sorted_counts,
    marker=dict(color=category_share, colorscale='Plasma'),
)

f = go.Figure()
f.add_trace(count_bars)

layout_options = dict(
    title='Nombre de produits par categorie',
    width=800,
    height=800,
    xaxis_title='Categorie produit',
    yaxis_title='Nombre de produits',
    xaxis_tickangle=45,
    # NOTE(review): the bar trace does not appear to reference this shared coloraxis —
    # confirm whether this setting has any visible effect.
    coloraxis=dict(colorscale='Plasma', colorbar_title='Bar Height'),
)
f.update_layout(**layout_options)
f.show()

In [10]:
# Number of whitespace-separated words in each product text.
str_len = data['text'].str.split().map(len)

# One box per category: x holds the category label of every product, y its word count.
category_labels = prdtype.loc[data['prdtypecode'].astype(int), 'prdtypedesignation']
f = go.Figure(data=[go.Box(x=category_labels, y=str_len)])

layout_options = dict(
    title='Longueur du texte par categorie',
    width=800,
    height=800,
    xaxis_title='Categorie produit',
    yaxis_title='Nombre de mots',
    xaxis_tickangle=45,
    # NOTE(review): the box trace does not appear to reference this shared coloraxis —
    # confirm whether this setting has any visible effect.
    coloraxis=dict(colorscale='Plasma', colorbar_title='Bar Height'),
)
f.update_layout(**layout_options)

f.show()

In [12]:
# Share of each source language inside every category (each column sums to 1), with the
# categories ordered by increasing proportion of 'fr' texts.
language_shares = pd.crosstab(data['language'], data['prdtypecode'], normalize='columns')
lang_sorted_counts = language_shares.sort_values('fr', axis=1)

# The category labels are the same for every language, so look them up once.
category_labels = prdtype.loc[lang_sorted_counts.columns.astype(int), 'prdtypedesignation']

# One stacked bar series per language, expressed in percent.
f = go.Figure()
for lang, shares in lang_sorted_counts.iterrows():
    f.add_trace(go.Bar(x=category_labels, y=shares * 100, name=lang))

layout_options = dict(
    title="Langue d'origine par categorie de produit",
    width=800,
    height=800,
    barmode='stack',
    xaxis_title='Categorie produit',
    yaxis_title="Pourcentage d'articles",
    xaxis_tickangle=45,
    # NOTE(review): the bar traces do not appear to reference this shared coloraxis —
    # confirm whether this setting has any visible effect.
    coloraxis=dict(colorscale='Plasma', colorbar_title='Bar Height'),
)
f.update_layout(**layout_options)
f.show()

In [13]:
# Label every row of `data` as French ('fr') for tokenisation — presumably because the
# 'text' column was built from the translated fields; confirm against the import helper.
lang = pd.Series('fr', index=data.index)
# Tokenise the merged text with the project helper, using its 'spacy' method.
tokens_spacy = rkt.Rakuten_txt_tokenize(data['text'], method='spacy', lang=lang)

In [6]:
# Word statistics over the whole corpus. The helper returns two values; judging by how the
# per-category cell uses them, these are presumably the counts and the matching words —
# confirm against Rakuten_preprocessing.
allwordcount, allwords = rkt.Rakuten_txt_wordcount(tokens_spacy)

In [7]:
# Per-category word frequencies: one row per word, one column 'code_<prdtypecode>' per
# category. Each count is divided by the number of products in that category.
#
# The per-category frames are collected in a list and combined once, instead of
# outer-joining and re-filling a growing DataFrame on every iteration.
category_frames = []
for code in data['prdtypecode'].unique():
    # Boolean mask of the products in this category, reused for selection and normalisation.
    in_category = data['prdtypecode'] == code
    cnt, wrd = rkt.Rakuten_txt_wordcount(tokens_spacy.loc[in_category])
    cnt = cnt / in_category.sum()
    category_frames.append(pd.DataFrame(cnt, index=wrd, columns=['code_' + str(code)]))

if category_frames:
    # Words missing from a category get a frequency of 0. The index is sorted to keep the
    # lexicographic word order that repeated outer joins produce.
    df_words = pd.concat(category_frames, axis=1).sort_index().fillna(0)
else:
    # No category at all: keep the empty frame the original loop would have left behind.
    df_words = pd.DataFrame()

In [38]:
# French spaCy pipeline; only its vocabulary (stop-word flags) is needed here.
nlpdict = spacy.load('fr_core_news_sm')
# Look up the stop-word flag of each vocabulary entry individually. Re-tokenising the
# space-joined vocabulary could split or merge words, so the number of tokens is not
# guaranteed to match len(df_words.index), and the joined text may exceed spaCy's maximum
# document length. A per-word lexeme lookup always yields exactly one flag per index entry.
token_stop = pd.Series([nlpdict.vocab[word].is_stop for word in df_words.index], index=df_words.index)

In [48]:
# Weight each word frequency by the share of that word's total frequency (summed over all
# categories) that falls in the category: freq * (freq / row total).
# The row normalisation is vectorised with DataFrame.div instead of a Python-level
# row-by-row apply. Rows whose total is 0 produce NaN, as before.
row_totals = df_words.sum(axis=1)
df_words_rel = df_words * df_words.div(row_totals, axis=0)

In [43]:
# Pairwise correlation between the columns of df_words, i.e. between the per-category
# word-frequency profiles (DataFrame.corr uses Pearson correlation by default).
corr = df_words.corr()

In [49]:
from scipy.cluster.hierarchy import linkage, leaves_list

# Compute the category-by-category correlation matrix once; it was previously recomputed
# for the clustering and twice more for the heatmap.
category_corr = df_words.corr()
# Ward hierarchical clustering, using each row of the correlation matrix as an observation.
Z = linkage(category_corr, 'ward')
# Leaf order of the dendrogram, used to reorder rows and columns so that categories
# clustered together end up adjacent in the heatmap.
order = leaves_list(Z)
px.imshow(category_corr.iloc[order, order])

In [5]:
# Reload previously saved tokens from disk instead of re-running the tokeniser.
# NOTE(review): the execution count of this cell (In [5]) shows it was run out of order, and
# it rebinds `tokens_spacy` to whatever DataFrame read_csv returns, which may not have the
# same structure as the output of rkt.Rakuten_txt_tokenize — confirm before relying on it.
tokens_spacy = pd.read_csv('tokens_spacy.csv')