# Classificação de produtos pela categoria.

Neste notebook exploramos diferentes estratégias de ML/DL para classificar produtos em suas categorias.

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.base import clone

from gensim.models.fasttext import FastText

# Módulos locais

from utils.text_clean import clean_text
from utils.word2vec import Word2VecModel
from utils.doc2vec import Doc2VecModel

from utils.clfs import Clfs

In [3]:
# Column names applied to the CSV; the file's own header row (row 0) is
# consumed and replaced by these names.
headers = [
    "p_id",
    "p_title",
    "vendor_id",
    "cluster_id",
    "cluster_title",
    "cat_id",
    "cat_title",
]
df = pd.read_csv(
    'datasets/pricerunner_aggregate.csv',
    header=0,
    names=headers,
)
# Optional: restrict the frame to a single category while experimenting.
# df = df[df.cat_id == 2612]
df.head()

Unnamed: 0,p_id,p_title,vendor_id,cluster_id,cluster_title,cat_id,cat_title
0,2,apple iphone 8 plus 64 gb spacegrau,2,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
1,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
2,4,apple iphone 8 plus 64gb space grey,4,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
3,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones
4,6,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,6,1,Apple iPhone 8 Plus 64GB,2612,Mobile Phones


# Construção de atributos (Features)

In [4]:
# Keep only rows that have a product title. `notna` and `notnull` are aliases
# in pandas, so a single check selects exactly the same rows.
df = df[df["p_title"].notna()]

In [5]:
# Attach to every product the size of its cluster: the number of rows
# (the product itself included) that share its cluster_id.
counts = df["cluster_id"].value_counts()
# Vectorised lookup instead of a per-row Python loop with label indexing.
freqs = df["cluster_id"].map(counts)
# `assign` returns a new frame, so no column is written onto a filtered
# slice of the original data (the usual SettingWithCopyWarning trigger).
df = df.assign(freqs=freqs)

In [6]:
# Run the project's text-cleaning helper over the raw product titles.
# NOTE(review): presumably yields one cleaned/tokenised entry per title —
# confirm against utils.text_clean.
feat_tokens = clean_text(df["p_title"].values)

In [7]:
# Hyper-parameters handed to the project's Word2VecModel wrapper.
# NOTE(review): these look like gensim Word2Vec options (sg=1 would select
# skip-gram there) — confirm how utils.word2vec forwards them.
params = dict(vector_size=100, sg=1, epochs=5)

# Build the embedding model from the cleaned titles.
w2v = Word2VecModel(feat_tokens, params=params)

In [8]:
# Turn the cleaned titles into feature vectors with the w2v wrapper. The
# result is later indexed with a per-row boolean mask, so it presumably holds
# one row per product — TODO confirm against utils.word2vec.
feats = w2v.transform(feat_tokens)
# Show the dimensions of the resulting feature container.
feats.shape

# Classificação de produtos por categoria.

Verificando o desempenho dos classificadores por categoria.

In [None]:
# Fixed seed so the stochastic classifiers give the same results on every
# Restart & Run All. Previously a fresh seed was drawn on each run and never
# recorded, so results could not be reproduced; its upper bound
# (2147483647 * 2) can also exceed the int32 range that np.random.randint
# uses by default on some platforms.
r = np.random  # alias kept in case other cells still refer to it
seed = 42

# Candidate classifiers, keyed by class name. GradientBoostingClassifier and
# SVC were listed in an earlier version of this dictionary and are left out.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version before upgrading.
classifiers = {
    "RandomForestClassifier": RandomForestClassifier(n_jobs=10, random_state=seed),
    "LogisticRegression": LogisticRegression(max_iter=400, multi_class='multinomial', n_jobs=10, random_state=seed),
    "KNeighborsClassifier": KNeighborsClassifier(n_jobs=10)
}

In [None]:

# Seeded generator so the random cluster samples (and therefore the scores)
# are reproducible; the seed used to be drawn at random and then never used.
seed = 42
r = np.random.RandomState(seed)

# F1 score as a function of the number of classes (clusters).
n_fold = 5
# Candidate clusters: category 2612 only, with at least n_fold rows each.
# Sorted so the seeded draw does not depend on set iteration order.
eligible = (df.freqs >= n_fold) & (df.cat_id == 2612)
ids = sorted(set(df[eligible].cluster_id))
# Never ask for more distinct clusters than are available.
limit = min(50, len(ids) + 1)
for i in range(3, limit):
    # Draw i *distinct* clusters. Sampling with replacement (the default of
    # `choice`) can repeat a cluster, leaving fewer than i classes while the
    # report below still claims i.
    clusters_ids = r.choice(ids, i, replace=False)
    # Plain boolean array so it can index both the frame and the features.
    set_sample = df.cluster_id.isin(clusters_ids).values
    target = df[set_sample]["cluster_id"].values
    vecs = feats[set_sample]

    clf = SVC()
    scores = cross_val_score(clf, vecs, target, cv=n_fold, n_jobs=10, scoring='f1_macro')
    print("Number of classes: ", i, "F1: ", np.mean(scores))


In [None]:
# Inspect every row that belongs to cluster 45222.
df.loc[df["cluster_id"].isin([45222])]

In [None]:
# Distinct category ids present in the data.
set(df["cat_id"])