# Classificação de produtos pela categoria.

Neste notebook exploramos diferentes estratégias de ML/DL para classificar produtos em suas categorias.

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.base import clone

from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText

import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Column names applied to the CSV; header=0 makes them replace the file's own header row.
headers = [
    "p_id",
    "p_title",
    "vendor_id",
    "cluster_id",
    "cluster_title",
    "cat_id",
    "cat_title",
]
df = pd.read_csv("datasets/pricerunner_aggregate.csv", names=headers, header=0)
# Optional: restrict the study to a single category.
#df = df[df.cat_id == 2612]
df.head()

In [None]:
# Call the method: a bare `df.info` only displays the bound-method repr
# instead of the column/dtype/non-null summary.
df.info()

# Construção de atributos (Features)

In [None]:
def clean_text(sentences):
    """Tokenize texts, dropping punctuation, numeric tokens and stopwords.

    Each text is lower-cased, stripped of every ``string.punctuation``
    character and split on whitespace. Tokens that are purely numeric or
    that are NLTK English stopwords are discarded.

    Args:
        sentences: Iterable of strings (e.g. product titles).

    Returns:
        A list with one list of kept tokens per input text, in input order.
    """
    ignored = set(stopwords.words("english"))
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def keep(word):
        # Reject numeric-only tokens first, then stopwords.
        return not (word.isnumeric() or word in ignored)

    return [
        [word for word in text.lower().translate(strip_punctuation).split() if keep(word)]
        for text in sentences
    ]

def get_w2v(tokens, model_path="models/w2v", params=None, rebuild=False, save_model=True):
    """Load a cached Word2Vec model or train (and optionally save) a new one.

    Args:
        tokens: Iterable of token lists used as the training corpus.
        model_path: File checked for an already trained model; it is loaded
            as-is when it exists and ``rebuild`` is false.
        params: Optional dict of ``Word2Vec`` keyword arguments (for example
            ``vector_size``, ``epochs``, ``sg``). When ``None`` the gensim
            defaults are used with 15 worker threads.
        rebuild: Train a new model even if ``model_path`` exists.
        save_model: Persist the trained model under ``models/``.

    Returns:
        The loaded or freshly trained ``Word2Vec`` model.
    """
    # NOTE: the file written below is named after the parameters, not after
    # `model_path`; pass that saved path as `model_path` to reuse a model.
    if os.path.exists(model_path) and not rebuild:
        return Word2Vec.load(model_path)

    if params is not None:
        # Forward every entry so options such as "sg" actually take effect
        # and the trained model matches the file name built from `params`.
        model = Word2Vec(sentences=tokens, **params)
    else:
        model = Word2Vec(sentences=tokens, workers=15)

    if save_model:
        if params is not None:
            sufix = "_".join(f"{key}-{params[key]}" for key in sorted(params))
        else:
            # The extension is appended once below.
            sufix = "default"
        output = "models/w2v_" + sufix + ".model"
        # Saving fails if the target directory does not exist yet.
        os.makedirs(os.path.dirname(output), exist_ok=True)
        model.save(output)

    return model

def w2v_transform(sentences, w2v):
    """Embed each tokenized sentence as the mean of its known word vectors.

    Args:
        sentences: Iterable of token lists.
        w2v: Model exposing ``wv`` (token -> vector lookup supporting ``in``)
            and ``vector_size``.

    Returns:
        ``np.ndarray`` with one row per sentence. A sentence without any
        token present in ``w2v.wv`` becomes a zero vector of length
        ``w2v.vector_size``.
    """
    embedded = []
    for sentence in sentences:
        # Out-of-vocabulary tokens are skipped.
        known = [w2v.wv[tok] for tok in sentence if tok in w2v.wv]
        if not known:
            embedded.append(np.zeros(w2v.vector_size))
            continue
        embedded.append(np.mean(known, axis=0))
    return np.array(embedded)

def get_dv2(tokens, model_path="models/d2v", params=None, rebuild=False, save_model=True):
    """Load a cached Doc2Vec model or train (and optionally save) a new one.

    Args:
        tokens: Sequence of token lists, one per document.
        model_path: File checked for an already trained model; it is loaded
            as-is when it exists and ``rebuild`` is false.
        params: Optional dict; only ``vector_size`` and ``epochs`` are read
            for training, while every key contributes to the saved file
            name. When ``None`` the gensim defaults are used with 15 workers.
        rebuild: Train a new model even if ``model_path`` exists.
        save_model: Persist the trained model under ``models/``.

    Returns:
        The loaded or freshly trained ``Doc2Vec`` model.
    """
    # Local import keeps this function self-contained.
    from gensim.models.doc2vec import TaggedDocument

    if os.path.exists(model_path) and not rebuild:
        return Doc2Vec.load(model_path)

    # Doc2Vec is trained from tagged documents passed as `documents=`; it has
    # no `sentences=` argument. Each document is tagged with its position so
    # its vector can later be fetched by row index.
    documents = [TaggedDocument(words, [idx]) for idx, words in enumerate(tokens)]

    if params is not None:
        model = Doc2Vec(
            documents=documents, vector_size=params["vector_size"], epochs=params["epochs"]
        )
    else:
        model = Doc2Vec(documents=documents, workers=15)

    if save_model:
        if params is not None:
            sufix = "_".join(f"{key}-{params[key]}" for key in sorted(params))
        else:
            # The extension is appended once below.
            sufix = "default"
        output = "models/d2v_" + sufix + ".model"
        # Saving fails if the target directory does not exist yet.
        os.makedirs(os.path.dirname(output), exist_ok=True)
        model.save(output)

    return model

def d2v_transform(n_docs, d2v):
    """Collect the first ``n_docs`` document vectors of a Doc2Vec model.

    Args:
        n_docs: Number of documents; vectors are read at indices
            ``0 .. n_docs - 1``.
        d2v: Model exposing an indexable ``docvecs`` attribute.

    Returns:
        ``np.ndarray`` with one row per document index.
    """
    # NOTE(review): gensim 4 names this attribute `dv`; `docvecs` looks like
    # the legacy spelling. Confirm against the installed version.
    rows = []
    for doc_idx in range(n_docs):
        rows.append(d2v.docvecs[doc_idx])
    return np.array(rows)

In [None]:
# Keep only rows that have a title. notna() and notnull() are aliases, so a
# single check is enough. copy() makes the result an independent frame so
# that adding columns to it later does not raise SettingWithCopyWarning.
df = df[df.p_title.notna()].copy()

In [None]:
# Number of rows (products) per cluster.
counts = df.cluster_id.value_counts()
# Vectorized lookup of each row's cluster size, replacing one Series lookup
# per row inside a Python loop.
freqs = df.cluster_id.map(counts)
df["freqs"] = freqs

In [None]:
# Preview the per-row cluster size column.
df["freqs"].head()

In [None]:
# Tokenize every product title (lower-cased; punctuation, numeric tokens and
# English stopwords removed by clean_text).
feat_tokens = clean_text(df.p_title.values)

In [None]:
# Hyper-parameters handed to get_w2v, which also derives the saved model's
# file name from these keys and values.
params = dict(vector_size=100, sg=1, epochs=5)
model = get_w2v(feat_tokens, params=params)

In [None]:
# Embed each title as the mean of its token vectors (a zero vector when none
# of its tokens is in the model vocabulary).
feats = w2v_transform(feat_tokens, model)

In [None]:
# Shape of the feature matrix: one row per tokenized title.
feats.shape

# Classificação de produtos por categoria.

Verificando o desempenho dos classificadores por categoria.

In [None]:
# Random source shared through the notebook-level name `r`; a new seed is
# drawn each time this cell runs.
r = np.random
seed = r.randint(0, 2147483647 * 2)

"""
classifiers = {
    "RandomForestClassifier": RandomForestClassifier(n_jobs=10, random_state=seed),
    "LogisticRegression": LogisticRegression(max_iter=400, multi_class='multinomial', n_jobs=10, random_state=seed),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=seed),
    "KNeighborsClassifier": KNeighborsClassifier(n_jobs=10),
    "SVC": SVC(random_state=seed)
}
"""
# Active selection: the string above keeps the full list (including
# GradientBoostingClassifier and SVC) for reference without running it.
classifiers = {
    "RandomForestClassifier": RandomForestClassifier(
        n_jobs=10,
        random_state=seed,
    ),
    "LogisticRegression": LogisticRegression(
        max_iter=400,
        multi_class='multinomial',
        n_jobs=10,
        random_state=seed,
    ),
    "KNeighborsClassifier": KNeighborsClassifier(
        n_jobs=10,
    ),
}

In [None]:
def avaliation(classifiers, feats, target):
    """Cross-validate each classifier and collect accuracy / macro-F1 stats.

    A single shuffled 5-fold split (seeded from the notebook-level random
    source ``r``) is reused for every classifier, so all of them are scored
    on the same folds.

    Args:
        classifiers: Mapping of name -> unfitted scikit-learn estimator; each
            estimator is cloned per fold and never fitted in place.
        feats: Feature matrix supporting positional row indexing with an
            index array.
        target: Labels aligned row by row with ``feats`` (array-like or
            pandas Series).

    Returns:
        Dict mapping each classifier name to a dict with the per-fold arrays
        ``"accs"`` and ``"f1s"`` and the summaries ``"mean_accs"``,
        ``"mean_f1"``, ``"std_accs"`` and ``"std_f1"``.
    """
    # KFold yields positional indices. Indexing a pandas Series with them is
    # label-based, which raises or selects the wrong rows when the Series
    # index has gaps (e.g. after filtering), so work on a plain array.
    target = np.asarray(target)

    seed = r.randint(0, 2147483647 * 2)
    n_folds = 5
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    estimators = {}
    for alg, prototype in classifiers.items():
        accs = []
        f1s = []
        for train_index, test_index in kf.split(feats):
            X_train, X_test = feats[train_index], feats[test_index]
            y_train, y_test = target[train_index], target[test_index]
            # Fresh clone per fold so no fitted state leaks between folds.
            clf = clone(prototype)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            accs.append(accuracy_score(y_test, y_pred))
            f1s.append(f1_score(y_test, y_pred, average="macro"))

        accs = np.array(accs)
        f1s = np.array(f1s)
        estimators[alg] = {
            "accs": accs,
            "f1s": f1s,
            "mean_accs": np.mean(accs),
            "mean_f1": np.mean(f1s),
            "std_accs": np.std(accs),
            "std_f1": np.std(f1s),
        }

    return estimators

#from pprint import pprint
#e = avaliation(classifiers, feats, df.cluster_id)
#pprint(e)

In [None]:
# NOTE: neither import below is used by the active code in this cell; only a
# commented-out line refers to MultiOutputClassifier.
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Draw a fresh seed for the classifier created below.
r = np.random
seed = r.randint(0, 2147483647 * 2)

# The string below is a disabled experiment (indicator-matrix target); it is
# a bare string literal and is not executed.
"""
target = np.zeros((len(df), len(set(df.cluster_id))))
for i in df.cluster_id:
    target[i] = 1

X_train, X_test, y_train, y_test = train_test_split(feats, target)# df.cluster_id)
"""

# Hold-out split predicting cluster_id from the title embeddings.
# NOTE: no random_state is passed here, so `seed` does not control the split.
X_train, X_test, y_train, y_test = train_test_split(feats, df.cluster_id)

# Alternative estimators that were tried; only the last assignment is active.
#clf = MultiOutputClassifier(LogisticRegression(max_iter=400, multi_class='multinomial', n_jobs=10, random_state=seed))
#clf = KNeighborsClassifier(n_jobs=10)
#clf = RandomForestRegressor(random_state=seed, n_jobs=10)
clf = LogisticRegression(max_iter=400, multi_class='multinomial', n_jobs=10, random_state=seed)

print("Treinando...")
clf.fit(X_train, y_train)
print("Predizendo...")
y_pred = clf.predict(X_test)
# Last expression: the notebook displays the predicted labels.
y_pred

In [None]:
# Only the commented-out alternatives below use these imports.
from skmultilearn.adapt import MLkNN
from skmultilearn.problem_transform import BinaryRelevance

r = np.random
seed = r.randint(0, 2147483647 * 2)

lgr = LogisticRegression(max_iter=400, multi_class='multinomial', n_jobs=10, random_state=seed)
# Bind `clf` explicitly. With every alternative commented out, this cell used
# to refit whatever `clf` an earlier cell left behind (or fail with NameError
# when run on its own) while `lgr` was never used.
clf = lgr
#clf = BinaryRelevance(classifier=lgr, require_dense=[False, True])
#clf = MLkNN(k=3)

# Hold-out split on a plain array of cluster ids.
X_train, X_test, y_train, y_test = train_test_split(feats, np.array(df.cluster_id))

print("Treinando...")
clf.fit(X_train, y_train)
print("Predizendo...")
y_pred = clf.predict(X_test)
# Last expression: the notebook displays the predicted labels.
y_pred

In [None]:
# Store each title's embedding as a plain Python list in its own column;
# tolist() already returns a list of per-row lists.
df["vetores"] = feats.tolist()
df["vetores"].head()

In [64]:

r = np.random
seed = r.randint(0, 2147483647 * 2)

# Measure macro-F1 as the number of clusters (classes) to separate grows.
n_fold = 5
# Candidate clusters: category 2612 only, each with at least n_fold rows
# (presumably so every class can appear in every fold).
ids = list(set(df[(df.freqs >= n_fold) & (df.cat_id == 2612)].cluster_id))
# Drawing i distinct clusters needs at least i candidates, so cap the sweep.
limit = min(50, len(ids) + 1)
for i in range(3, limit):
    # Pick i *distinct* clusters. The default replace=True may repeat ids,
    # yielding fewer than i classes while still reporting i.
    clusters_ids = np.random.choice(ids, i, replace=False)
    set_sample = df.cluster_id.isin(clusters_ids)
    target = df[set_sample]["cluster_id"].values
    # Boolean mask as a plain array for positional row selection.
    vecs = feats[set_sample.values]
    # Alternative estimators that were tried:
    #clf = LogisticRegression(max_iter=400, multi_class='multinomial', n_jobs=10, random_state=seed)
    #clf = GradientBoostingClassifier(random_state=seed)
    #clf = KNeighborsClassifier(n_jobs=10)
    #clf = RandomForestClassifier(random_state=seed, n_jobs=5)
    clf = SVC()
    print("Number of classes: ", i, "F1: ", np.mean(cross_val_score(clf, vecs, target, cv=n_fold, n_jobs=10, scoring='f1_macro')))

    # Disabled alternative: a single 80/20 hold-out evaluation.
    # X_train, X_test, y_train, y_test = train_test_split(vecs, target, test_size=0.2)
    # clf = LogisticRegression(max_iter=400, multi_class='multinomial', n_jobs=10, random_state=seed)
    # clf.fit(X_train, y_train)
    # y_pred = clf.predict(X_test)
    # print(f1_score(y_test, y_pred, average="macro"))


    


Number of classes:  3 F1:  0.5777777777777778
Number of classes:  4 F1:  0.7430952380952381
Number of classes:  5 F1:  0.6444444444444444
Number of classes:  6 F1:  0.3307142857142857
Number of classes:  7 F1:  0.20619666048237476
Number of classes:  8 F1:  0.23609126984126988
Number of classes:  9 F1:  0.2622222222222222
Number of classes:  10 F1:  0.1823352725705667
Number of classes:  11 F1:  0.3417359307359307
Number of classes:  12 F1:  0.12878417878417878
Number of classes:  13 F1:  0.13743339993339992
Number of classes:  14 F1:  0.19459856382933305
Number of classes:  15 F1:  0.13734236134236133
Number of classes:  16 F1:  0.17965575725444144
Number of classes:  17 F1:  0.12515707821590175
Number of classes:  18 F1:  0.22782330500596756
Number of classes:  19 F1:  0.17627754961088293
Number of classes:  20 F1:  0.17448452781786117
Number of classes:  21 F1:  0.1688008543743838
Number of classes:  22 F1:  0.10451590737305023
Number of classes:  23 F1:  0.09133976225536328
Number 

In [None]:
# Show every row that belongs to cluster 45222.
df[df.cluster_id == 45222]

In [None]:
# Distinct category ids present in the data.
set(df.cat_id)