## Utils

Funções auxiliares

In [69]:
import numpy as np
from sklearn import preprocessing

def tags_list_to_array(tags_set):
  """Split every ';'-separated tag string into a list of its non-empty tags.

  Args:
    tags_set: iterable of strings such as "a;b;c".

  Returns:
    A list with one list of tags per input string. Empty fragments
    (produced by leading, trailing or doubled ';') are dropped.
  """
  return [
      [fragment for fragment in raw_tags.split(";") if fragment]
      for raw_tags in tags_set
  ]

def get_unique_tags_list(tags_set):
  """Return the distinct tags found across all tag lists.

  Args:
    tags_set: iterable of iterables of tags.

  Returns:
    A list holding each distinct tag once. The list is built from a set,
    so its order is unspecified.
  """
  distinct_tags = {tag for tag_list in tags_set for tag in tag_list}
  return list(distinct_tags)

def get_all_tags(tags_set):
  """Flatten the tag lists into a single list, keeping duplicates and order.

  Args:
    tags_set: iterable of iterables of tags.

  Returns:
    A list with every tag occurrence, in encounter order.
  """
  return [tag for tag_list in tags_set for tag in tag_list]

def transform_array_to_binary(tags_set):
  """Encode lists of tags as a binary indicator (multi-hot) matrix.

  The vocabulary is built from `tags_set` itself, so the function no longer
  depends on the notebook-level `dt` variable.

  Args:
    tags_set: iterable of tag lists. It is traversed twice (once to collect
      the vocabulary, once to encode), so it must be re-iterable.

  Returns:
    A NumPy integer array with one row per tag list and one column per
    distinct tag; a cell is 1 when the row's tag list contains the tag of
    that column, as numbered by the fitted LabelEncoder.
  """
  le = preprocessing.LabelEncoder()
  # Fit on the tags of the argument, not on a global DataFrame column.
  uniques = get_unique_tags_list(tags_set)

  le.fit(uniques)

  length = len(uniques)
  results = []
  for tags in tags_set:
    encoder_result = le.transform(tags)

    arr = np.zeros((length,), dtype=int)
    # Element-wise assignment also copes with an empty tag list.
    for position in encoder_result:
      arr[position] = 1

    results.append(list(arr))
  return np.array(results)

Métrica de Hit ratio

In [74]:
import numpy as np

def hitk_convert_array_to_dict(arr):
  """Map every position of `arr` to the value stored at that position.

  Args:
    arr: sized, integer-indexable sequence.

  Returns:
    A dict {0: arr[0], 1: arr[1], ...}.
  """
  return {position: arr[position] for position in range(len(arr))}

def hitk_sort_dict(x):
  """Return the (key, value) pairs of `x` ordered by value, highest first.

  Pairs with equal values keep the order they have in the dict.
  """
  pairs = list(x.items())
  pairs.sort(key=lambda pair: pair[1], reverse=True)
  return pairs

def hitk_get_k_keys(x, k):
  """Pick the keys of the first `k` pairs whose value is strictly above 0.5.

  Args:
    x: sequence of (key, value) pairs.
    k: number of leading pairs to inspect.

  Returns:
    The keys of the inspected pairs with value > 0.5, in their original order.
  """
  return [pair[0] for pair in x[:k] if pair[1] > 0.5]

def hitk_get_number_of_correct_keys(keys_from_y_true, keys_from_y_scores):
  """Count predicted keys that also appear among the true keys.

  Only the first min(len(keys_from_y_true), len(keys_from_y_scores))
  predicted keys are examined.
  NOTE(review): predictions beyond that prefix are never checked, even if
  they are correct - confirm this truncation is intended.

  Returns:
    The number of examined predicted keys found in `keys_from_y_true`.
  """
  examined = min(len(keys_from_y_true), len(keys_from_y_scores))
  return sum(
      1 for predicted in keys_from_y_scores[:examined]
      if predicted in keys_from_y_true
  )

def hitk_calculate(y_true, y_scores, k = 1):
  """Compute the hit ratio at k for a single sample.

  Args:
    y_true: sequence of ground-truth indicators; a label counts as relevant
      when its value is above 0.5.
    y_scores: sequence of predicted scores, indexed like `y_true`.
    k: number of top-scored labels taken as recommendations (only those
      scoring above 0.5 are kept).

  Returns:
    m / min(ng, nr), where m is the number of correct recommendations, ng the
    number of relevant labels and nr the number of recommendations. Returns
    0.0 when there are no relevant labels or no recommendations, instead of
    raising ZeroDivisionError.
  """
  # converting from array format to dict format (position -> value)
  y_scores = hitk_convert_array_to_dict(y_scores)
  y_true = hitk_convert_array_to_dict(y_true)

  # sorting by value, highest first
  y_scores = hitk_sort_dict(y_scores)
  y_true = hitk_sort_dict(y_true)

  # top-k recommended labels and every relevant label
  keys_from_y_scores = hitk_get_k_keys(y_scores, k)
  keys_from_y_true = hitk_get_k_keys(y_true, len(y_true))

  # number of correct recommendations
  m = hitk_get_number_of_correct_keys(keys_from_y_true, keys_from_y_scores)

  # number of true tags
  ng = len(keys_from_y_true)

  # number of recommended tags
  nr = len(keys_from_y_scores)

  denominator = min(ng, nr)
  if denominator == 0:
    # Nothing relevant or nothing recommended: no hit is possible.
    return 0.0

  return m / denominator

def hitk(y_true, y_scores, k = 1):
  """Average the per-sample hit ratio at k over all samples.

  Args:
    y_true: indexable collection with one ground-truth row per sample.
    y_scores: indexable collection with one score row per sample, aligned
      with `y_true`.
    k: number of top-scored labels considered per sample.

  Returns:
    The mean of hitk_calculate over every sample index of `y_true`.
  """
  per_sample = [
      hitk_calculate(y_true[sample], y_scores[sample], k = k)
      for sample in range(len(y_true))
  ]
  return np.mean(per_sample)


## Pré-processamento

Carregando bibliotecas

In [65]:
import pandas as pd
import numpy as np

Carregando base de dados

In [66]:
# Load the dataset; the path is relative to the current working directory.
dt = pd.read_csv("Blogs.csv")

# Show (rows, columns) as the cell output.
dt.shape

(9397, 9)

Convertendo formato de tags de STRING para ARRAY

In [67]:
# Replace the ';'-separated tag strings with lists of tags.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time calls .split on lists and fails - reload `dt` before re-running.
dt.Tags = tags_list_to_array(dt.Tags)

# The frame keeps its shape; only the content of the Tags column changes.
dt.shape

(9397, 9)

Convertendo formato de tags de ARRAY de string para ARRAY binário

In [70]:
# Build the binary label matrix: one row per sample, one column per distinct tag.
y = transform_array_to_binary(dt.Tags)

# Show (samples, distinct tags) as the cell output.
y.shape

(9397, 3861)

## Modelo

Extraindo embeddings das entradas 

In [71]:
import tensorflow as tf

import tensorflow_hub as hub

import tensorflow_text

In [75]:
# TF Hub handle of the multilingual Universal Sentence Encoder, version 3.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
# Wrap the module as a Keras layer; trainable=False keeps its weights frozen.
embed = hub.KerasLayer(module_url, trainable=False, name='USE_Embedding')

In [None]:
# Embed every title; values are cast to str first so that non-string entries
# can be passed to the encoder.
titles_embeddings = embed(dt.Title.apply(str).to_list())
# Convert the returned tensor to a NumPy array.
titles_embeddings = titles_embeddings.numpy()

In [1]:
# Embed every description; values are cast to str first so that non-string
# entries can be passed to the encoder.
# NOTE: requires the cell defining `embed` to have run in the current kernel.
descriptions_embeddings = embed(dt.Description.apply(str).to_list())
# Convert the returned tensor to a NumPy array.
descriptions_embeddings = descriptions_embeddings.numpy()

NameError: name 'embed' is not defined

In [None]:
# Embed every content body; values are cast to str first so that non-string
# entries can be passed to the encoder.
contents_embeddings = embed(dt.Content.apply(str).to_list())
# Convert the returned tensor to a NumPy array.
contents_embeddings = contents_embeddings.numpy()

In [None]:
from keras.models import Sequential
from keras.layers import Dense

!pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.metrics import label_ranking_loss

Training

In [None]:
def get_model(n_inputs, n_outputs):
  """Build and compile a feed-forward multi-label classifier.

  Args:
    n_inputs: number of input features; the first hidden layer has twice
      as many units.
    n_outputs: number of labels; the output layer has one sigmoid unit each.

  Returns:
    A Sequential model compiled with binary cross-entropy loss and the
    Adam optimizer.
  """
  layers = [
      Dense(n_inputs * 2, activation='relu'),
      Dense(100, activation='relu'),
      Dense(n_outputs, activation='sigmoid'),
  ]
  network = Sequential(layers)
  network.compile(loss='binary_crossentropy', optimizer='adam')
  return network

def evaluate_model(X, y, variable):
  """Cross-validate the classifier with 10 multilabel-stratified folds.

  For every fold a fresh model is trained for 50 epochs, the ranking metrics
  are printed, and the weighted precision/recall/F1 of the rounded
  predictions are recorded.

  Args:
    X: 2-D feature array, one row per sample.
    y: 2-D binary label array aligned with `X`, one column per label.
    variable: label stored with every result row to identify the input used.

  Returns:
    A list with one [precision, recall, f1, variable] row per fold.
  """
  results = list()

  # Take the output width from the label matrix itself instead of reading a
  # notebook-level DataFrame, so the function only depends on its arguments.
  n_inputs, n_outputs = X.shape[1], y.shape[1]

  mskf = MultilabelStratifiedKFold(n_splits=10, shuffle=True, random_state=0)

  for fold_no, (train_ix, test_ix) in enumerate(mskf.split(X, y), start=1):
    train_ix = np.array(train_ix)
    test_ix = np.array(test_ix)

    X_train, X_test = X[train_ix], X[test_ix]
    y_train, y_test = y[train_ix], y[test_ix]

    # A new model per fold, so no weights carry over between folds.
    model = get_model(n_inputs, n_outputs)

    model.fit(X_train, y_train, verbose=0, epochs=50)

    predictions = model.predict(X_test)

    print("Multilabel ranking metrics")

    # Ranking metrics use the raw scores.
    print("Coverage Error:", coverage_error(y_test, predictions))
    print("Label Ranking Average Precision:", label_ranking_average_precision_score(y_test, predictions))
    print("Label Ranking Loss:", label_ranking_loss(y_test, predictions))

    # Classification metrics need hard 0/1 decisions.
    predictions = predictions.round()

    print("Normal ranking metrics")
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    results.append([precision, recall, f1, variable])

    print(f'Score for fold {fold_no}: Precision of {precision}; Recall of {recall}; F1 Score of {f1}')

  return results

In [None]:
def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn: accepts any arguments and does nothing."""
    pass

import warnings
# Monkey-patch the module attribute so later calls that look up
# `warnings.warn` are silently discarded for the rest of the session.
warnings.warn = warn

# Inputs to evaluate, keyed by the name stored in every result row.
variables = {
    #"Title": titles_embeddings,
    "Description": descriptions_embeddings,
    #"Content": contents_embeddings
}

# Per-fold rows of every analysed variable; extended (not overwritten) on each
# iteration so enabling several variables keeps all of their results.
results = []

for variable, X in variables.items():
  print("Analysing variable", variable)

  # evaluate model
  variable_results = evaluate_model(X, y, variable)
  results.extend(variable_results)

  # summarize performance of the current variable only
  results_dt = pd.DataFrame(variable_results)

  print('MeanPrecision: %.3f (%.3f); MeanRecall: %.3f (%.3f); MeanF1: %.3f (%.3f)' % (results_dt[0].mean(), results_dt[0].std(), results_dt[1].mean(), results_dt[1].std(), results_dt[2].mean(), results_dt[2].std()))

Analysing variable Description
Métricas de classificação de várias etiquetas
Coverage Error: 350.931696905016
Label Ranking Average Precision: 0.5898945259385115
Label Ranking Loss: 0.02851307674903198
Métricas de classificação de tradicionais
Score for fold 1: Precision of 0.6052585057117903; Recall of 0.4669145382045479; F1 Score of 0.5143873941187559
Métricas de classificação de várias etiquetas
Coverage Error: 356.74179894179895
Label Ranking Average Precision: 0.5932897159238014
Label Ranking Loss: 0.03164898992617977
Métricas de classificação de tradicionais
Score for fold 2: Precision of 0.5899162377342227; Recall of 0.4754247248637675; F1 Score of 0.5094663068418248
Métricas de classificação de várias etiquetas
Coverage Error: 361.11689691817213
Label Ranking Average Precision: 0.5985762140251871
Label Ranking Loss: 0.03231151882009875
Métricas de classificação de tradicionais
Score for fold 3: Precision of 0.6051398204970886; Recall of 0.4564138151924411; F1 Score of 0.5041634

In [None]:
# Persist the per-fold result rows to a CSV file in the working directory.
pd.DataFrame(results).to_csv("Results.csv")

# Colab-only helper: this import is unavailable outside Google Colab.
from google.colab import files

# Trigger a browser download of the file that was just written.
files.download('Results.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the saved results, using the first CSV column as the index.
# NOTE(review): this rebinds `results` from the list produced by the evaluation
# cell to a DataFrame; cells expecting the list must not run after this one.
results = pd.read_csv("Results.csv", index_col=0)

# Show (rows, columns) as the cell output.
results.shape

(3, 10)