## Utils

Funções auxiliares

In [1]:
import numpy as np
from sklearn import preprocessing

def tags_list_to_array(tags_set):
  """Split every ';'-separated tag string into a list of tags.

  Args:
    tags_set: iterable of strings such as "a;b;c".

  Returns:
    A list holding one list of tag strings per input element. Empty
    fragments (produced by a leading/trailing ';' or by ';;') are dropped.
  """
  return [
    [tag for tag in tags.split(";") if tag]
    for tags in tags_set
  ]

def get_unique_tags_list(tags_set):
  """Return the distinct tags found across all tag lists.

  Args:
    tags_set: iterable of iterables of hashable tags.

  Returns:
    A list with each tag appearing once. The list is built from a set,
    so its order is not guaranteed.
  """
  flattened = [tag for tags in tags_set for tag in tags]
  return list(set(flattened))

def get_all_tags(tags_set):
  """Flatten a collection of tag lists into a single list.

  Args:
    tags_set: iterable of iterables of tags.

  Returns:
    Every tag in encounter order, duplicates included.
  """
  return [tag for tags in tags_set for tag in tags]

def transform_array_to_binary(tags_set):
  """Encode tag lists as a binary (multi-hot) indicator matrix.

  The vocabulary is derived from `tags_set` itself, so the function does
  not depend on any global dataframe. Columns follow the sorted order of
  the distinct tags, which is the order a label encoder fitted on the same
  tags would assign.

  Args:
    tags_set: iterable of iterables of tag strings (one per sample).

  Returns:
    An integer numpy array of shape (n_samples, n_unique_tags) where entry
    [i, j] is 1 when sample i carries the j-th tag, and 0 otherwise.
  """
  # Materialise once: the input is traversed twice (vocabulary + encoding).
  samples = [list(tags) for tags in tags_set]

  vocabulary = sorted({tag for tags in samples for tag in tags})
  column_of = {tag: column for column, tag in enumerate(vocabulary)}

  encoded = np.zeros((len(samples), len(vocabulary)), dtype=int)
  for row, tags in enumerate(samples):
    for tag in tags:
      encoded[row, column_of[tag]] = 1

  return encoded

Métrica de Hit ratio

In [2]:
import numpy as np

def hitk_convert_array_to_dict(arr):
  """Map each position of a sequence to the value stored there.

  Args:
    arr: indexable sequence supporting len().

  Returns:
    A dict {0: arr[0], 1: arr[1], ...}.
  """
  return {position: arr[position] for position in range(len(arr))}

def hitk_sort_dict(x):
  """Return the (key, value) pairs of a dict ordered by descending value.

  The sort is stable, so pairs with equal values keep the dict's order.
  """
  def by_value(pair):
    return pair[1]

  pairs = list(x.items())
  pairs.sort(key=by_value, reverse=True)
  return pairs

def hitk_get_k_keys(x, k):
  """Pick the keys of the first k pairs whose value exceeds 0.5.

  Args:
    x: sequence of (key, value) pairs.
    k: how many leading pairs to inspect.

  Returns:
    Keys of the inspected pairs with value strictly greater than 0.5,
    in their original order.
  """
  return [item[0] for item in x[:k] if item[1] > 0.5]

def hitk_get_number_of_correct_keys(keys_from_y_true, keys_from_y_scores):
  """Count how many of the leading predicted keys are true keys.

  Only the first min(len(keys_from_y_true), len(keys_from_y_scores))
  predicted keys are inspected; later predictions are ignored.

  Args:
    keys_from_y_true: collection of ground-truth keys.
    keys_from_y_scores: sequence of predicted keys.

  Returns:
    The number of inspected predicted keys present in keys_from_y_true.
  """
  limit = min(len(keys_from_y_true), len(keys_from_y_scores))
  return sum(1 for key in keys_from_y_scores[:limit] if key in keys_from_y_true)

def hitk_calculate(y_true, y_scores, k = 1):
  """Compute the hit ratio at k for a single sample.

  Args:
    y_true: sequence of ground-truth indicators, one per label.
    y_scores: sequence of predicted scores, one per label.
    k: number of top-scored labels considered as recommendations.

  Returns:
    hits / min(number of true labels, number of recommended labels).
    When that minimum is zero the result is 1 if both counts are zero
    and 0 otherwise.
  """
  # Rank label positions by descending score / indicator value.
  ranked_scores = hitk_sort_dict(hitk_convert_array_to_dict(y_scores))
  ranked_truth = hitk_sort_dict(hitk_convert_array_to_dict(y_true))

  # Recommended labels: top-k positions that pass the helper's threshold.
  recommended = hitk_get_k_keys(ranked_scores, k)
  # True labels: every position is inspected, not only the first k.
  relevant = hitk_get_k_keys(ranked_truth, len(ranked_truth))

  hits = hitk_get_number_of_correct_keys(relevant, recommended)

  n_relevant = len(relevant)
  n_recommended = len(recommended)
  smallest = min(n_relevant, n_recommended)

  if smallest > 0:
    return hits / smallest

  # One side is empty: perfect only when nothing was expected nor recommended.
  return 1 if n_relevant == n_recommended else 0

def hitk(y_true, y_scores, k = 1):
  """Average the per-sample hit ratio at k over all samples.

  Args:
    y_true: indexable collection of ground-truth indicator rows.
    y_scores: indexable collection of predicted score rows, aligned
      with y_true.
    k: number of top-scored labels considered per sample.

  Returns:
    The mean of hitk_calculate over every row.
  """
  per_sample = [
    hitk_calculate(y_true[row], y_scores[row], k = k)
    for row in range(len(y_true))
  ]
  return np.mean(per_sample)


In [3]:
from sklearn.metrics import average_precision_score

def ap(y_true, y_scores):
  """Average scikit-learn's average_precision_score over all samples.

  Args:
    y_true: indexable collection of ground-truth indicator rows.
    y_scores: indexable collection of predicted score rows, aligned
      with y_true.

  Returns:
    The mean of the per-row average precision.
  """
  per_sample = [
    average_precision_score(y_true[row], y_scores[row])
    for row in range(len(y_true))
  ]
  return np.mean(per_sample)

## Pré-processamento

Carregando bibliotecas

In [4]:
import pandas as pd
import numpy as np

Carregando base de dados

In [5]:
# Load the blog dataset; the CSV is read relative to the working directory.
dt = pd.read_csv("Blogs.csv")

# Report the dataset dimensions and the available columns.
print('Total Rows: {}\nTotal Columns: {}'.format(*dt.shape))
print('Columns:', list(dt.columns))

Total Rows: 9397
Total Columns: 9
Columns: ['AdjustedTitle', 'ImagePath', 'Title', 'URL', 'Description', 'Tags', 'MicroName', 'Category', 'Content']


In [6]:
# Preview the first rows to sanity-check the loaded columns.
dt.head()

Unnamed: 0,AdjustedTitle,ImagePath,Title,URL,Description,Tags,MicroName,Category,Content
0,free diet plan from julia,Images/Free Diet Plan from Julia/1.jpg,Free Diet Plan from Julia,http://juliahavey.typepad.com,"Opinions on diet plans, weight loss, nutrition...",effect;weighting;egress;publication;supply;off...,weight issues,Health,Free Diet Plan from Julia11 de jul. de 2011 — ...
1,retro gaming,Images/Retro Gaming/5.jpg,Retro Gaming,http://www.racketboy.com/,Notes and thoughts on video gaming with the ol...,sega;comfort;cabinet;console platforms;solace;...,console platforms,Entertainment,"Top 90 Retro Gaming Blogs, Websites & Influenc..."
2,fancy pigeons india,Images/Fancy Pigeons India/5.jpg,Fancy Pigeons India,http://www.fancypigeonsindia.blogspot.com/,"Offers posts on exhibitions, sales, rare breed...",deary;parrots;ducky;skirt;razzing;snort;favori...,birds,Pets,Fancy Pigeons India
3,bookish gardener,Images/Bookish Gardener/0.jpg,Bookish Gardener,http://fortyfour.typepad.com/bookishgardener/,Reviews of gardening books and general comment...,hobbies and lifestyle;falco subbuteo;life styl...,gardening,Hobbies and Lifestyle,Bookish Gardener9 de jan. de 2020 — occasional...
4,goatworld,Images/GoatWorld/1.jpg,GoatWorld,http://goatworld.com/,Presents current goat news topics and archives...,deary;ducky;stock;favorite;capricorn;united ki...,livestock goats,Pets,Welcome to GoatWorld - GOATWORLD.COMEducationa...


Convertendo formato de tags de STRING para ARRAY

In [7]:
# Replace each ';'-separated tag string with a list of tags.
# NOTE: the column is overwritten in place, so this cell cannot be re-run
# without reloading `dt` (the values would no longer be strings to split).
dt["Tags"] = tags_list_to_array(dt["Tags"])

dt.shape

(9397, 9)

Convertendo formato de tags de ARRAY de string para ARRAY binário

In [8]:
# Build the multi-hot label matrix: one row per blog, one column per tag.
y = transform_array_to_binary(dt["Tags"])

y.shape

(9397, 3861)

## Modelo

In [9]:
import cv2
from tqdm import tqdm

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

Extraindo embeddings das entradas 

In [10]:

def get_image_features(images_paths, embed):
  """Load images from disk and compute one feature vector per image.

  Each image is read with OpenCV, converted from BGR to RGB, resized to
  224x224, scaled to float32 in [0, 1] and passed through `embed` as a
  batch of one.

  Args:
    images_paths: iterable of image file paths.
    embed: callable mapping a float32 tensor of shape (1, 224, 224, 3) to
      a tensor of feature vectors (e.g. a TF Hub image feature layer).

  Returns:
    A numpy array with one row of features per input path, in input
    order. An empty array is returned when no paths are given.

  Raises:
    FileNotFoundError: if a path cannot be read as an image.
  """
  features = []

  for filename in tqdm(images_paths):
    img = cv2.imread(filename)
    if img is None:
      # cv2.imread reports a missing or unreadable file by returning None.
      raise FileNotFoundError(f"Could not read image: {filename}")

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))

    # TF Hub image models expect float32 pixel values in [0, 1].
    batch = (img.astype(np.float32) / 255.0)[np.newaxis, ...]
    features.append(embed(tf.constant(batch)))

  if not features:
    return np.empty((0, 0), dtype=np.float32)

  # Concatenate once at the end instead of growing a tensor per image.
  return tf.concat(features, 0).numpy()

In [11]:
def get_text_features(texts, embed):
  """Embed texts one at a time and stack the results.

  Texts are embedded individually (batch of one) so that memory use per
  call stays small; the per-text results are concatenated once at the end.

  Args:
    texts: iterable of strings.
    embed: callable mapping a string tensor of shape (1,) to a tensor of
      embeddings (e.g. a TF Hub text embedding layer).

  Returns:
    A numpy array with one embedding row per text, in input order. An
    empty array is returned when no texts are given.
  """
  embeddings = [embed(tf.constant([text])) for text in tqdm(texts)]

  if not embeddings:
    return np.empty((0, 0), dtype=np.float32)

  # Single concat avoids re-copying the accumulated tensor on every step.
  return tf.concat(embeddings, 0).numpy()

In [13]:
# Multilingual Universal Sentence Encoder from TF Hub, wrapped as a frozen
# (non-trainable) Keras layer. `embed` is used below for the text columns.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
embed = hub.KerasLayer(module_url, trainable=False, name='USE_Embedding')

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run Identity: Dst tensor is not initialized. [Op:Identity]

In [None]:
# Inception V1 image feature extractor from TF Hub, wrapped as a frozen
# (non-trainable) Keras layer.
# NOTE: `module_url` is reassigned here; it no longer points at the text encoder.
module_url = "https://tfhub.dev/google/imagenet/inception_v1/feature_vector/5"
INCEPTION_embed = hub.KerasLayer(module_url, trainable=False, name="Inception_Embedding")

In [None]:
# Embed every title in one batch and keep the result as a numpy array.
titles_embeddings = embed(dt["Title"].apply(str).to_list()).numpy()

In [11]:
# Embed every description in one batch and keep the result as a numpy array.
descriptions_embeddings = embed(dt["Description"].apply(str).to_list()).numpy()

In [None]:
# Embed every page content in one batch and keep the result as a numpy array.
contents_embeddings = embed(dt["Content"].apply(str).to_list()).numpy()

In [16]:
# get_image_features requires the embedding layer as its second argument;
# the Inception feature extractor is the image model loaded in this notebook.
images_embeddings = get_image_features(dt['ImagePath'].apply(str).to_list(), INCEPTION_embed)

100%|██████████| 9397/9397 [01:11<00:00, 131.19it/s] 


In [12]:
from keras.models import Sequential
from keras.layers import Dense

#!pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [13]:
from sklearn.metrics import coverage_error
from sklearn.metrics import label_ranking_average_precision_score
from sklearn.metrics import label_ranking_loss
from sklearn.metrics import average_precision_score
from sklearn.metrics import hamming_loss

Treinamento

In [19]:
def get_model(n_inputs, n_outputs):
  """Build and compile the feed-forward multi-label classifier.

  Architecture: Dense(2 * n_inputs, relu) -> Dense(100, relu) ->
  Dense(n_outputs, sigmoid), trained with binary cross-entropy and Adam.

  Args:
    n_inputs: number of input features; sets the width of the first layer.
    n_outputs: number of labels; sets the width of the output layer.

  Returns:
    The compiled Keras Sequential model.
  """
  model = Sequential([
    Dense(n_inputs * 2, activation='relu'),
    Dense(100, activation='relu'),
    Dense(n_outputs, activation='sigmoid'),
  ])
  model.compile(optimizer='adam', loss='binary_crossentropy')
  return model

def evaluate_model(X, y, variable):
  """Cross-validate the tag classifier and collect the metrics of each fold.

  Runs a 10-fold multilabel-stratified split. For every fold a fresh model
  is trained for 50 epochs, evaluated on the held-out part, and its metrics
  are both printed and stored.

  Args:
    X: 2-D numpy array of input features, one row per sample.
    y: 2-D binary numpy array of labels, one row per sample.
    variable: name of the input variable being evaluated; recorded in
      every result row.

  Returns:
    A list with one dict per fold. Each dict holds the variable name, the
    fold number and the value of every printed metric.
  """
  results = list()

  # Layer sizes come from the arguments, not from the global dataframe.
  n_inputs, n_outputs = X.shape[1], y.shape[1]

  mskf = MultilabelStratifiedKFold(n_splits=10, shuffle=True, random_state=0)

  for fold_no, (train_ix, test_ix) in enumerate(mskf.split(X, y), start=1):
    print(f'Score for fold {fold_no}')

    X_train, X_test = X[train_ix], X[test_ix]
    y_train, y_test = y[train_ix], y[test_ix]

    model = get_model(n_inputs, n_outputs)
    model.fit(X_train, y_train, verbose=0, epochs=50)

    predictions = model.predict(X_test)

    fold_metrics = {"variable": variable, "fold": fold_no}

    def report(label, value):
      # Print a metric as soon as it is available and keep it for the caller.
      print(label + ":", value)
      fold_metrics[label] = value

    print("\nMultilabel ranking metrics")

    # Ranking metrics work on the raw sigmoid scores.
    report("Coverage Error", coverage_error(y_test, predictions))
    report("Label Ranking Average Precision", label_ranking_average_precision_score(y_test, predictions))
    report("Label Ranking Loss", label_ranking_loss(y_test, predictions))
    report("Hit Ratio", hitk(y_test, predictions, 5))
    report("Average Precision", ap(y_test, predictions))

    # The remaining metrics need hard 0/1 decisions.
    predictions = predictions.round()

    report("Hamming Loss", hamming_loss(y_test, predictions))

    print("\nNormal ranking metrics")
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    report("Precision", precision)
    report("Recall", recall)
    report("F1 Score", f1)

    results.append(fold_metrics)

  return results

In [20]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn: accepts anything, does nothing."""
    return None


# Silence every warning raised through warnings.warn for the rest of the session.
warnings.warn = warn

In [21]:
# Input representations to evaluate; uncomment an entry to include it.
variables = {
    #"Title": titles_embeddings,
    "Description": descriptions_embeddings,
    #"Content": contents_embeddings
}

results = []

for variable, X in variables.items():
  print("Analysing variable", variable)

  # Accumulate instead of overwriting, so every variable's results are kept.
  results.extend(evaluate_model(X, y, variable))

Analysing variable Description
Score for fold 1:

Multilabel ranking metrics
Coverage Error: 379.2367303609342
Label Ranking Average Precision: 0.5961981405043405
Label Ranking Loss: 0.03461533741770859
Hit Ratio: 0.6430997876857749
Average Precision: 0.5961981405043406
Hamming Loss: 0.003941368060264026

Normal ranking metrics
Precision: 0.6091608850346111
Recall: 0.4579577987488071
F1 Score: 0.5087642697871951
Score for fold 2:

Multilabel ranking metrics
Coverage Error: 361.76464323748667
Label Ranking Average Precision: 0.6076189108601256
Label Ranking Loss: 0.032974189838617676
Hit Ratio: 0.6544728434504792
Average Precision: 0.6076189108601258
Hamming Loss: 0.0038433542160911703

Normal ranking metrics
Precision: 0.6168346980921395
Recall: 0.44789605945570227
F1 Score: 0.5001953616392396
Score for fold 3:

Multilabel ranking metrics
Coverage Error: 353.53177966101697
Label Ranking Average Precision: 0.6063662046143811
Label Ranking Loss: 0.03155817826033601
Hit Ratio: 0.651129943

In [None]:
# Write the collected evaluation results to a CSV file in the working directory.
pd.DataFrame(results).to_csv("Results.csv")

# Google Colab only: this import is unavailable outside the Colab runtime.
from google.colab import files

# Trigger a browser download of the file written above.
files.download('Results.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>