In [1]:
!pip install tensorflow



In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Bidirectional

from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/test_gender_fusion.csv')
X = df["Description Tokenized"]
Y = df.iloc[:, 7:18]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



In [8]:

# Parâmetros para preprocessamento e a rede LSTM
vocab_size = 10000  # Tamanho do vocabulário
embedding_dim = 64  # Dimensão do embedding
max_length = 150  # Comprimento máximo das sequências
trunc_type = 'post'  # Truncar sequências após o comprimento máximo
padding_type = 'post'  # Adicionar padding após a sequência
oov_tok = '<OOV>'  # Token para palavras fora do vocabulário
num_genres = 11  # Número de gêneros

# Tokenize e pad as sequências
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Crie o modelo LSTM
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(embedding_dim, return_sequences=True)))
model.add(TimeDistributed(Dense(embedding_dim, activation='relu')))
model.add(Dropout(0.5))
model.add(LSTM(embedding_dim))
model.add(Dense(num_genres, activation='sigmoid'))  # Função de ativação 'sigmoid' para classificação multirrótulo

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Treine o modelo
num_epochs = 10
history = model.fit(X_train_padded, Y_train, epochs=num_epochs, validation_data=(X_test_padded, Y_test), verbose=2)

# Avalie o modelo
scores = model.evaluate(X_test_padded, Y_test, verbose=0)
print("Acurácia: %.2f%%" % (scores[1] * 100))

Epoch 1/10
201/201 - 116s - loss: 0.4662 - accuracy: 0.3123 - val_loss: 0.4430 - val_accuracy: 0.3059 - 116s/epoch - 576ms/step
Epoch 2/10
201/201 - 106s - loss: 0.4516 - accuracy: 0.3159 - val_loss: 0.4426 - val_accuracy: 0.3059 - 106s/epoch - 528ms/step
Epoch 3/10
201/201 - 120s - loss: 0.4516 - accuracy: 0.3159 - val_loss: 0.4415 - val_accuracy: 0.3059 - 120s/epoch - 599ms/step
Epoch 4/10
201/201 - 109s - loss: 0.4516 - accuracy: 0.3159 - val_loss: 0.4431 - val_accuracy: 0.3059 - 109s/epoch - 543ms/step
Epoch 5/10
201/201 - 110s - loss: 0.4516 - accuracy: 0.3159 - val_loss: 0.4426 - val_accuracy: 0.3059 - 110s/epoch - 547ms/step
Epoch 6/10
201/201 - 109s - loss: 0.4513 - accuracy: 0.3159 - val_loss: 0.4422 - val_accuracy: 0.3059 - 109s/epoch - 543ms/step
Epoch 7/10
201/201 - 106s - loss: 0.4512 - accuracy: 0.3159 - val_loss: 0.4438 - val_accuracy: 0.3059 - 106s/epoch - 528ms/step
Epoch 8/10
201/201 - 103s - loss: 0.4517 - accuracy: 0.3159 - val_loss: 0.4422 - val_accuracy: 0.3059 - 