In [1]:
!pip install tensorflow



In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Bidirectional
from tensorflow.keras.optimizers import RMSprop

from sklearn.model_selection import train_test_split

df = pd.read_csv('../data/test_gender_fusion.csv')
X = df["Description Tokenized"]
Y = df.iloc[:, 7:18]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)



In [14]:


# Parâmetros para preprocessamento e a rede LSTM
vocab_size = 15000
embedding_dim = 100
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
num_genres = 11

# Tokenize e pad as sequências
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Carregue os embeddings pré-treinados (GloVe)
embedding_index = {}
with open("../data/glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coefs

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Crie o modelo LSTM
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.25, recurrent_dropout=0.25)))
model.add(TimeDistributed(Dense(128, activation='relu')))
model.add(Dropout(0.5))
model.add(LSTM(128, dropout=0.25, recurrent_dropout=0.25))
model.add(Dense(num_genres, activation='sigmoid'))

# Compile e treine o modelo
optimizer = RMSprop(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

num_epochs = 15
history = model.fit(X_train_padded, Y_train, epochs=num_epochs, validation_data=(X_test_padded, Y_test), verbose=2)

# Avalie o modelo
scores = model.evaluate(X_test_padded, Y_test, verbose=0)
print("Acurácia: %.2f%%" % (scores[1] * 100))

Epoch 1/15
201/201 - 998s - loss: 0.4601 - accuracy: 0.3150 - val_loss: 0.4443 - val_accuracy: 0.3059 - 998s/epoch - 5s/step
Epoch 2/15
201/201 - 1007s - loss: 0.4524 - accuracy: 0.3159 - val_loss: 0.4416 - val_accuracy: 0.3059 - 1007s/epoch - 5s/step
Epoch 3/15
201/201 - 994s - loss: 0.4520 - accuracy: 0.3159 - val_loss: 0.4425 - val_accuracy: 0.3059 - 994s/epoch - 5s/step
Epoch 4/15
201/201 - 1022s - loss: 0.4520 - accuracy: 0.3159 - val_loss: 0.4422 - val_accuracy: 0.3059 - 1022s/epoch - 5s/step
Epoch 5/15
201/201 - 1013s - loss: 0.4517 - accuracy: 0.3159 - val_loss: 0.4419 - val_accuracy: 0.3059 - 1013s/epoch - 5s/step
Epoch 6/15
201/201 - 796s - loss: 0.4517 - accuracy: 0.3159 - val_loss: 0.4429 - val_accuracy: 0.3059 - 796s/epoch - 4s/step
Epoch 7/15
201/201 - 784s - loss: 0.4516 - accuracy: 0.3159 - val_loss: 0.4426 - val_accuracy: 0.3059 - 784s/epoch - 4s/step
Epoch 8/15
201/201 - 757s - loss: 0.4518 - accuracy: 0.3159 - val_loss: 0.4417 - val_accuracy: 0.3059 - 757s/epoch - 4s