In [1]:
!pip install tensorflow



In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import f1_score

# Função para calcular o F1-score
def f1_metric(y_true, y_pred):
    precision = Precision()
    recall = Recall()
    precision.update_state(y_true, y_pred)
    recall.update_state(y_true, y_pred)
    p = precision.result().numpy()
    r = recall.result().numpy()
    return 2 * ((p * r) / (p + r + 1e-6))

# Carregue seus dados
data = pd.read_csv('../data/test_gender_fusion.csv')

# Pré-processamento de texto (tokenização e padding)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Description Tokenized']) 
sequences = tokenizer.texts_to_sequences(data['Description Tokenized'])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

max_length = max([len(s) for s in sequences])
X_padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# Labels (gêneros)
Y = data.iloc[:, 7:18].values # Substitua -11 pelo índice da primeira coluna de gênero

# Divida os dados em conjuntos de treinamento e teste
from sklearn.model_selection import train_test_split
X_train_padded, X_test_padded, Y_train, Y_test = train_test_split(X_padded, Y, test_size=0.2, random_state=42)

# Crie o modelo RNN simples com uma camada GRU
embedding_dim = 100
num_genres=11

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(GRU(128, dropout=0.25, recurrent_dropout=0.25))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_genres, activation='sigmoid'))

# Compile e treine o modelo
optimizer = RMSprop(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[Precision(), Recall()])

num_epochs = 15
history = model.fit(X_train_padded, Y_train, epochs=num_epochs, validation_data=(X_test_padded, Y_test), verbose=2)

# Avalie o modelo usando F1-score
y_pred = model.predict(X_test_padded)
y_pred = np.round(y_pred)
f1 = f1_score(Y_test, y_pred, average='micro')
print("F1-score: {:.2f}".format(f1))

Epoch 1/15
201/201 - 29s - loss: 0.4766 - precision: 0.4729 - recall: 0.1392 - val_loss: 0.4454 - val_precision: 0.5227 - val_recall: 0.2482 - 29s/epoch - 145ms/step
Epoch 2/15
201/201 - 28s - loss: 0.4586 - precision: 0.5197 - recall: 0.1432 - val_loss: 0.4476 - val_precision: 0.5227 - val_recall: 0.2482 - 28s/epoch - 139ms/step
Epoch 3/15
201/201 - 29s - loss: 0.4572 - precision: 0.5223 - recall: 0.1559 - val_loss: 0.4419 - val_precision: 0.5227 - val_recall: 0.2482 - 29s/epoch - 143ms/step
Epoch 4/15
201/201 - 30s - loss: 0.4557 - precision: 0.5245 - recall: 0.1659 - val_loss: 0.4422 - val_precision: 0.5227 - val_recall: 0.2482 - 30s/epoch - 149ms/step
Epoch 5/15
201/201 - 33s - loss: 0.4558 - precision: 0.5154 - recall: 0.1620 - val_loss: 0.4446 - val_precision: 0.5227 - val_recall: 0.2482 - 33s/epoch - 163ms/step
Epoch 6/15
201/201 - 33s - loss: 0.4555 - precision: 0.5222 - recall: 0.1690 - val_loss: 0.4429 - val_precision: 0.5227 - val_recall: 0.2482 - 33s/epoch - 166ms/step
Epoc