In [30]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop, Adam

train_data = pd.read_csv('../data/balanced_train.csv')
test_data = pd.read_csv('../data/balanced_test.csv')

genre_columns = train_data.columns.drop(['Name', 'Description Tokenized'])

# Tokenizar os textos
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train_data['Description Tokenized']) + list(test_data['Description Tokenized']))
word_index = tokenizer.word_index
num_words = len(word_index) + 1

X_train = tokenizer.texts_to_sequences(train_data['Description Tokenized'])
X_test = tokenizer.texts_to_sequences(test_data['Description Tokenized'])

maxlen = max(max([len(sequence) for sequence in X_train]), max([len(sequence) for sequence in X_test]))

# Padronizar os textos
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

#labels one-hot encoded
y_train = train_data[genre_columns].values
y_test = test_data[genre_columns].values


# modelo LSTM unidirecional
embedding_dim = 100

model = Sequential()
model.add(Embedding(num_words, embedding_dim, input_length=maxlen))
model.add(LSTM(128, return_sequences=True, dropout=0.25, recurrent_dropout=0.25))
model.add(TimeDistributed(Dense(128, activation='relu')))
model.add(Dropout(0.5))
model.add(LSTM(128, dropout=0.25, recurrent_dropout=0.25))
model.add(Dense(len(genre_columns), activation='sigmoid'))

# Compilar e treinar o modelo
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

num_epochs = 15
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), verbose=2)

# 4. Avaliar o modelo
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss:', loss)
print('Accuracy:', accuracy)

Epoch 1/15
160/160 - 119s - loss: 0.5643 - accuracy: 0.2618 - val_loss: 0.5585 - val_accuracy: 0.2648 - 119s/epoch - 744ms/step
Epoch 2/15
160/160 - 75s - loss: 0.5581 - accuracy: 0.2618 - val_loss: 0.5579 - val_accuracy: 0.2648 - 75s/epoch - 471ms/step
Epoch 3/15
160/160 - 91s - loss: 0.5579 - accuracy: 0.2608 - val_loss: 0.5597 - val_accuracy: 0.2648 - 91s/epoch - 568ms/step
Epoch 4/15
160/160 - 79s - loss: 0.5560 - accuracy: 0.2618 - val_loss: 0.5420 - val_accuracy: 0.2906 - 79s/epoch - 492ms/step
Epoch 5/15
160/160 - 107s - loss: 0.5255 - accuracy: 0.2913 - val_loss: 0.5166 - val_accuracy: 0.3172 - 107s/epoch - 667ms/step
Epoch 6/15
160/160 - 114s - loss: 0.4875 - accuracy: 0.3194 - val_loss: 0.4997 - val_accuracy: 0.3328 - 114s/epoch - 711ms/step
Epoch 7/15
160/160 - 134s - loss: 0.4450 - accuracy: 0.3499 - val_loss: 0.4597 - val_accuracy: 0.3445 - 134s/epoch - 839ms/step
Epoch 8/15
160/160 - 155s - loss: 0.4050 - accuracy: 0.3452 - val_loss: 0.4413 - val_accuracy: 0.3391 - 155s/e

In [31]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_binarized = np.round(y_pred)

report = classification_report(y_test, y_pred_binarized, target_names=genre_columns, zero_division=0)
print(report)

                         precision    recall  f1-score   support

                 Comedy       0.75      0.66      0.70       487
                  Crime       0.78      0.76      0.77       271
                  Drama       0.73      0.83      0.78       613
                Romance       0.67      0.64      0.66       270
   Action and Adventure       0.70      0.69      0.69       463
Documentary and History       0.77      0.58      0.66       172
   Family and Animation       0.70      0.68      0.69       275
     Fantasy and Sci-Fi       0.67      0.66      0.66       259
    Horror and Thriller       0.65      0.62      0.64       309

              micro avg       0.71      0.70      0.71      3119
              macro avg       0.71      0.68      0.69      3119
           weighted avg       0.71      0.70      0.70      3119
            samples avg       0.71      0.68      0.68      3119

