In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop

train_data = pd.read_csv('../data/balanced_train.csv')
test_data = pd.read_csv('../data/balanced_test.csv')

genre_columns = train_data.columns.drop(['Name', 'Description Tokenized'])

# Tokenizar os textos
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train_data['Description Tokenized']) + list(test_data['Description Tokenized']))
word_index = tokenizer.word_index
num_words = len(word_index) + 1

X_train = tokenizer.texts_to_sequences(train_data['Description Tokenized'])
X_test = tokenizer.texts_to_sequences(test_data['Description Tokenized'])

maxlen = max(max([len(sequence) for sequence in X_train]), max([len(sequence) for sequence in X_test]))

# Padronizar os textos
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

#labels one-hot encoded
y_train = train_data[genre_columns].values
y_test = test_data[genre_columns].values


# 2. Construir o modelo
embedding_dim = 128  # Dimensão do vetor de embedding
lstm_units = 128  # Unidades LSTM

model = Sequential([
    Embedding(num_words, embedding_dim, input_length=maxlen),
    Bidirectional(LSTM(lstm_units, return_sequences=True)),
    Dropout(0.25),
    Bidirectional(LSTM(lstm_units)),
    Dense(len(genre_columns), activation='sigmoid')
])

# 3. Treinar o modelo
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

batch_size = 32
epochs = 15

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

# 4. Avaliar o modelo
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss:', loss)
print('Accuracy:', accuracy)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Loss: 0.7414653897285461
Accuracy: 0.38984376192092896


In [2]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_binarized = np.round(y_pred)


report = classification_report(y_test, y_pred_binarized, target_names=genre_columns, zero_division=0)
print(report)

                         precision    recall  f1-score   support

                 Comedy       0.69      0.80      0.74       487
                  Crime       0.75      0.81      0.78       271
                  Drama       0.73      0.80      0.77       613
                Romance       0.75      0.67      0.71       270
   Action and Adventure       0.78      0.76      0.77       463
Documentary and History       0.81      0.55      0.65       172
   Family and Animation       0.72      0.71      0.71       275
     Fantasy and Sci-Fi       0.76      0.69      0.72       259
    Horror and Thriller       0.72      0.67      0.69       309

              micro avg       0.74      0.74      0.74      3119
              macro avg       0.74      0.72      0.73      3119
           weighted avg       0.74      0.74      0.74      3119
            samples avg       0.72      0.72      0.71      3119



In [3]:
from sklearn.metrics import precision_recall_curve, roc_curve, auc

y_pred_prob = model.predict(X_test)

pr_curves = {}
roc_curves = {}
for i, genre in enumerate(genre_columns):
    precision, recall, _ = precision_recall_curve(y_test[:, i], y_pred_prob[:, i])
    fpr, tpr, _ = roc_curve(y_test[:, i], y_pred_prob[:, i])
    pr_curves[genre] = (precision, recall)
    roc_curves[genre] = (fpr, tpr)
    
auc_pr = []
auc_roc = []
for i, genre in enumerate(genre_columns):
    ap = auc(pr_curves[genre][1], pr_curves[genre][0])
    ar = auc(roc_curves[genre][0], roc_curves[genre][1])
    auc_pr.append(ap)
    auc_roc.append(ar)

results = pd.DataFrame({'genre': genre_columns, 'AUC-PR': auc_pr, 'AUC-ROC': auc_roc})
print(results)



                     genre    AUC-PR   AUC-ROC
0                   Comedy  0.699929  0.840499
1                    Crime  0.823727  0.917444
2                    Drama  0.781922  0.824256
3                  Romance  0.725966  0.870682
4     Action and Adventure  0.787232  0.876115
5  Documentary and History  0.723734  0.915566
6     Family and Animation  0.721813  0.880213
7       Fantasy and Sci-Fi  0.735604  0.879817
8      Horror and Thriller  0.680893  0.859988


In [4]:
from sklearn.metrics import confusion_matrix

# Obter as previsões do modelo
y_pred_binarized = np.round(model.predict(X_test))

# Calcular a matriz de confusão para cada classe
for i, genre in enumerate(genre_columns):
    cm = confusion_matrix(y_test[:, i], y_pred_binarized[:, i])
    print(f'Confusion matrix for {genre}:')
    print(cm)
    print('------------------------')

Confusion matrix for Comedy:
[[617 176]
 [ 99 388]]
------------------------
Confusion matrix for Crime:
[[935  74]
 [ 51 220]]
------------------------
Confusion matrix for Drama:
[[489 178]
 [120 493]]
------------------------
Confusion matrix for Romance:
[[948  62]
 [ 88 182]]
------------------------
Confusion matrix for Action and Adventure:
[[718  99]
 [113 350]]
------------------------
Confusion matrix for Documentary and History:
[[1086   22]
 [  78   94]]
------------------------
Confusion matrix for Family and Animation:
[[929  76]
 [ 80 195]]
------------------------
Confusion matrix for Fantasy and Sci-Fi:
[[963  58]
 [ 79 180]]
------------------------
Confusion matrix for Horror and Thriller:
[[889  82]
 [103 206]]
------------------------
