In [32]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import f1_score

train_data = pd.read_csv('../data/balanced_train.csv')
test_data = pd.read_csv('../data/balanced_test.csv')

genre_columns = train_data.columns.drop(['Name', 'Description Tokenized'])

# Tokenizar os textos
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(train_data['Description Tokenized']) + list(test_data['Description Tokenized']))
word_index = tokenizer.word_index
num_words = len(word_index) + 1

X_train = tokenizer.texts_to_sequences(train_data['Description Tokenized'])
X_test = tokenizer.texts_to_sequences(test_data['Description Tokenized'])

maxlen = max(max([len(sequence) for sequence in X_train]), max([len(sequence) for sequence in X_test]))

# Padronizar os textos
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

#labels one-hot encoded
y_train = train_data[genre_columns].values
y_test = test_data[genre_columns].values
# Crie o modelo RNN simples com uma camada GRU
embedding_dim = 100

model = Sequential()
model.add(Embedding(num_words, embedding_dim, input_length=maxlen))
model.add(GRU(128, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(genre_columns), activation='sigmoid'))

# Compile e treine o modelo
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[Precision(), Recall()])

num_epochs = 15
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), verbose=2)


Epoch 1/15
160/160 - 62s - loss: 0.5586 - precision_14: 0.5192 - recall_14: 0.1329 - val_loss: 0.5247 - val_precision_14: 0.5777 - val_recall_14: 0.2193 - 62s/epoch - 388ms/step
Epoch 2/15
160/160 - 43s - loss: 0.4795 - precision_14: 0.6310 - recall_14: 0.3830 - val_loss: 0.4368 - val_precision_14: 0.6759 - val_recall_14: 0.4755 - 43s/epoch - 268ms/step
Epoch 3/15
160/160 - 35s - loss: 0.3923 - precision_14: 0.7014 - recall_14: 0.5792 - val_loss: 0.4004 - val_precision_14: 0.6940 - val_recall_14: 0.5906 - 35s/epoch - 221ms/step
Epoch 4/15
160/160 - 41s - loss: 0.3234 - precision_14: 0.7612 - recall_14: 0.6918 - val_loss: 0.3730 - val_precision_14: 0.7144 - val_recall_14: 0.6345 - 41s/epoch - 254ms/step
Epoch 5/15
160/160 - 50s - loss: 0.2681 - precision_14: 0.8077 - recall_14: 0.7632 - val_loss: 0.3562 - val_precision_14: 0.7402 - val_recall_14: 0.6743 - 50s/epoch - 311ms/step
Epoch 6/15
160/160 - 47s - loss: 0.2314 - precision_14: 0.8346 - recall_14: 0.8023 - val_loss: 0.3597 - val_pr

In [33]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
y_pred_binarized = np.round(y_pred)


report = classification_report(y_test, y_pred_binarized, target_names=genre_columns, zero_division=0)
print(report)

                         precision    recall  f1-score   support

                 Comedy       0.72      0.72      0.72       487
                  Crime       0.80      0.79      0.79       271
                  Drama       0.82      0.72      0.76       613
                Romance       0.86      0.51      0.64       270
   Action and Adventure       0.82      0.73      0.77       463
Documentary and History       0.72      0.67      0.69       172
   Family and Animation       0.77      0.74      0.76       275
     Fantasy and Sci-Fi       0.79      0.76      0.77       259
    Horror and Thriller       0.77      0.65      0.70       309

              micro avg       0.79      0.71      0.74      3119
              macro avg       0.79      0.70      0.74      3119
           weighted avg       0.79      0.71      0.74      3119
            samples avg       0.72      0.69      0.69      3119



In [34]:
from sklearn.metrics import precision_recall_curve, roc_curve, auc

y_pred_prob = model.predict(X_test)

pr_curves = {}
roc_curves = {}
for i, genre in enumerate(genre_columns):
    precision, recall, _ = precision_recall_curve(y_test[:, i], y_pred_prob[:, i])
    fpr, tpr, _ = roc_curve(y_test[:, i], y_pred_prob[:, i])
    pr_curves[genre] = (precision, recall)
    roc_curves[genre] = (fpr, tpr)
    
auc_pr = []
auc_roc = []
for i, genre in enumerate(genre_columns):
    ap = auc(pr_curves[genre][1], pr_curves[genre][0])
    ar = auc(roc_curves[genre][0], roc_curves[genre][1])
    auc_pr.append(ap)
    auc_roc.append(ar)

results = pd.DataFrame({'genre': genre_columns, 'AUC-PR': auc_pr, 'AUC-ROC': auc_roc})
print(results)


                     genre    AUC-PR   AUC-ROC
0                   Comedy  0.836712  0.880782
1                    Crime  0.855340  0.940698
2                    Drama  0.866818  0.883721
3                  Romance  0.780106  0.905728
4     Action and Adventure  0.860132  0.908932
5  Documentary and History  0.786522  0.947318
6     Family and Animation  0.847249  0.930048
7       Fantasy and Sci-Fi  0.853719  0.929730
8      Horror and Thriller  0.775028  0.889214


In [35]:
from sklearn.metrics import confusion_matrix

# Obter as previsões do modelo
y_pred_binarized = np.round(model.predict(X_test))

# Calcular a matriz de confusão para cada classe
for i, genre in enumerate(genre_columns):
    cm = confusion_matrix(y_test[:, i], y_pred_binarized[:, i])
    print(f'Confusion matrix for {genre}:')
    print(cm)
    print('------------------------')

Confusion matrix for Comedy:
[[657 136]
 [134 353]]
------------------------
Confusion matrix for Crime:
[[954  55]
 [ 56 215]]
------------------------
Confusion matrix for Drama:
[[568  99]
 [173 440]]
------------------------
Confusion matrix for Romance:
[[987  23]
 [132 138]]
------------------------
Confusion matrix for Action and Adventure:
[[745  72]
 [125 338]]
------------------------
Confusion matrix for Documentary and History:
[[1064   44]
 [  57  115]]
------------------------
Confusion matrix for Family and Animation:
[[945  60]
 [ 71 204]]
------------------------
Confusion matrix for Fantasy and Sci-Fi:
[[970  51]
 [ 63 196]]
------------------------
Confusion matrix for Horror and Thriller:
[[910  61]
 [108 201]]
------------------------
