In [6]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split


df = pd.read_csv('../data/test.csv')
# 1. Pré-processar os dados
# Carregue seus datasets balanceados
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

category_columns = train_data.columns.drop(['Name', 'Description Tokenized'])

# Defina os parâmetros de pré-processamento
max_features = 10000  # Número máximo de palavras a serem usadas (palavras mais frequentes)
maxlen = 200  # Número máximo de palavras no texto

# Tokenize os textos
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_data['Description Tokenized'])

X_train = tokenizer.texts_to_sequences(train_data['Description Tokenized'])
X_test = tokenizer.texts_to_sequences(test_data['Description Tokenized'])

# Padronize os textos
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

# Obtenha os labels one-hot encoded
y_train = train_data[category_columns].values
y_test = test_data[category_columns].values

# 2. Construir o modelo
embedding_dim = 128  # Dimensão do vetor de embedding
lstm_units = 64  # Unidades LSTM

model = Sequential([
    Embedding(max_features, embedding_dim, input_length=maxlen),
    Bidirectional(LSTM(lstm_units, return_sequences=True)),
    Bidirectional(LSTM(lstm_units)),
    Dense(len(category_columns), activation='sigmoid')
])

# 3. Treinar o modelo
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

batch_size = 32
epochs = 10

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

# 4. Avaliar o modelo
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss:', loss)
print('Accuracy:', accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 0.5626355409622192
Accuracy: 0.42265623807907104


In [None]:
!pip install torch

In [7]:
from sklearn.metrics import classification_report

# Prever e converter as previsões para o formato binarizado
y_pred = model.predict(X_test)
y_pred_binarized = np.round(y_pred)

# Calcular e exibir métricas de avaliação
report = classification_report(y_test, y_pred_binarized, target_names=category_columns, zero_division=0)
print(report)

                         precision    recall  f1-score   support

                 Comedy       0.70      0.76      0.73       487
                  Crime       0.72      0.82      0.76       271
                  Drama       0.79      0.81      0.80       613
                Romance       0.69      0.67      0.68       270
   Action and Adventure       0.80      0.73      0.76       463
Documentary and History       0.77      0.65      0.70       172
   Family and Animation       0.77      0.70      0.73       275
     Fantasy and Sci-Fi       0.71      0.76      0.74       259
    Horror and Thriller       0.63      0.68      0.65       309

              micro avg       0.73      0.74      0.74      3119
              macro avg       0.73      0.73      0.73      3119
           weighted avg       0.74      0.74      0.74      3119
            samples avg       0.72      0.73      0.71      3119



In [12]:
from sklearn.metrics import f1_score, average_precision_score, precision_recall_curve, precision_score, recall_score

y_pred = model.predict(X_test)
y_pred = np.round(y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
precision_micro = precision_score(y_test, y_pred, average='micro')
precision_macro = precision_score(y_test, y_pred, average='macro')
recall_micro = recall_score(y_test, y_pred, average='micro')
recall_macro = recall_score(y_test, y_pred, average='macro')
print("F1-score (micro): {:.2f}".format(f1_micro))
print("F1-score (macro): {:.2f}".format(f1_macro))
print("Precision (micro): {:.2f}".format(precision_micro))
print("Precision (macro): {:.2f}".format(precision_macro))
print("Recall (micro): {:.2f}".format(recall_micro))
print("Recall (macro): {:.2f}".format(recall_macro))

y_pred = model.predict(X_test)
f1_scores_per_class = f1_score(y_test, np.round(y_pred), average=None)
auc_pr_per_class = average_precision_score(y_test, y_pred, average=None)
for i, category in enumerate(category_columns):
    print("Class: {}".format(category))
    print("F1-score: {:.2f}".format(f1_scores_per_class[i]))
    print("AUC-PR: {:.2f}".format(auc_pr_per_class[i]))
    print("\n")

F1-score (micro): 0.57
F1-score (macro): 0.50
Precision (micro): 0.63
Precision (macro): 0.57
Recall (micro): 0.53
Recall (macro): 0.46
Class: Comedy
F1-score: 0.55
AUC-PR: 0.63


Class: Crime
F1-score: 0.58
AUC-PR: 0.63


Class: Drama
F1-score: 0.74
AUC-PR: 0.77


Class: Romance
F1-score: 0.41
AUC-PR: 0.46


Class: Action and Adventure
F1-score: 0.59
AUC-PR: 0.66


Class: Documentary and History
F1-score: 0.38
AUC-PR: 0.39


Class: Family and Animation
F1-score: 0.37
AUC-PR: 0.39


Class: Fantasy and Sci-Fi
F1-score: 0.44
AUC-PR: 0.39


Class: Horror and Thriller
F1-score: 0.48
AUC-PR: 0.50


