In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn


# Load the train and test splits.
train_data = pd.read_csv('../data/balanced_train.csv')

test_data = pd.read_csv('../data/balanced_test.csv')


# Every column other than the title and the description text is used as a
# genre label (one model output per column).
genre_columns = train_data.columns.drop(['Name', 'Description Tokenized'])

# Fail fast if the test split lacks any label column taken from the train
# split, instead of hitting a KeyError later while batches are being built.
missing_in_test = [col for col in genre_columns if col not in test_data.columns]
if missing_in_test:
    raise ValueError(f"Test set is missing label columns: {missing_in_test}")

# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hyperparameters
max_len = 40
batch_size = 16
epochs = 5
learning_rate = 2e-5
seed = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the RNGs so that DataLoader shuffling and the random initialisation of
# the classification head are repeatable across kernel restarts.
np.random.seed(seed)
torch.manual_seed(seed)

# Custom Dataset class
class MovieDataset(Dataset):
    """Map-style dataset yielding a tokenized description and its label vector.

    Args:
        data: DataFrame with a 'Description Tokenized' text column and one
            numeric column per label.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns a mapping with
            'input_ids' and 'attention_mask' tensors carrying a leading batch
            dimension.
        max_len: Length every description is padded / truncated to.
        label_columns: Names of the label columns. When omitted, the
            notebook-level ``genre_columns`` is used, which is what this class
            always read before the parameter existed.

    Raises:
        KeyError: If the text column or a label column is missing from ``data``.
        ValueError: If a label column cannot be converted to float32.
    """

    def __init__(self, data, tokenizer, max_len, label_columns=None):
        if label_columns is None:
            # Backward-compatible fallback to the module-level label list.
            label_columns = genre_columns

        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_columns = list(label_columns)

        # Materialise texts and labels once. Building a row Series with
        # ``iloc`` on every access is slow, yields an object-dtype row, and
        # makes ``torch.tensor`` index that Series by integer position.
        self.texts = data['Description Tokenized'].tolist()
        self.labels = data[self.label_columns].to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the 'input_ids', 'attention_mask' and 'labels' tensors for row ``idx``."""
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        tokens = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # [0] drops the batch dimension added by return_tensors='pt'.
            'input_ids': tokens['input_ids'][0],
            'attention_mask': tokens['attention_mask'][0],
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Binary cross-entropy loss function (takes raw logits)
loss_fn = nn.BCEWithLogitsLoss()

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its return value must expose ``.logits``.
        data_loader: Iterable of dicts with 'input_ids', 'attention_mask' and
            'labels' tensors.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    model = model.train()
    batch_losses = []

    for batch in data_loader:
        ids, mask, targets = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )

        logits = model(input_ids=ids, attention_mask=mask).logits
        batch_loss = loss_fn(logits, targets)
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        # Clear gradients so the next batch starts from a clean slate.
        optimizer.zero_grad()

    return np.mean(batch_losses)

def evaluate(model, data_loader, loss_fn, device):
    """Return the mean batch loss over ``data_loader`` without updating the model.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its return value must expose ``.logits``.
        data_loader: Iterable of dicts with 'input_ids', 'attention_mask' and
            'labels' tensors.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar tensor.
        device: Device every batch tensor is moved to.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    model = model.eval()
    batch_losses = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ("input_ids", "attention_mask", "labels")
            )
            logits = model(input_ids=ids, attention_mask=mask).logits
            batch_losses.append(loss_fn(logits, targets).item())

    return np.mean(batch_losses)

def predict(model, data_loader, device):
    """Return the sigmoid of the model's logits for every example in ``data_loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its return value must expose ``.logits``.
        data_loader: Iterable of dicts with 'input_ids' and 'attention_mask'
            tensors.
        device: Device every batch tensor is moved to.

    Returns:
        NumPy array with one row of sigmoid outputs per example, in the order
        the loader yielded them.
    """
    model = model.eval()
    per_batch = []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            per_batch.append(torch.sigmoid(logits).cpu().numpy())

    # Flatten the per-batch arrays into a single list of rows before stacking.
    return np.array([row for probs in per_batch for row in probs])

# Build the DataLoaders
train_dataset = MovieDataset(train_data, tokenizer, max_len)
test_dataset = MovieDataset(test_data, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load the pretrained model and replace its output layer.
# The number of outputs is derived from the label columns rather than
# hard-coded, so the logits always match the label vectors fed to the loss.
num_labels = len(genre_columns)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels).to(device)
model.classifier = nn.Linear(model.config.hidden_size, num_labels).to(device)

# Optimizer and learning-rate scheduler.
# transformers.AdamW is deprecated in favour of torch.optim.AdamW; eps and
# weight_decay are spelled out to keep the deprecated class's defaults.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training and evaluation
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device, scheduler)
    print(f"Epoch {epoch + 1}/{epochs} | Train loss: {train_loss}")

    test_loss = evaluate(model, test_loader, loss_fn, device)
    print(f"Epoch {epoch + 1}/{epochs} | Test loss: {test_loss}")

# Predictions on the test set: per-label sigmoid scores, thresholded at 0.5
predictions = predict(model, test_loader, device)
predictions_binary = (predictions > 0.5).astype(int)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/5 | Train loss: 0.5006270631216466
Epoch 1/5 | Test loss: 0.435180227458477
Epoch 2/5 | Train loss: 0.40478939963504673
Epoch 2/5 | Test loss: 0.39761337377130984
Epoch 3/5 | Train loss: 0.3467337834648788
Epoch 3/5 | Test loss: 0.37180393375456333
Epoch 4/5 | Train loss: 0.30355998980812726
Epoch 4/5 | Test loss: 0.3579508002847433
Epoch 5/5 | Train loss: 0.27615680997259917
Epoch 5/5 | Test loss: 0.3555926736444235


In [3]:
# Evaluate model performance with the F1 metric
# Ground-truth label matrix; adjust to the genres present in your dataset.
test_labels = test_data[genre_columns].to_numpy()
f1_macro, f1_samples = (
    f1_score(test_labels, predictions_binary, average=mode)
    for mode in ('macro', 'samples')
)

print(f"F1 Macro: {f1_macro}")
print(f"F1 Samples: {f1_samples}")

F1 Macro: 0.6980843183987057
F1 Samples: 0.6809393601190477


In [5]:
from sklearn.metrics import classification_report

# Per-genre precision / recall / F1 table.
# test_labels is expected to be a NumPy array with the test set's true labels.
report = classification_report(
    y_true=test_labels,
    y_pred=predictions_binary,
    target_names=genre_columns,
    zero_division=0,
)
print(report)

                         precision    recall  f1-score   support

                 Comedy       0.71      0.62      0.67       487
                  Crime       0.81      0.72      0.76       271
                  Drama       0.76      0.77      0.77       613
                Romance       0.68      0.55      0.61       270
   Action and Adventure       0.78      0.73      0.75       463
Documentary and History       0.73      0.65      0.69       172
   Family and Animation       0.76      0.69      0.72       275
     Fantasy and Sci-Fi       0.71      0.65      0.68       259
    Horror and Thriller       0.68      0.61      0.64       309

              micro avg       0.74      0.68      0.71      3119
              macro avg       0.73      0.67      0.70      3119
           weighted avg       0.74      0.68      0.71      3119
            samples avg       0.74      0.68      0.68      3119



In [9]:
from sklearn.metrics import precision_recall_curve, roc_curve, auc


# Precision-recall and ROC curve points for each genre, keyed by genre name.
pr_curves = {}
roc_curves = {}
for i, genre in enumerate(genre_columns):
    y_true, y_score = test_labels[:, i], predictions[:, i]
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    fpr, tpr, _ = roc_curve(y_true, y_score)
    pr_curves[genre] = (precision, recall)
    roc_curves[genre] = (fpr, tpr)

# Area under each curve: recall is the x-axis for PR, fpr is the x-axis for ROC.
auc_pr = [auc(pr_curves[genre][1], pr_curves[genre][0]) for genre in genre_columns]
auc_roc = [auc(roc_curves[genre][0], roc_curves[genre][1]) for genre in genre_columns]

results = pd.DataFrame({'genre': genre_columns, 'AUC-PR': auc_pr, 'AUC-ROC': auc_roc})
print(results)


                     genre    AUC-PR   AUC-ROC
0                   Comedy  0.740795  0.823585
1                    Crime  0.816305  0.916625
2                    Drama  0.817795  0.848038
3                  Romance  0.641997  0.864283
4     Action and Adventure  0.826346  0.887924
5  Documentary and History  0.742287  0.930621
6     Family and Animation  0.763771  0.903410
7       Fantasy and Sci-Fi  0.727925  0.897939
8      Horror and Thriller  0.693512  0.862318


In [10]:
from sklearn.metrics import confusion_matrix, f1_score


# Compute and print the confusion matrix for each genre.
# Transposing walks both label matrices one genre column at a time.
for genre, y_true, y_pred in zip(genre_columns, test_labels.T, predictions_binary.T):
    cm = confusion_matrix(y_true, y_pred)
    print(f'\nMatriz de confusão para {genre}:')
    print(cm)


Matriz de confusão para Comedy:
[[671 122]
 [183 304]]

Matriz de confusão para Crime:
[[962  47]
 [ 76 195]]

Matriz de confusão para Drama:
[[520 147]
 [139 474]]

Matriz de confusão para Romance:
[[939  71]
 [122 148]]

Matriz de confusão para Action and Adventure:
[[721  96]
 [125 338]]

Matriz de confusão para Documentary and History:
[[1067   41]
 [  61  111]]

Matriz de confusão para Family and Animation:
[[944  61]
 [ 86 189]]

Matriz de confusão para Fantasy and Sci-Fi:
[[954  67]
 [ 91 168]]

Matriz de confusão para Horror and Thriller:
[[880  91]
 [119 190]]
