In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# Seed the NumPy and PyTorch generators up front so that the classifier-head
# initialisation and the DataLoader shuffling are repeatable across
# "Restart Kernel -> Run All" (as far as the compute backend allows).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pre-processed train / test splits.
train_data = pd.read_csv('../data/preprocessed/movies_genres_grouped_train_preprocessed.csv')
test_data = pd.read_csv('../data/preprocessed/movies_genres_grouped_test_preprocessed.csv')

# Every column other than these three is treated as a genre label column.
category_columns = train_data.columns.drop(['Name', 'Description', 'Combined'])

# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hyperparameters
max_len = 128          # token length every sample is padded / truncated to
batch_size = 16
epochs = 4
learning_rate = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Custom Dataset class
class MovieDataset(Dataset):
    """Dataset yielding tokenized movie text together with its genre label vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a ``'Combined'`` text column plus one column per label.
    tokenizer
        Object exposing ``encode_plus`` (the notebook passes a ``BertTokenizer``).
    max_len : int
        Length every encoded sequence is padded / truncated to.
    label_columns : sequence of str, optional
        Names of the label columns. When omitted, the notebook-level
        ``category_columns`` is used, which is what the original class read.

    Each item is a dict with ``'input_ids'``, ``'attention_mask'`` and a
    float ``'labels'`` tensor, in that key order.
    """

    def __init__(self, data, tokenizer, max_len, label_columns=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

        if label_columns is None:
            # Backward-compatible fallback to the notebook global.
            label_columns = category_columns
        self.label_columns = list(label_columns)

        # Materialise the labels once as a float32 matrix. This avoids building
        # a full row Series per item and handing a pandas Series to
        # ``torch.tensor`` on every ``__getitem__`` call.
        self._labels = data[self.label_columns].to_numpy(dtype="float32")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Read only the text cell instead of the whole row.
        text = self.data['Combined'].iloc[idx]

        tokens = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # ``return_tensors='pt'`` adds a leading batch axis; index 0 drops it.
            'input_ids': tokens['input_ids'][0],
            'attention_mask': tokens['attention_mask'][0],
            'labels': torch.tensor(self._labels[idx], dtype=torch.float)
        }

# Binary cross-entropy loss computed on raw logits; shared by training and evaluation.
loss_fn = nn.BCEWithLogitsLoss()

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Parameters
    ----------
    model : callable taking ``input_ids`` / ``attention_mask`` keywords and
        returning an object with a ``.logits`` attribute.
    data_loader : iterable of dicts with ``"input_ids"``, ``"attention_mask"``
        and ``"labels"`` tensors.
    loss_fn : callable ``(logits, labels) -> scalar tensor``.
    optimizer, scheduler : both are stepped once per batch.
    device : device the batch tensors are moved to.

    Returns
    -------
    The ``np.mean`` of the per-batch loss values.
    """
    model = model.train()
    losses = []

    # Start from clean gradients. Without this, gradients left on the
    # parameters before the call (e.g. by an interrupted notebook cell) would be
    # accumulated into the first update.
    optimizer.zero_grad()

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels)
        losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses)

def evaluate(model, data_loader, loss_fn, device):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is switched to eval mode and every forward pass runs under
    ``torch.no_grad()``; no parameters are updated.
    """
    net = model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            result = net(input_ids=ids, attention_mask=mask)
            batch_loss = loss_fn(result.logits, targets)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

def predict(model, data_loader, device):
    """Return sigmoid probabilities for every sample served by ``data_loader``.

    The model runs in eval mode without gradient tracking. The per-sample
    rows of each batch are gathered in loader order and stacked into a
    single NumPy array.
    """
    net = model.eval()
    probabilities = []

    with torch.no_grad():
        for batch in data_loader:
            result = net(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            batch_probs = torch.sigmoid(result.logits).cpu().numpy()
            for row in batch_probs:
                probabilities.append(row)

    return np.array(probabilities)

# Prepare the DataLoaders
train_dataset = MovieDataset(train_data, tokenizer, max_len)
test_dataset = MovieDataset(test_data, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load the pretrained model with a classification head sized to the label set.
# The head width is derived from the data instead of loading with num_labels=3
# and then overwriting the classifier with a hard-coded 9-unit layer, so
# model.config.num_labels and the actual head always agree.
num_genres = len(category_columns)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_genres).to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * epochs  # the scheduler is stepped once per batch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training and evaluation
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device, scheduler)
    print(f"Epoch {epoch + 1}/{epochs} | Train loss: {train_loss}")

    test_loss = evaluate(model, test_loader, loss_fn, device)
    print(f"Epoch {epoch + 1}/{epochs} | Test loss: {test_loss}")

# Predict on the test set; a genre is assigned when its probability exceeds 0.5
predictions = predict(model, test_loader, device)
predictions_binary = (predictions > 0.5).astype(int)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassificatio

Epoch 1/4 | Train loss: 0.43562499226645873
Epoch 1/4 | Test loss: 0.40666293950728427
Epoch 2/4 | Train loss: 0.340102312904833
Epoch 2/4 | Test loss: 0.3749111298425698
Epoch 3/4 | Train loss: 0.29641094534836093
Epoch 3/4 | Test loss: 0.3711471141856394
Epoch 4/4 | Train loss: 0.2696808441020952
Epoch 4/4 | Test loss: 0.3688751810494764


KeyError: "None of [Index(['genre1', 'genre2', 'genre3'], dtype='object')] are in the [columns]"

In [5]:

# Score the thresholded predictions with F1, averaged per genre ('macro')
# and per sample ('samples').
test_labels = test_data[category_columns].to_numpy()
f1_macro, f1_samples = (
    f1_score(test_labels, predictions_binary, average=averaging)
    for averaging in ('macro', 'samples')
)

print(f"F1 Macro: {f1_macro}")
print(f"F1 Samples: {f1_samples}")

F1 Macro: 0.5634552851835604
F1 Samples: 0.62287865301884


In [7]:
from sklearn.metrics import classification_report

# Per-genre precision / recall / F1 table; test_labels holds the true
# test-set labels as a NumPy array.
report = classification_report(
    y_true=test_labels,
    y_pred=predictions_binary,
    target_names=category_columns,
    zero_division=0,
)
print(report)

                         precision    recall  f1-score   support

                 Comedy       0.80      0.53      0.64       502
                  Crime       0.81      0.54      0.65       259
                  Drama       0.80      0.72      0.76       757
                Romance       0.83      0.19      0.31       272
   Action and Adventure       0.79      0.58      0.67       450
Documentary and History       0.80      0.39      0.52       144
   Family and Animation       0.81      0.30      0.44       141
     Fantasy and Sci-Fi       0.66      0.42      0.51       154
    Horror and Thriller       0.62      0.52      0.57       291

              micro avg       0.77      0.53      0.63      2970
              macro avg       0.77      0.47      0.56      2970
           weighted avg       0.78      0.53      0.62      2970
            samples avg       0.78      0.57      0.62      2970



In [10]:
from sklearn.metrics import average_precision_score

# Average precision (area under the precision-recall curve) per genre,
# computed from the raw probabilities rather than the thresholded labels.
auc_pr_scores = [
    average_precision_score(test_labels[:, column], predictions[:, column])
    for column in range(len(category_columns))
]

# Print one line per genre
for position, genre in enumerate(category_columns):
    print(f'AUC-PR : {genre}: {auc_pr_scores[position]:.4f}')

AUC-PR : Comedy: 0.7902
AUC-PR : Crime: 0.7448
AUC-PR : Drama: 0.8399
AUC-PR : Romance: 0.6160
AUC-PR : Action and Adventure: 0.7937
AUC-PR : Documentary and History: 0.6495
AUC-PR : Family and Animation: 0.6211
AUC-PR : Fantasy and Sci-Fi: 0.5707
AUC-PR : Horror and Thriller: 0.6162


In [12]:
from sklearn.metrics import confusion_matrix, f1_score


# Compute and print one binary confusion matrix per genre
# (scikit-learn lays it out with true classes as rows, predictions as columns).
for i in range(len(category_columns)):
    genre = category_columns[i]
    cm = confusion_matrix(y_true=test_labels[:, i], y_pred=predictions_binary[:, i])
    print(f'\nMatriz de confusão para {genre}:')
    print(cm)


Matriz de confusão para Comedy:
[[715  67]
 [234 268]]

Matriz de confusão para Crime:
[[992  33]
 [119 140]]

Matriz de confusão para Drama:
[[389 138]
 [212 545]]

Matriz de confusão para Romance:
[[1001   11]
 [ 220   52]]

Matriz de confusão para Action and Adventure:
[[766  68]
 [187 263]]

Matriz de confusão para Documentary and History:
[[1126   14]
 [  88   56]]

Matriz de confusão para Family and Animation:
[[1133   10]
 [  98   43]]

Matriz de confusão para Fantasy and Sci-Fi:
[[1097   33]
 [  90   64]]

Matriz de confusão para Horror and Thriller:
[[898  95]
 [139 152]]
