In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import pandas as pd
import numpy as np
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.nn.parallel import DataParallel


class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=16384):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.loc[idx, "text"]
        kind = self.data.loc[idx, "kind"]
        score = self.data.loc[idx, "score"]

        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )
        input_ids = encoding["input_ids"][0]
        attention_mask = encoding["attention_mask"][0]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "kind": torch.tensor(kind, dtype=torch.long),
            "score": torch.tensor(
                score if not np.isnan(score) else 0.0, dtype=torch.float
            ),
        }


class MultiTaskXLMRoberta(nn.Module):
    def __init__(self, cache_dir=None):
        super(MultiTaskXLMRoberta, self).__init__()
        self.xlm_roberta = XLMRobertaModel.from_pretrained(
            "xlm-roberta-base", cache_dir=cache_dir
        )
        self.classification_head = nn.Linear(self.xlm_roberta.config.hidden_size, 2)
        self.regression_head = nn.Linear(self.xlm_roberta.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.xlm_roberta(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]
        pooled_output = last_hidden_state[:, 0]

        classification_output = self.classification_head(pooled_output)
        regression_output = self.regression_head(pooled_output)
        return classification_output, regression_output.squeeze(-1)


def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    classification_loss_fn = nn.CrossEntropyLoss()
    regression_loss_fn = nn.MSELoss()

    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        kind = batch["kind"].to(device)
        score = batch["score"].to(device)

        kind_preds, score_preds = model(input_ids, attention_mask)

        classification_loss = classification_loss_fn(kind_preds, kind)
        regression_loss = regression_loss_fn(score_preds[kind == 1], score[kind == 1])

        loss = classification_loss + regression_loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(train_loader)

In [4]:
# Hyperparameters
epochs = 5
batch_size = 64
learning_rate = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cache_dir = (
    "/home/xuyijie/news-title-bias/notebooks/06.Multi-task_Learning/cache_pretrained"
)

In [5]:
# Load the data
data = pd.read_csv(
    "/home/xuyijie/news-title-bias/data/dataset/dataset_combined_multitask.csv",
    names=["text", "kind", "score"],
    header=0,
)
data["score"].fillna(0, inplace=True)
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", cache_dir=cache_dir)
dataset = NewsDataset(data, tokenizer)

In [6]:
# Load the data
data = pd.read_csv(
    "/home/xuyijie/news-title-bias/data/dataset/dataset_combined_multitask.csv",
    names=["text", "kind", "score"],
    header=0,
)
data["score"].fillna(0, inplace=True)
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base", cache_dir=cache_dir)
dataset = NewsDataset(data, tokenizer)

# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
train_dataset = NewsDataset(train_data, tokenizer)
val_dataset = NewsDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [13]:
len(train_data.iloc[1106].text)

3793

In [15]:
tokenizer(
            data['text'],
            return_tensors="pt",
            max_length=512,
            padding="max_length",
            truncation=True,
        )

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [24]:
train_data.loc[5]

text      is a nonprofit newsroom that investigates abu...
kind                                                     1
score                                                 -2.0
Name: 5, dtype: object