<a href="https://colab.research.google.com/github/yonatanrtt/transformers_layers/blob/main/superglue_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -Uqq transformers datasets wandb --quiet

In [None]:
from datasets import load_dataset, concatenate_datasets
import wandb
import pandas as pd

In [None]:
def get_datasets():
    """Load the SuperGLUE tasks used in this notebook.

    Returns:
        A 4-tuple ``(copa, cb, rte, wic)`` holding whatever ``load_dataset``
        returns for each of those ``super_glue`` configurations, in that order.
    """
    benchmark = "super_glue"
    task_names = ("copa", "cb", "rte", "wic")
    # Tasks are loaded one after another in the order listed above.
    return tuple(load_dataset(benchmark, task) for task in task_names)

In [None]:
import torch

class CopaDataset(torch.utils.data.Dataset):
    """Dataset that turns COPA rows into (correct, incorrect) tokenized sentences.

    Each item joins the premise to one of the two candidate choices with a
    connector word ("because" for ``cause`` questions, "so" otherwise) and
    tokenizes both the sentence built from the labelled choice and the one
    built from the other choice.

    Args:
        _data: Table-like object indexed by column name whose columns support
            ``.astype`` (presumably a pandas DataFrame -- confirm with the
            caller). It must provide the columns ``_premise``, ``_choice1``,
            ``_choice2``, ``_question`` and ``_label``.
        _tokenizer: Object exposing ``encode(text, padding=..., truncation=...,
            return_tensors=...)`` (presumably a Hugging Face tokenizer).
    """

    def __init__(self, _data, _tokenizer):
        super().__init__()
        # NOTE(review): the column keys carry a leading underscore, while the
        # raw SuperGLUE COPA columns presumably do not -- confirm that the
        # caller renames the columns before building this dataset.
        self.premise = list(_data["_premise"].astype("str"))
        self.choice1 = list(_data["_choice1"].astype("str"))
        self.choice2 = list(_data["_choice2"].astype("str"))
        self.question = list(_data["_question"].astype("str"))
        self.label = list(_data["_label"].astype("int"))
        self.tokenizer = _tokenizer

    def _tokenize(self, _text):
        """Encode one sentence with the settings shared by both outputs."""
        return self.tokenizer.encode(
            _text, padding="max_length", truncation=True, return_tensors="pt"
        )

    def __getitem__(self, _idx):
        """Return ``(positive_tokenized, negative_tokenized)`` for row ``_idx``.

        The positive sentence uses the choice selected by the row's label
        (0 -> choice1, 1 -> choice2); the negative sentence uses the other one.
        """
        connector = "because" if self.question[_idx] == "cause" else "so"
        premise = self.premise[_idx]
        answers = [self.choice1[_idx], self.choice2[_idx]]

        positive_answers_idx = self.label[_idx]
        # Labels are 0 or 1, so this selects the remaining choice.
        negative_answers_idx = 1 - positive_answers_idx

        # Spaces around the connector keep it a separate word in the sentence.
        positive = f"{premise} {connector} {answers[positive_answers_idx]}"
        negative = f"{premise} {connector} {answers[negative_answers_idx]}"

        return self._tokenize(positive), self._tokenize(negative)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.label)