In [2]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from transformers import AutoTokenizer, DataCollatorWithPadding
from lightning import LightningDataModule
from torch.utils.data import DataLoader
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from transformers import AutoTokenizer, DataCollatorWithPadding
from lightning import LightningDataModule
from torch.utils.data import DataLoader
import pandas as pd

class RottenDataLoaderForMaskedLM(LightningDataModule):
    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(
            'bert-base-uncased', use_fast=True
        )
        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer, return_tensors="pt"
        )

    def prepare_data(self):
        df_scraped = pd.read_json('../../rt_scraper/results/scraped_dataset_sans_already_present_quotes.json')
        ds_scraped = Dataset.from_pandas(df_scraped)
        ds_scraped = ds_scraped.remove_columns(["__index_level_0__"])
        ds = ds_scraped.map(self.tokenize, batched=True)
        ds = ds.shuffle(seed=42)
        ds = ds.remove_columns(["text", "label", "token_type_ids"])
        return ds

    def tokenize(self, batch):
        batch_tokenized = self.tokenizer(
            batch["text"],
            padding=False,
            truncation=False,
        )
        if self.tokenizer.is_fast:
            batch_tokenized["word_ids"] = [
                batch_tokenized.word_ids(i)
                for i in range(len(batch_tokenized["input_ids"]))
            ]
        return batch_tokenized

    def split_dataset_into_equally_sized_chunks(self, batch):
        concatenated_batch = {k: sum(batch[k], []) for k in batch.keys()}
        chunks = self._split_concatenated_batch_into_equal_chunks(concatenated_batch)
        last_chunk_size = len(chunks["input_ids"][-1])
        if last_chunk_size != 128:
            chunks = self._drop_last_chunk(chunks)
        chunks["labels"] = chunks["input_ids"].copy()
        return chunks

    def _split_concatenated_batch_into_equal_chunks(self, concatenated_batch):
        total_length = len(concatenated_batch["input_ids"])
        chunks = {
            column: [
                values[i : i + 128]
                for i in range(0, total_length, 128)
            ]
            for column, values in concatenated_batch.items()
        }
        return chunks
    
    def _drop_last_chunk(self, chunks):
        return {column: values[:-1] for column, values in chunks.items()}


In [4]:
a = RottenDataLoaderForMaskedLM()

In [5]:
ds = a.prepare_data()

Map: 100%|██████████| 56236/56236 [00:02<00:00, 19782.30 examples/s]


In [6]:
len([256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 64])

130

In [7]:
c = ds.map(a.split_dataset_into_equally_sized_chunks, batched=True)

Map:   0%|          | 0/56236 [00:00<?, ? examples/s]

Map: 100%|██████████| 56236/56236 [00:10<00:00, 5367.83 examples/s]


In [8]:
c.train_test_split(train_size=0.93, seed=42)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 13392
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1009
    })
})

In [9]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
)

In [10]:
data_collator = DataCollatorForLanguageModeling(
            tokenizer=a.tokenizer,
            mlm_probability=0.2,
            return_tensors="pt",
        )

In [11]:
samples = [c[i] for i in range(5)]
for sample in samples:
    _ = sample.pop("word_ids")

In [12]:

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

NameError: name 'tokenizer' is not defined

In [None]:
[len(i) for i in {column: values[:-1] for column, values in c.items()}["attention_mask"]]

[256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256,
 256]

In [None]:
import torch
from transformers import AutoModelForMaskedLM, AdamW


class RottenTomatoesDomainAdaptationModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForMaskedLM.from_pretrained(
           "bert-base-uncased"
        )

    def __name__(self):
        return 'aaa'

    def forward(self, x):
        output = self.model(
            **{
                k: v
                for k, v in x.items()
                if k in ["input_ids", "attention_mask", "labels"]
            }
        )
        return output


In [None]:
model = RottenTomatoesDomainAdaptationModel()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [111]:
z = data_collator(samples)

In [114]:
tokenizer.decode(z["input_ids"][0])

'[CLS] the weakening takes a silly idea and somehow makes [MASK] work, helped [MASK] by the fact that [MASK] has got buckets of [MASK] [MASK] [SEP] [CLS] [MASK] has a rather dull [MASK], with [MASK] focus on [MASK]lip economics in the [MASK] century. [SEP] [CLS] richard jenkins is [MASK] [MASK]sm [MASK] [MASK] presence in the film. [SEP] [CLS] [MASK] animation is [MASK], the direction is limited, in all respects, by imitating [MASK] blockbusters, and the cultural [MASK] are so outdated that they [MASK] embarrassing [MASK] [ fulldal in spanish ] [SEP] [CLS] this movie was not [MASK] a movie. [SEP] [CLS] the production and attention to [MASK] here is [MASK] level [MASK].'

In [117]:
tokenizer.decode([103])

'[MASK]'

In [115]:
z["input_ids"][0]

tensor([  101,  1996, 22031,  3138,  1037, 10021,  2801,  1998,  5064,  3084,
          103,  2147,  1010,  3271,   103,  2011,  1996,  2755,  2008,   103,
         2038,  2288, 13610,  2015,  1997,   103,   103,   102,   101,   103,
         2038,  1037,  2738, 10634,   103,  1010,  2007,   103,  3579,  2006,
          103, 15000,  5543,  1999,  1996,   103,  2301,  1012,   102,   101,
         2957, 11098,  2003,   103,   103,  6491,   103,   103,  3739,  1999,
         1996,  2143,  1012,   102,   101,   103,  7284,  2003,   103,  1010,
         1996,  3257,  2003,  3132,  1010,  1999,  2035, 17475,  1010,  2011,
        10047, 16518,   103, 27858,  2015,  1010,  1998,  1996,  3451,   103,
         2024,  2061, 25963,  2008,  2027,   103, 16436,   103,  1031,  2440,
         9305,  1999,  3009,  1033,   102,   101,  2023,  3185,  2001,  2025,
          103,  1037,  3185,  1012,   102,   101,  1996,  2537,  1998,  3086,
         2000,   103,  2182,  2003,   103,  2504,   103,  1012])

In [1]:
z

NameError: name 'z' is not defined

In [118]:
x = model(z)

In [124]:
x.logits[0]

tensor([[ -6.6232,  -6.5818,  -6.5687,  ...,  -5.9188,  -5.7877,  -3.8888],
        [-10.6456, -10.4020, -10.6074,  ...,  -9.2321,  -8.5568,  -8.0614],
        [ -5.7107,  -5.6532,  -5.8286,  ...,  -4.9614,  -4.6639,  -6.0913],
        ...,
        [ -5.0434,  -5.0274,  -4.9778,  ...,  -5.1552,  -4.5794,  -2.8990],
        [ -5.0337,  -5.0674,  -5.0946,  ...,  -3.6746,  -5.0193,  -4.1733],
        [-14.1275, -13.4685, -14.0399,  ..., -10.8737, -11.4878,  -8.4017]],
       grad_fn=<SelectBackward0>)

In [130]:
z["labels"]

tensor([[ -100,  -100,  5472,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          2009,  -100,  -100,  -100,  6551,  -100,  -100,  -100,  -100,  2009,
          -100,  -100, 13610,  -100,  -100, 11084,  1012,  -100,  -100,  2009,
          -100,  -100,  2738, 10634, 18458,  -100,  -100,  1037,  -100,  -100,
         10722,  -100,  -100,  -100,  -100,  5550,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  1037,  2033,  -100, 11124,  6774,  3739,  -100,
          -100,  -100,  -100,  -100,  -100,  1996,  -100,  -100, 10551,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  3025,  -100,  -100,  -100,  -100,  -100,  -100,  7604,
          -100,  -100,  -100,  -100,  -100,  2024,  -100,  1012,  -100,  -100,
          3319,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          2130,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  6987,  -100,  -100,  2279,  -100,  

In [126]:
z['input_ids'].shape

torch.Size([5, 128])