In [1]:
try:
    # This library is our indicator that the required installs
    # need to be done.
    import datasets
except ModuleNotFoundError:
    !git clone https://github.com/wabecoff/NLU/
    !pip install -r NLU/requirements.txt
    import sys
    sys.path.append("NLU")

Cloning into 'NLU'...
remote: Enumerating objects: 2247, done.[K
remote: Counting objects: 100% (155/155), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 2247 (delta 72), reused 118 (delta 71), pack-reused 2092[K
Receiving objects: 100% (2247/2247), 41.50 MiB | 16.14 MiB/s, done.
Resolving deltas: 100% (1371/1371), done.
Collecting jupyter>=1.0.0 (from -r NLU/requirements.txt (line 7))
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting torch==1.13.1 (from -r NLU/requirements.txt (line 9))
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.14.1 (from -r NLU/requirements.txt (line 10))
  Downloading torchvision-0.14.1-cp310-cp310-manylinux1_x86_64.whl (24.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.2 MB[0m [31m49.3 MB/s[0m eta [36m

In [None]:
import torch.nn as nn
import torch
import numpy as n
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch_model_base import TorchModelBase
from transformers import EncoderDecoderModel
from torch.nn import CrossEntropyLoss

In [None]:
data = pd.read_csv('/content/even_more_labels.csv')

In [None]:
# Longer inputs will lead to memory issues
# At inference we can
data = data[data['Unmasked'].str.len() <= 1100]

In [None]:
src, tgt = data.Unmasked, data.Masked

In [None]:
class RecogsDataset(torch.utils.data.Dataset):
    def __init__(self, enc_tokenizer, dec_tokenizer, X, y=None):
        self.X = [enc_tokenizer.encode(s) for s in X]
        self.y = y
        if y is not None:
            self.y = [dec_tokenizer.encode(s) for s in y]

    @staticmethod
    def collate_fn(batch):
        """Unfortunately, we can't pass the tokenizer in as an argument
        to this method, since it is a static method, so we need to do
        the work of creating the necessary attention masks."""
        def get_pad_and_mask(vals):
            lens = [len(i) for i in vals]
            maxlen = max(lens)
            pad = []
            mask = []
            for ex, length in zip(vals, lens):
                diff = maxlen - length
                pad.append(ex + ([0] * diff))
                mask.append(([1] * length) + ([0] * diff))
            return torch.tensor(pad), torch.tensor(mask)
        batch_elements = list(zip(*batch))
        X = batch_elements[0]
        X_pad, X_mask = get_pad_and_mask(X)
        if len(batch_elements) == 1:
            return X_pad, X_mask
        else:
            y = batch_elements[1]
            y_pad, y_mask = get_pad_and_mask(y)
            # Repeat `y_pad` because our optimizer expects to find
            # labels in final position. These will not be used because
            # Hugging Face will calculate the loss for us.
            return X_pad, X_mask, y_pad, y_mask, y_pad

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return (self.X[idx],)
        else:
            return (self.X[idx], self.y[idx])

class RecogsLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.reduction = "mean"

    def forward(self, outputs, labels):
        """`labels` is ignored, as it was already used to assign a
        value of `outputs.loss`, and that value is all we need."""
        return outputs.loss

In [None]:
class T5RecogsModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.encdec = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

    def forward(self, X_pad, X_mask, y_pad, y_mask, labels=None):
        outputs = self.encdec(
            input_ids=X_pad,
            attention_mask=X_mask,
            decoder_attention_mask=y_mask,
            labels=y_pad)
        return outputs

class T5RecogsModel(TorchModelBase):
    def __init__(self, *args, initialize=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.enc_tokenizer = AutoTokenizer.from_pretrained("t5-large")
        self.dec_tokenizer = self.enc_tokenizer
        self.loss = RecogsLoss()
        self.exists = False

    def build_graph(self):
        return T5RecogsModule()

    def build_dataset(self, X, y=None):
        return RecogsDataset(
            self.enc_tokenizer, self.dec_tokenizer, X, y=y)

    def predict(self, X, device=None):
        device = self.device if device is None else torch.device(device)
        dataset = self.build_dataset(X)
        dataloader = self._build_dataloader(dataset, shuffle=False)
        self.model.to(device)
        self.model.eval()
        preds = []
        with torch.no_grad():
            for batch in dataloader:
                X_pad, X_mask = [x.to(device) for x in batch]
                outputs = self.model.encdec.generate(
                    X_pad,
                    attention_mask=X_mask,
                    max_new_tokens=512,
                    eos_token_id=self.model.encdec.config.eos_token_id)
                results = self.dec_tokenizer.batch_decode(
                    outputs,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False)
                preds += results
        return preds


    def fit(self, *args):
        """
        Generic optimization method.
        Parameters
        ----------
        *args: list of objects
            We assume that the final element of args give the labels
            and all the preceding elements give the system inputs.
            For regular supervised learning, this is like (X, y), but
            we allow for models that might use multiple data structures
            for their inputs.
        Attributes
        ----------
        model: nn.Module or subclass thereof
            Set by `build_graph`. If `warm_start=True`, then this is
            initialized only by the first call to `fit`.
        optimizer: torch.optimizer.Optimizer
            Set by `build_optimizer`. If `warm_start=True`, then this is
            initialized only by the first call to `fit`.
        errors: list of float
            List of errors. If `warm_start=True`, then this is
            initialized only by the first call to `fit`. Thus, where
            `max_iter=5`, if we call `fit` twice with `warm_start=True`,
            then `errors` will end up with 10 floats in it.
        validation_scores: list
            List of scores. This is filled only if `early_stopping=True`.
            If `warm_start=True`, then this is initialized only by the
            first call to `fit`. Thus, where `max_iter=5`, if we call
            `fit` twice with `warm_start=True`, then `validation_scores`
            will end up with 10 floats in it.
        no_improvement_count: int
            Used to control early stopping and convergence. These values
            are controlled by `_update_no_improvement_count_early_stopping`
            or `_update_no_improvement_count_errors`.  If `warm_start=True`,
            then this is initialized only by the first call to `fit`. Thus,
            in that situation, the values could accumulate across calls to
            `fit`.
        best_error: float
           Used to control convergence. Smaller is assumed to be better.
           If `warm_start=True`, then this is initialized only by the first
           call to `fit`. It will be reset by
           `_update_no_improvement_count_errors` depending on how the
           optimization is proceeding.
        best_score: float
           Used to control early stopping. If `warm_start=True`, then this
           is initialized only by the first call to `fit`. It will be reset
           by `_update_no_improvement_count_early_stopping` depending on how
           the optimization is proceeding. Important: we currently assume
           that larger scores are better. As a result, we will not get the
           correct results for, e.g., a scoring function based in
           `mean_squared_error`. See `self.score` for additional details.
        best_parameters: dict
            This is a PyTorch state dict. It is used if and only if
            `early_stopping=True`. In that case, it is updated whenever
            `best_score` is improved numerically. If the early stopping
            criteria are met, then `self.model` is reset to contain these
            parameters before `fit` exits.
        Returns
        -------
        self
        """
        #if self.early_stopping:
        #    args, dev = self._build_validation_split(
        #        *args, validation_fraction=self.validation_fraction)

        # Dataset:
        dataset = self.build_dataset(*args)
        dataloader = self._build_dataloader(dataset, shuffle=self.shuffle_train)

        # Set up parameters needed to use the model. This is a separate
        # function to support using pretrained models for prediction,
        # where it might not be desirable to call `fit`.

        # (william) changed source code to not re-initialize w every fit
        # allows more flexibility in calling fit and evaluating
        if not self.exists:
          self.initialize()

        # Make sure the model is where we want it:
        self.model.to(self.device)

        self.model.train()
        self.optimizer.zero_grad()

        for iteration in range(1, self.max_iter+1):

            epoch_error = 0.0

            for batch_num, batch in enumerate(dataloader, start=1):

                batch = [x.to(self.device) for x in batch]


                X_batch = batch[: -1]
                y_batch = batch[-1]

                batch_preds = self.model(*X_batch)


                err = self.loss(batch_preds, y_batch)


                if self.gradient_accumulation_steps > 1 and \
                  self.loss.reduction == "mean":
                    err /= self.gradient_accumulation_steps

                err.backward()

                epoch_error += err.item()

                if batch_num % self.gradient_accumulation_steps == 0 or \
                  batch_num == len(dataloader):
                    if self.max_grad_norm is not None:
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.max_grad_norm)
                    self.optimizer.step()
                    self.optimizer.zero_grad()

        print(epoch_error)

        return self

    def initialize(self):
        """
        Method called by `fit` to establish core attributes. To use a
        pretrained model without calling `fit`, one can use this
        method.
        """
        if not self.warm_start or not hasattr(self, "model"):
            self.model = self.build_graph()
            # This device move has to happen before the optimizer is built:
            # https://pytorch.org/docs/master/optim.html#constructing-it
            self.model.to(self.device)
            self.optimizer = self.build_optimizer()
            self.errors = []
            self.validation_scores = []
            self.no_improvement_count = 0
            self.best_error = np.inf
            self.best_score = -np.inf
            self.best_parameters = None
        self.exists = True

In [None]:
t5mod = T5RecogsModel()

(…)ace.co/t5-large/resolve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

(…)ce.co/t5-large/resolve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

(…).co/t5-large/resolve/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
t5mod.device = device
# params chosen to not run into cuda memory issues - trained on colab pro
t5mod.max_iter = 2
t5mod.batch_size = 4
t5mod.eta = 0.0001
meta_batch_size = 1500
epochs = 150

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(np.array(src), np.array(tgt), test_size=0.01, random_state=42)

In [None]:
t5mod.model.train()
for i in range(epochs):
  torch.cuda.empty_cache()
  sampled_indices = np.random.choice(len(X_train), meta_batch_size, replace=False)
  X_, y_ = X_train[sampled_indices].copy(), y_train[sampled_indices].copy()
  t5mod.fit(X_, y_)

5.874425394918944
5.395003323275887
4.9227513497680775
4.579181485703884
5.379898926563328
5.347840218575584
5.07857193269956
4.985483060067054
4.922974491335481
4.706941810549324
4.221775900405191
4.786837336503595
3.9659269769545062
4.791292342686575
3.9495467233318777
4.590899238119164
4.306113931139407
4.006084522232413
3.535183412889637
4.0061907892618365
3.691274698699999
3.792027078702631
3.994108163686178
3.8500969029573753
3.4887651160552196
3.597188960184212
3.820908145777139
3.6586402200227894
3.228719833836294


KeyboardInterrupt: ignored

In [None]:
some_str = ['Charlie, Marnie and , Charlie Siang was his name, Siang born -- 12/21/2002, and Marnie Martindale (June 22, 1995) Martindale and Mr. Godfried the baker.  Godfried was not related to Marnie Martindale']

In [None]:
t5mod.model.eval()
with torch.no_grad():
  pred = t5mod.predict(some_str)

In [None]:
pred

['[[FIRST_NAME_1]], [[FIRST_NAME_2]] and , [[FULL_NAME_1]] was his name, [[FIRST_NAME_1]] born -- [[DOB]], and [[FULL_NAME_2]] [[FIRST_NAME_3]] ([[DOB]]) [[FIRST_NAME_2]] and Mr. [[LAST_NAME_3]] the baker. Godfried was not related to [[FULL_NAME_2]]']

In [None]:
with torch.no_grad():
  preds = t5mod.predict(X_test)

num_exact_match, num_total = 0, 0
for i, pred  in enumerate(preds):
  if pred == y_test[i]:
    num_exact_match += 1
  num_total += 1

num_exact_match/num_total

In [None]:
torch.save(t5mod.model, 't5mask_improved_5.pt')