Get preprocessed 'DontPatronizeMe' dataset

In [None]:
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-gNxTZfDL0aOpzOnxE80M29dUVjSoozn' -O 'train.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-cSiEWP_NbDu7fo_7s8O5P163oKLQcBh' -O 'valid.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-13l35-18IYPFSV_36llsJbb7c4Gu2o0' -O 'test.csv'
!pip install transformers


In [1]:
import pandas as pd
import copy
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, DistilBertConfig
import torch
from torch.utils.data import DataLoader
from torch import nn, optim
import tqdm

In [2]:
if torch.cuda.is_available():
  dev = "cuda:0"
else:
  dev = "cpu"
device = torch.device(dev)
print("using device: ", dev)

using device:  cpu


In [3]:
# read pandas data
train_path = './train.csv'
valid_path = './valid.csv'
test_path = './test.csv'

train_df = pd.read_csv(train_path).dropna()
valid_df = pd.read_csv(valid_path).dropna()
test_df = pd.read_csv(test_path).dropna()

In [None]:
# download model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained("./tokenizers/distilbert-base-uncased-local")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
model.save_pretrained("./models/distilbert-base-uncased-local/")

Hyperparameters

In [4]:
# hyperparameters
batch_size = 16
max_length = 256 # max text length
learning_rate = 0.01
beta_min = 0.0001
beta_max = 0.02
step_tot = 1000 # total noise adding steps
step_size = 30 # interval between adjacent noise sample
train_embedding = False # if embedding layer is trainable
x_0_prediction = False # if model predicts x_0 or x_{t-1}

Model, trainer and loss function

In [20]:
class Model(nn.Module):
  def __init__(self, train_embedding=False) -> None:
    super().__init__()

    self.model = DistilBertForMaskedLM.from_pretrained("./models/distilbert-base-uncased-local", local_files_only=True).to(device)
    self.embedding = self.model.get_input_embeddings()
    if not train_embedding:
      self.embedding.requires_grad_(False)
    self.model.set_input_embeddings(nn.Sequential())

  def get_config(self):
    return self.model.config

  def forward(self, x, mask, output_hidden_states=True):
    return self.model(x, mask, output_hidden_states=output_hidden_states)

model = Model(train_embedding=train_embedding)
trainer = optim.Adam(model.parameters(), lr=learning_rate)

model.get_config()

DistilBertConfig {
  "_name_or_path": "./models/distilbert-base-uncased-local",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}

In [21]:
betas = torch.hstack([torch.zeros(1), torch.linspace(beta_min, beta_max, step_tot)]).to(device)
alphas = 1 - betas
alpha_cumprod = torch.cumprod(alphas[:-1], 0)
def diffuse_t(x, t):
  noise = torch.normal(0, 1, x.shape).to(device)
  return torch.sqrt(alpha_cumprod[t]) * x + noise * torch.sqrt(1 - alpha_cumprod[t])

def loss(model, seq, mask, loss_func, t, t_next=-1):
  '''
  model: torch model accept x shape as input
  x: x_0
  alpha_cumprod: bar_alpha list
  loss_func: "l1" or "l2" loss function between x_0 and predicted x_0
  '''
  x_0 = model.embedding(seq)
  noised = diffuse_t(x_0, t)
  x_hat = model(noised, mask, output_hidden_states=True)[1][0]

  if t_next == -1:
    # predict x_0
    return loss_func(x_hat, x_0)
  
  # predict x_{t_next}
  x_t_next = diffuse_t(x_0, t_next)
  return loss_func(x_hat, x_t_next)


trainer = optim.Adam(model.parameters(), lr=learning_rate)


Define dataset

In [22]:
# define dataset 
class DPMDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, input_df):
        self.tokenizer = tokenizer
        self.texts = input_df['text'].tolist()

    def collate_fn(self, batch):
        # function for batch allocation
        texts = []

        for b in batch:
            texts.append(b)

        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

        return {"input_ids": encodings["input_ids"].to(device), "attention_mask": encodings["attention_mask"].to(device)}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

tokenizer = DistilBertTokenizer.from_pretrained("./tokenizers/distilbert-base-uncased-local", local_files_only=True)
train_dataset = DPMDataset(tokenizer, train_df)
# train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=train_dataset.collate_fn)
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size, collate_fn=train_dataset.collate_fn)


Training

In [23]:
# training, 
model.train()
for x in tqdm.tqdm(train_loader):
  acc_loss = 0
  for t in range(1, step_tot + 1, step_size):
    trainer.zero_grad()
    l = loss(model, x["input_ids"], x["attention_mask"], nn.L1Loss(), t, max(t - 30, 0))
    l.backward()
    trainer.step()

    acc_loss += l
  print(f"average loss: {acc_loss / (step_tot // 30)}")


[Parameter containing:
tensor([[-0.0166, -0.0666, -0.0163,  ..., -0.0200, -0.0514, -0.0264],
        [-0.0132, -0.0673, -0.0161,  ..., -0.0227, -0.0554, -0.0260],
        [-0.0176, -0.0709, -0.0144,  ..., -0.0246, -0.0596, -0.0232],
        ...,
        [-0.0231, -0.0588, -0.0105,  ..., -0.0195, -0.0262, -0.0212],
        [-0.0490, -0.0561, -0.0047,  ..., -0.0107, -0.0180, -0.0219],
        [-0.0065, -0.0915, -0.0025,  ..., -0.0151, -0.0504,  0.0460]])]


  0%|          | 0/419 [00:04<?, ?it/s]

average loss: 0.012241187505424023
[Parameter containing:
tensor([[-0.0166, -0.0666, -0.0163,  ..., -0.0200, -0.0514, -0.0264],
        [-0.0132, -0.0673, -0.0161,  ..., -0.0227, -0.0554, -0.0260],
        [-0.0176, -0.0709, -0.0144,  ..., -0.0246, -0.0596, -0.0232],
        ...,
        [-0.0231, -0.0588, -0.0105,  ..., -0.0195, -0.0262, -0.0212],
        [-0.0490, -0.0561, -0.0047,  ..., -0.0107, -0.0180, -0.0219],
        [-0.0065, -0.0915, -0.0025,  ..., -0.0151, -0.0504,  0.0460]])]





In [19]:
# trial on inference
for text in train_loader:
  break
print("text fetched", text["embeddings"].shape)
noised_text = diffuse_t(text["embeddings"], 100)
print("noise added")
restored = model(noised_text, text["attention_mask"], output_hidden_states=True)
print("inference finished")
print("origin text: ", train_df.loc[0]["text"])
print("inferred: ", tokenizer.decode(torch.softmax(restored[0][0], dim=-1).argmax(dim=-1)))


text fetched torch.Size([16, 98, 768])
noise added
inference finished
origin text:  Critics have even taken to dobbing in Katrina Bungard to National Party leader Bill English when they see her sign-written car bearing her name and photo parked in disabled parks .
inferred:  .... times.... -. -..... -.. - -.. -... - - - ordinary. times times. - -. -. -. -... - times - - -...... times - - - - -. - ordinary.uary -.. times - -. - - - -... ordinary -. amor. - times ordinary - -.. ordinary -. times ordinary -. -...


In [None]:
# save model
torch.save({"net": model.to(torch.device("cpu"))}, "model.pickle")