# Get preprocessed 'DontPatronizeMe' dataset

In [1]:
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-gNxTZfDL0aOpzOnxE80M29dUVjSoozn' -O 'train.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-cSiEWP_NbDu7fo_7s8O5P163oKLQcBh' -O 'valid.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-13l35-18IYPFSV_36llsJbb7c4Gu2o0' -O 'test.csv'
!pip install transformers


--2022-08-19 15:04:57--  https://drive.google.com/uc?export=download&id=1-gNxTZfDL0aOpzOnxE80M29dUVjSoozn
Resolving drive.google.com (drive.google.com)... 173.194.203.100, 173.194.203.138, 173.194.203.139, ...
Connecting to drive.google.com (drive.google.com)|173.194.203.100|:443... connected.
HTTP request sent, awaiting response... ^C
--2022-08-19 15:04:58--  https://drive.google.com/uc?export=download&id=1-cSiEWP_NbDu7fo_7s8O5P163oKLQcBh
Resolving drive.google.com (drive.google.com)... 173.194.203.100, 173.194.203.138, 173.194.203.139, ...
Connecting to drive.google.com (drive.google.com)|173.194.203.100|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-10-9s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/j1lhliqq1hhsvanps0a2iu24hvtg09f7/1660921425000/09836793732558009118/*/1-cSiEWP_NbDu7fo_7s8O5P163oKLQcBh?e=download&uuid=639aab9b-351c-4fdf-b62c-61c6ea811213 [following]
--2022-08-19 15:04:59--  https://doc-10-9s

# Import package, model, dataset

In [2]:
import pandas as pd
import copy
from transformers import (
    DistilBertTokenizer, DistilBertForMaskedLM, DistilBertConfig,
)
import torch
from torch.utils.data import DataLoader
from torch import nn, optim
import tqdm

ModuleNotFoundError: ignored

In [None]:
if torch.cuda.is_available():
  dev = "cuda:0"
else:
  dev = "cpu"
device = torch.device(dev)
print("using device: ", dev)

In [None]:
# read pandas data
train_path = './train.csv'
valid_path = './valid.csv'
test_path = './test.csv'

train_df = pd.read_csv(train_path).dropna()
valid_df = pd.read_csv(valid_path).dropna()
test_df = pd.read_csv(test_path).dropna()

In [None]:
# download model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained("./tokenizers/distilbert-base-uncased-local")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
model.save_pretrained("./models/distilbert-base-uncased-local/")

# Hyperparameters

In [12]:
# hyperparameters
batch_size = 16
max_length = 64 # max text length
learning_rate = 1e-4
epoch_num = 10
beta_min = 0.0001
beta_max = 0.02
step_tot = 2000 # total noise adding steps
sample_size = 1 # number of sample steps in each diffuse sequence
train_embedding = False # if embedding layer is trainable
x_0_prediction = False # if model predicts x_0 or x_{t-1}

# Model, trainer and loss function

In [7]:
from torch.nn.modules import activation
class DistilBertModel(nn.Module):
  def __init__(self, train_embedding=False) -> None:
    super().__init__()

    self.model = DistilBertForMaskedLM.from_pretrained("./models/distilbert-base-uncased-local", local_files_only=True).to(device)
    self.embedding = self.model.get_input_embeddings()
    if not train_embedding:
      self.embedding.requires_grad_(False)
    self.model.set_input_embeddings(nn.Sequential())

  def get_config(self):
    return self.model.config

  def forward(self, x, mask, output_hidden_states=True):
    return self.model(x, mask, output_hidden_states=output_hidden_states)

class EncoderModel(nn.Module):
  def __init__(self, 
               layer_dim=512, 
               nhead=8, 
               activation='gelu',
               dropout=0.1,
               num_layer=6,
               train_embedding=False) -> None:
    super().__init__()

    encoder_layer = nn.TransformerEncoderLayer(
        d_model=layer_dim, 
        nhead=nhead,
        dim_feedforward=2048,
        activation=activation,
        dropout=dropout,
        batch_first=False,
        norm_first=False,
        device=device)
    self.model = nn.TransformerEncoder(
        encoder_layer, 
        num_layers=num_layer,
        norm=None,
        enable_nested_tensor=False)
    self.embedding = nn.Embedding(
        30522,
        layer_dim, 
        padding_idx=None, 
        max_norm=None, 
        norm_type=2.0, 
        scale_grad_by_freq=False, 
        sparse=False, 
        device=device)
    
    def forward(self, x, mask):
      return self.model(x, mask)


# model = DistilBertModel(train_embedding=train_embedding)
model = EncoderModel(train_embedding=train_embedding)
trainer = optim.Adam(model.parameters(), lr=learning_rate)


DistilBertConfig {
  "_name_or_path": "./models/distilbert-base-uncased-local",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}

In [8]:
betas = torch.hstack([torch.zeros(1), torch.linspace(beta_min, beta_max, step_tot)]).to(device)
alphas = 1 - betas
alpha_cumprod = torch.cumprod(alphas[:-1], 0)
def diffuse_t(x, t):
  '''
  x_shape: [batch_size, seq_len, dim]
  t shape: [sample num]

  return shape [batch_size * sample_num, seq_len, dim]
  '''
  batch_size, seq_len, dim = x.shape
  sample_shape = (sample_size, *(1, ) * len(x.shape))

  noise = torch.normal(0, 1, x.shape).to(device)
  mean = torch.sqrt(alpha_cumprod[t].reshape(sample_shape)) * x 
  epsilon = noise * torch.sqrt(1 - alpha_cumprod[t]).reshape(sample_shape)
  return (mean + epsilon).reshape((sample_size * batch_size, seq_len, dim))

def generate_diffuse_pair(x_0, repeat_shape, t, t_next=-1):
  '''
  x_0 shape: [batch_size, seq_len, dim]
  t shape: [sample_num]
  repeat shape: (sample_num, 1, 1, ...)
  
  return (net input, net target)
    shape [batch_size * sample_num, seq_len, dim]
  '''
  if t_next == -1:
    # predict x_0
    return (diffuse_t(x_0, t), x_0.repeat(repeat_shape))

  # predict x_{t_next}
  return (diffuse_t(x_0, t), diffuse_t(x_0, t_next))

def loss(model, x_input, x_tgt, mask, loss_func):
  x_hat = model(x_input, mask, output_hidden_states=True)[1][0]
  return loss_func(x_hat, x_tgt)

trainer = optim.Adam(model.parameters(), lr=learning_rate)


# Define dataset

In [9]:
# define dataset 
class DPMDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, input_df):
        self.tokenizer = tokenizer
        self.texts = input_df['text'].tolist()

    def collate_fn(self, batch):
        # function for batch allocation
        texts = []

        for b in batch:
            texts.append(b)

        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

        return {"input_ids": encodings["input_ids"].to(device), "attention_mask": encodings["attention_mask"].to(device)}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

tokenizer = DistilBertTokenizer.from_pretrained("./tokenizers/distilbert-base-uncased-local", local_files_only=True)
train_dataset = DPMDataset(tokenizer, train_df)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=train_dataset.collate_fn)


# Training

In [13]:
# training, 
model.train()
for epoch in range(epoch_num):
  acc_loss = 0
  for x in tqdm.tqdm(train_loader):
    x_0 = model.embedding(x["input_ids"])
    repeat_shape = (sample_size, *(1, ) * (len(x_0.shape) - 1))
    t = torch.randint(0, step_tot, repeat_shape, device=device)
    x_input, x_tgt = generate_diffuse_pair(x_0, repeat_shape, t)

    trainer.zero_grad()
    l = loss(model, x_input, x_tgt, x["attention_mask"].repeat(repeat_shape), nn.L1Loss())
    l.backward()
    trainer.step()

    acc_loss += l

  print(f"epoch {epoch} average loss: {acc_loss / (step_tot // 30)}") # TODO: change loss 


100%|██████████| 419/419 [00:40<00:00, 10.44it/s]


epoch 0 average loss: 0.2694462537765503


100%|██████████| 419/419 [00:40<00:00, 10.46it/s]


epoch 1 average loss: 0.5360242128372192


100%|██████████| 419/419 [00:39<00:00, 10.55it/s]


epoch 2 average loss: 0.8006168603897095


100%|██████████| 419/419 [00:39<00:00, 10.49it/s]


epoch 3 average loss: 1.0622565746307373


100%|██████████| 419/419 [00:39<00:00, 10.50it/s]


epoch 4 average loss: 1.3215457201004028


100%|██████████| 419/419 [00:39<00:00, 10.48it/s]


epoch 5 average loss: 1.5792444944381714


100%|██████████| 419/419 [00:39<00:00, 10.49it/s]


epoch 6 average loss: 1.8352642059326172


100%|██████████| 419/419 [00:39<00:00, 10.51it/s]


epoch 7 average loss: 2.0897789001464844


100%|██████████| 419/419 [00:39<00:00, 10.50it/s]


epoch 8 average loss: 2.3427443504333496


100%|██████████| 419/419 [00:39<00:00, 10.49it/s]


epoch 9 average loss: 2.5948922634124756


In [None]:
x["attention_mask"].shape

torch.Size([16, 98])

In [None]:
# trial on inference
for text in train_loader:
  break
repeat_shape = (sample_size, *(1, ) * (len(x_0.shape) - 1))
t = torch.randint(0, step_tot, repeat_shape, device=device)
noised_text = diffuse_t(model.embedding(text["input_ids"]), t)
print("noise added")
restored = model(noised_text, text["attention_mask"].repeat(repeat_shape), output_hidden_states=True)
print("inference finished")
print("origin text: ", train_df.loc[0]["text"])
print("inferred: ", tokenizer.decode(torch.softmax(restored[0][0], dim=-1).argmax(dim=-1)))


noise added
inference finished
origin text:  Critics have even taken to dobbing in Katrina Bungard to National Party leader Bill English when they see her sign-written car bearing her name and photo parked in disabled parks .
inferred:  .................. -... ~... - ~............ -......................... ~............ ~............... -....


In [None]:
text

{'input_ids': tensor([[  101,  4401,  2031,  ...,     0,     0,     0],
         [  101, 13573,  1998,  ...,     0,     0,     0],
         [  101,  2720,  8716,  ...,     0,     0,     0],
         ...,
         [  101,  1000,  2012,  ...,     0,     0,     0],
         [  101,  1996, 20871,  ...,     0,     0,     0],
         [  101,  1000,  1000,  ...,     0,     0,     0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [None]:
# save model
torch.save({"net": model.to(torch.device("cpu"))}, "model.pickle")