Get preprocessed 'DontPatronizeMe' dataset

In [None]:
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-gNxTZfDL0aOpzOnxE80M29dUVjSoozn' -O 'train.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-cSiEWP_NbDu7fo_7s8O5P163oKLQcBh' -O 'valid.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-13l35-18IYPFSV_36llsJbb7c4Gu2o0' -O 'test.csv'
!pip install transformers


In [21]:
import pandas as pd
import copy
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, DistilBertConfig
import torch
from torch.utils.data import DataLoader
from torch import nn, optim
import tqdm

In [5]:
if torch.cuda.is_available():
  dev = "cuda:0"
else:
  dev = "cpu"
device = torch.device(dev)
print("using device: ", dev)

using device:  cuda:0


In [8]:
# download pandas data
train_path = './train.csv'
valid_path = './valid.csv'
test_path = './test.csv'

train_df = pd.read_csv(train_path).dropna()
valid_df = pd.read_csv(valid_path).dropna()
test_df = pd.read_csv(test_path).dropna()

In [None]:
# download model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained("./tokenizers/distilbert-base-uncased-local")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
model.save_pretrained("./models/distilbert-base-uncased-local/")

Hyperparameters

In [10]:
# hyperparameters
batch_size = 16
max_length = 256 # max text length
learning_rate = 0.01
beta_min = 0.0001
beta_max = 0.02
step_tot = 1000

Model, trainer and loss function

In [11]:
# configuration = DistilBertConfig()
# print(configuration)
betas = torch.linspace(beta_min, beta_max, step_tot).to(device)
alphas = 1 - betas
alpha_cumprod = torch.cumprod(alphas[:-1], 0)
def diffuse_t(x, t):
  noise = torch.normal(0, 1, x.shape).to(device)
  return torch.sqrt(alpha_cumprod[t]) * x + noise * torch.sqrt(1 - alpha_cumprod[t])

def loss(model, x, mask, t, loss_func):
  '''
  model: torch model accept x shape as input
  x: x_0
  alpha_cumprod: bar_alpha list
  loss_func: "l1" or "l2" loss function between x_0 and predicted x_0
  '''
  noised = diffuse_t(x, t)
  x_0_hat = model(input_ids=noised, attention_mask=mask, output_hidden_states=True)[1][0]
  return loss_func(x_0_hat, x)


# Initializing a model from the configuration
model = DistilBertForMaskedLM.from_pretrained("./models/distilbert-base-uncased-local", local_files_only=True).to(device)
embedding = model.get_input_embeddings().requires_grad_(False)
model.set_input_embeddings(nn.Sequential())
print(model.config)

trainer = optim.Adam(model.parameters(), lr=learning_rate)


DistilBertConfig {
  "_name_or_path": "./models/distilbert-base-uncased-local",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}



Define dataset

In [16]:
# define dataset
class DPMDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, input_df, embedding):
        self.tokenizer = tokenizer
        self.texts = input_df['text'].tolist()
        self.embedding = embedding

    def collate_fn(self, batch):
        # function for batch allocation
        texts = []

        for b in batch:
            texts.append(b)

        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

        return {"embeddings": self.embedding(encodings["input_ids"].to(device)), "attention_mask": encodings["attention_mask"].to(device)}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

tokenizer = DistilBertTokenizer.from_pretrained("./tokenizers/distilbert-base-uncased-local", local_files_only=True)
train_dataset = DPMDataset(tokenizer, train_df, embedding)
# train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=train_dataset.collate_fn)
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size, collate_fn=train_dataset.collate_fn)


Training

In [23]:
# training
model.train()
for x in tqdm.tqdm(train_loader):
  acc_loss = 0
  for t in range(1, step_tot + 1, 30):
    trainer.zero_grad()
    l = loss(model, x["embeddings"], x["attention_mask"], t, nn.L1Loss())
    l.backward()
    trainer.step()

    acc_loss += l
  print(f"average loss: {acc_loss / (step_tot // 30)}")


  0%|          | 1/419 [00:02<17:07,  2.46s/it]

average loss: 0.024699365720152855


  0%|          | 2/419 [00:08<31:53,  4.59s/it]

average loss: 0.01556004211306572


  1%|          | 3/419 [00:11<26:06,  3.77s/it]

average loss: 0.023339617997407913


  1%|          | 4/419 [00:14<23:07,  3.34s/it]

average loss: 0.020251471549272537


  1%|          | 5/419 [00:16<20:56,  3.04s/it]

average loss: 0.024752242490649223


  1%|▏         | 6/419 [00:20<22:53,  3.32s/it]

average loss: 0.021114397794008255


  2%|▏         | 7/419 [00:22<20:22,  2.97s/it]

average loss: 0.023092299699783325


  2%|▏         | 8/419 [00:25<20:18,  2.97s/it]

average loss: 0.02235650271177292


  2%|▏         | 9/419 [00:30<23:28,  3.44s/it]

average loss: 0.01891954056918621


  2%|▏         | 10/419 [00:32<21:26,  3.14s/it]

average loss: 0.02384861744940281


  3%|▎         | 11/419 [00:35<20:04,  2.95s/it]

average loss: 0.023961929604411125


  3%|▎         | 12/419 [00:38<21:29,  3.17s/it]

average loss: 0.021794741973280907


  3%|▎         | 13/419 [00:43<25:39,  3.79s/it]

average loss: 0.016351614147424698


  3%|▎         | 14/419 [00:47<24:08,  3.58s/it]

average loss: 0.01932598277926445


  4%|▎         | 15/419 [00:50<24:39,  3.66s/it]

average loss: 0.019169514998793602


  4%|▍         | 16/419 [00:54<24:28,  3.64s/it]

average loss: 0.018846530467271805


  4%|▍         | 17/419 [00:59<26:27,  3.95s/it]

average loss: 0.018185701221227646


  4%|▍         | 18/419 [01:01<22:16,  3.33s/it]

average loss: 0.027218306437134743


  5%|▍         | 19/419 [01:05<24:45,  3.71s/it]

average loss: 0.019912676885724068


  5%|▍         | 19/419 [01:07<23:39,  3.55s/it]


KeyboardInterrupt: ignored

In [19]:
# trial on inference
for text in train_loader:
  break
print("text fetched", text["embeddings"].shape)
noised_text = diffuse_t(text["embeddings"], 100)
print("noise added")
restored = model(noised_text, text["attention_mask"], output_hidden_states=True)
print("inference finished")
print("origin text: ", train_df.loc[0]["text"])
print("inferred: ", tokenizer.decode(torch.softmax(restored[0][0], dim=-1).argmax(dim=-1)))


text fetched torch.Size([16, 98, 768])
noise added
inference finished
origin text:  Critics have even taken to dobbing in Katrina Bungard to National Party leader Bill English when they see her sign-written car bearing her name and photo parked in disabled parks .
inferred:  .... times.... -. -..... -.. - -.. -... - - - ordinary. times times. - -. -. -. -... - times - - -...... times - - - - -. - ordinary.uary -.. times - -. - - - -... ordinary -. amor. - times ordinary - -.. ordinary -. times ordinary -. -...


In [None]:
# save model
torch.save({"net": model.to(torch.device("cpu"))}, "model.pickle")