Get preprocessed 'DontPatronizeMe' dataset

In [None]:
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-gNxTZfDL0aOpzOnxE80M29dUVjSoozn' -O 'train.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-cSiEWP_NbDu7fo_7s8O5P163oKLQcBh' -O 'valid.csv'
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1-13l35-18IYPFSV_36llsJbb7c4Gu2o0' -O 'test.csv'


In [17]:
import pandas as pd
import copy
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, DistilBertConfig
import torch
from torch.utils.data import DataLoader
from torch import nn
from denoising_diffusion_pytorch import GaussianDiffusion

In [2]:
# download pandas data
train_path = './train.csv'
valid_path = './valid.csv'
test_path = './test.csv'

train_df = pd.read_csv(train_path).dropna()
valid_df = pd.read_csv(valid_path).dropna()
test_df = pd.read_csv(test_path).dropna()

In [3]:
# hyperparameters
batch_size = 16
max_length = 256 # max text length

In [11]:
# define dataset TODO: use distilbert embedding as sample, check how nlp use output
class DPMDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, input_df):
        self.tokenizer = tokenizer
        self.texts = input_df['text'].tolist()

    def collate_fn(self, batch):
        # function for batch allocation
        texts = []

        for b in batch:
            texts.append(b)

        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

        encodings['text'] = texts
        return encodings

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

tokenizer = DistilBertTokenizer.from_pretrained("./models/distilbert-base-uncased-local", local_files_only=True)
train_dataset = DPMDataset(tokenizer, train_df)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, collate_fn=train_dataset.collate_fn)
print(tokenizer.vocab_size)

30522


In [18]:
# configuration = DistilBertConfig()
# print(configuration)

# Initializing a model from the configuration
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
print(model.config)


DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}



In [20]:
# TODO: change embedding layer to identity one, input should be already embedded 
model()

TypeError: 'DistilBertForMaskedLM' object is not subscriptable

In [6]:
def diffuse_t(x, alpha_cumprod, t):
  noise = torch.normal(0, 1, x.shape)
  return torch.sqrt(alpha_cumprod[t]) * x + noise * torch.sqrt(1 - alpha_cumprod[t])

def loss(model, x, alpha_cumprod, t, loss_func):
  '''
  model: torch model accept x shape as input
  x: x_0
  alpha_cumprod: bar_alpha list
  loss_func: "l1" or "l2" loss function between x_0 and predicted x_0
  '''
  noised = diffuse_t(x, alpha_cumprod, t)
  return loss_func(noised, x)
  