Link to the tutorial this notebook follows: https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891

In [1]:
import os
from pathlib import Path
import torch
import re
import random
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import tqdm
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam


In [2]:
# Cap, in whitespace-separated words, applied to each utterance when the
# question/answer pairs are built; also passed to BERTEmbedding further down.
MAX_LEN = 64
dataset_path = 'D://data_phd_code_from_scatch//datasets//cornell movie-dialogs corpus'

# Locations of the two Cornell corpus files read by this notebook.
corpus_movie_conv = f'{dataset_path}/movie_conversations.txt'
corpus_movie_lines = f'{dataset_path}/movie_lines.txt'

# Read both files fully into memory: one list entry per line, with the
# trailing newline kept.
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as conv_file:
    conv = list(conv_file)
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as lines_file:
    lines = list(lines_file)

In [3]:
import ast

### splitting text using special lines
# Every record of movie_lines.txt is " +++$+++ "-separated. Map its first
# field (the line id) to its last field (the utterance text, newline included).
lines_dic = {}
for line in lines:
    objects = line.split(" +++$+++ ")
    lines_dic[objects[0]] = objects[-1]

### generate question answer pairs
# The last field of each conversation record is a Python list literal holding
# the ids of the lines that make up the conversation, in order.
pairs = []
for con in conv:
    # SECURITY: this used to be eval(), which would execute arbitrary code
    # found in the data file. literal_eval accepts only Python literals and
    # yields the same list for well-formed records.
    ids = ast.literal_eval(con.split(" +++$+++ ")[-1])

    # Pair every utterance with the one that directly follows it.
    for first_id, second_id in zip(ids, ids[1:]):
        first = lines_dic[first_id]
        second = lines_dic[second_id]

        # split() without arguments already discards leading/trailing
        # whitespace (including the newline); keep at most MAX_LEN words.
        pairs.append([
            ' '.join(first.split()[:MAX_LEN]),
            ' '.join(second.split()[:MAX_LEN]),
        ])

# train tokenizer

In [None]:
# WordPiece tokenization
### save data as txt file
# Dump the first utterance of every pair into text files of at most 10K lines
# each (text_0.txt, text_1.txt, ...) under save_path_data_processed.

text_data = []
file_count = 0
save_path_data_processed = 'D:/data_phd_code_from_scatch/datasets/data_processed/movie-dialogs'

# open(..., 'w') does not create missing directories, so make sure it exists.
os.makedirs(save_path_data_processed, exist_ok=True)

for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)
    # once we hit the 10K mark, save to file
    if len(text_data) == 10000:
        chunk_path = f'{save_path_data_processed}/text_{file_count}.txt'
        with open(chunk_path, 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# Write whatever is left over. Previously the last partial chunk (fewer than
# 10K samples) was silently dropped and never reached disk.
if text_data:
    chunk_path = f'{save_path_data_processed}/text_{file_count}.txt'
    with open(chunk_path, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))
    text_data = []
    file_count += 1

In [None]:
# Gather the path of every .txt file found under save_path_data_processed,
# sub-directories included.
txt_files = []
for folder, _subdirs, names in os.walk(save_path_data_processed):
    txt_files.extend(
        os.path.join(folder, name) for name in names if name.endswith('.txt')
    )

In [None]:
# Train the tokenizer on the text files collected in txt_files, using the
# project-local tokenizer_train module.
# NOTE(review): 'bert_toy' is presumably the name prefix of the saved vocab
# (a later cell loads 'bert_tokenizer/bert_toy-vocab.txt') — confirm in
# tokenizer_train.train_bert_tokenizer.
import tokenizer_train
tokenizer = tokenizer_train.train_bert_tokenizer(txt_files, 'bert_toy')

# start

In [4]:
# Build the tokenizer from a vocabulary file on disk; this rebinds `tokenizer`.
from transformers import BertTokenizer
# Relative path: resolved against the current working directory.
vocab_file_path = 'bert_tokenizer/bert_toy-vocab.txt'
tokenizer = BertTokenizer(vocab_file_path)

In [None]:
# Quick sanity check of the tokenizer: encode one sentence (special tokens
# included), decode the result back, and print both for visual inspection.
encoded_text = tokenizer.encode("Hello, how are you?", add_special_tokens=True)
decoded_text = tokenizer.decode(encoded_text)

print(f"Encoded Text: {encoded_text}")
print(f"Decoded Text: {decoded_text}")

# load dataset

In [5]:
from dataset_prapare import BERTDataset

# Wrap the question/answer pairs in the project's BERTDataset. seq_len reuses
# MAX_LEN instead of repeating the literal 64, so the dataset's sequence length
# cannot silently drift from the value handed to BERTEmbedding.
train_data = BERTDataset(pairs, tokenizer, seq_len=MAX_LEN)

# Shuffled batches of 32 samples; pin_memory=True is forwarded to DataLoader.
train_loader = DataLoader(
    train_data, batch_size=32, shuffle=True, pin_memory=True)

In [8]:
# Pull one batch from the loader; sample_data is reused by later cells.
sample_data = next(iter(train_loader))
# Print a single, randomly chosen dataset item (not a batch) for inspection.
print(train_data[random.randrange(len(train_data))])

{'bert_input': tensor([    1,     3,     3,     3,     3,   417,     3,     3,   146, 10230,
            3,   153,   300,   179,    17,    17,    17,     2,   182,    34,
           17,    17,    17,     2,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'bert_label': tensor([   0,  255,   11,   58,  220,    0,  172,  491,    0,  230, 1231,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]), 'segment_label': tensor([1, 1, 

# pretrain

In [17]:
# Smoke test of the project's embedding layer on the sampled batch.
from models import BERTEmbedding
# Embedding width; it shows up as the last dimension of the output shape below.
d_model=768
print(sample_data['bert_input'].shape)
# NOTE(review): the third positional argument (MAX_LEN) is presumably the
# sequence length expected by BERTEmbedding — confirm against models.py.
bert_embed = BERTEmbedding(tokenizer.vocab_size, d_model, MAX_LEN, dropout = 0.1)
# Feed token ids and segment labels through the layer and show the result's shape.
bert_embed(sample_data['bert_input'], sample_data['segment_label']).shape

torch.Size([32, 64])


torch.Size([32, 64, 768])

In [16]:
# Bare expression: the notebook echoes the sampled batch as the cell output.
sample_data

{'bert_input': tensor([[    1,  3902,    17,  ...,     0,     0,     0],
         [    1,   302,   234,  ...,     0,     0,     0],
         [    1,     3,   308,  ...,    15,   395,    17],
         ...,
         [    1,   194,   255,  ...,     0,     0,     0],
         [    1,   514,   153,  ...,     0,     0,     0],
         [    1, 19782,   107,  ...,     0,     0,     0]]),
 'bert_label': tensor([[  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0, 248,   0,  ...,   0,   0,   0],
         ...,
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0]]),
 'segment_label': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 2, 2, 2],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'is_next': tensor([0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,