## Datasets
1. Wikipedia dataset: ```https://huggingface.co/datasets/legacy-datasets/wikipedia```
2. BookCorpus dataset: ```https://huggingface.co/datasets/bookcorpus/bookcorpus```

### Steps to be done in the notebook
 1. Train Tokenizer
 2. Train RoBERTa
 3. Finetune RoBERTa for MLM & QA
 4. Get accuracy metrics for MLM & QA

#### 1. Train Tokenizer

We train the tokenizer on the BookCorpus dataset.

In [None]:
from tokenizers import Tokenizer,ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.processors import BertProcessing
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from pathlib import Path
import os

from transformers import GPT2TokenizerFast,PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Special-token strings shared by the tokenizer trainers and the
# transformers tokenizer wrappers built further down.
unk_token, bos_tok, eos_tok = "<unk>", "<sos>", "<eos>"
pad_tok, mask_tok = "<pad>", "<mask>"

# Handed to the trainers as their `special_tokens` argument.
special_tokens = [
    unk_token,
    bos_tok,
    eos_tok,
    pad_tok,
    mask_tok,
]

def prepare_sentencepiece_training(alg):
    """Build an untrained tokenizer, plus its trainer when one is needed.

    Args:
        alg: ``'BPE'`` builds a ``Tokenizer`` around a ``BPE`` model together
            with a ``BpeTrainer``; ``'ByteLevelBPE'`` builds a
            ``ByteLevelBPETokenizer`` and no trainer.

    Returns:
        A ``(tokenizer, trainer)`` tuple. ``trainer`` is ``None`` for
        ``'ByteLevelBPE'``.

    Raises:
        ValueError: if ``alg`` is not one of the two supported names
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if alg not in ('BPE', 'ByteLevelBPE'):
        raise ValueError(
            f"Unsupported tokenizer algorithm {alg!r}; expected 'BPE' or 'ByteLevelBPE'"
        )

    if alg == 'BPE':
        tokenizer = Tokenizer(model=BPE(unk_token=unk_token))
        trainer = BpeTrainer(
            special_tokens=special_tokens,
            vocab_size=30000,
            min_frequency=10,
            show_progress=True,
            # NOTE(review): presumably limits learned tokens to 5 characters,
            # which looks very short for a 30k vocabulary - confirm intended.
            max_token_length=5,
        )
    else:
        tokenizer = ByteLevelBPETokenizer()
        trainer = None

    # NOTE(review): this is applied to both algorithms. For the byte-level
    # tokenizer it presumably replaces the default pre-tokenizer - confirm
    # that this is intended.
    tokenizer.pre_tokenizer = Whitespace()

    return tokenizer, trainer


In [None]:
def train_tokenizer(files,tokenizer_folder = "./tokenizer/",alg="ByteLevelBPE",type=None):
    """Train a tokenizer on text files and save it to ``tokenizer_folder``.

    Args:
        files: paths of the text files to train on.
        tokenizer_folder: output directory; created if missing.
        alg: ``'BPE'`` trains a ``tokenizers.Tokenizer`` and saves it as
            ``vocab.json``; ``'ByteLevelBPE'`` trains a
            ``ByteLevelBPETokenizer``, saves its model files and then wraps it
            in a transformers fast tokenizer saved to the same folder.
        type: only used with ``'ByteLevelBPE'``. ``None`` wraps the trained
            tokenizer in ``PreTrainedTokenizerFast``; ``'GPT2'`` reloads the
            saved model files with ``GPT2TokenizerFast``.

    Raises:
        ValueError: if ``alg`` is unsupported, or if ``type`` is unsupported
            for ``'ByteLevelBPE'``. Raised before anything is written to disk.
    """
    # Validate up front so a typo fails before a long training run starts.
    if alg not in ('BPE', 'ByteLevelBPE'):
        raise ValueError(
            f"Unsupported tokenizer algorithm {alg!r}; expected 'BPE' or 'ByteLevelBPE'"
        )
    if alg == 'ByteLevelBPE' and type not in (None, 'GPT2'):
        raise ValueError(
            f"Unsupported tokenizer type {type!r}; expected None or 'GPT2'"
        )

    os.makedirs(tokenizer_folder, exist_ok=True)

    # Build the tokenizer that matches the requested algorithm (the original
    # always built the byte-level one, which broke the 'BPE' path).
    tokenizer, trainer = prepare_sentencepiece_training(alg)

    if alg == 'BPE':
        tokenizer.train(files, trainer)
        vocab_path = os.path.join(tokenizer_folder, 'vocab.json')
        tokenizer.save(vocab_path)
        print(f"Tokenizer Type Tokenizer\nTokenizer Saved to {tokenizer_folder}")
        print(f"from tokenizers import Tokenizer\ntokenizer = Tokenizer.from_file('{vocab_path}')")
        return

    # alg == 'ByteLevelBPE'
    tokenizer.train(
        files,
        special_tokens=special_tokens,
        vocab_size=30000,
        min_frequency=10,
        show_progress=True,
    )
    tokenizer.save_model(tokenizer_folder)

    # The transformers keyword is `mask_token`; the previous `mask_tok=` keyword
    # did not register the mask token on the wrapper.
    special_token_kwargs = dict(
        pad_token=pad_tok,
        bos_token=bos_tok,
        eos_token=eos_tok,
        unk_token=unk_token,
        mask_token=mask_tok,
    )
    if type is None:
        wrapper_name = "PreTrainedTokenizerFast"
        transformer_tokenizer = PreTrainedTokenizerFast(
            tokenizer_object=tokenizer,
            padding_side="right",
            clean_up_tokenization_spaces=False,
            **special_token_kwargs,
        )
    else:
        wrapper_name = "GPT2TokenizerFast"
        transformer_tokenizer = GPT2TokenizerFast.from_pretrained(
            tokenizer_folder,
            **special_token_kwargs,
        )

    transformer_tokenizer.save_pretrained(tokenizer_folder)
    # The class name is computed beforehand so the f-strings do not nest
    # same-style quotes (a syntax error before Python 3.12).
    print(f"Tokenizer Type '{wrapper_name}'\nTokenizer Saved to {tokenizer_folder}")
    print(f"from transformers import GPT2TokenizerFast,PreTrainedTokenizerFast\ntokenizer = {wrapper_name}.from_pretrained('{tokenizer_folder}')")
            

In [10]:
# Start training.
# The corpus directory can be overridden with the BOOKCORPUS_DIR environment
# variable; the default is the original local path.
PTH = os.environ.get(
    "BOOKCORPUS_DIR",
    "C:\\Users\\Pankaj Deb Roy\\Documents\\DeepLearning\\Dataset\\bookcopous\\bookcorpus",
)
# Sorted so the file order is reproducible across runs and machines;
# sub-directories are skipped so only file paths are handed to the trainer.
files = [
    os.path.join(PTH, entry)
    for entry in sorted(os.listdir(PTH))
    if os.path.isfile(os.path.join(PTH, entry))
]
train_tokenizer(files=files, tokenizer_folder="tokenizer")

Tokenizer Type PreTrainedTokenizerFast
Tokenizer Saved to tokenizer
from transformers import GPT2TokenizerFast,PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer)


### 2. Training RoBERTa

In [2]:
import os
import pandas as pd
import pyarrow.parquet as pq
import csv
import codecs
import torch
import torch.nn as nn
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import GPT2TokenizerFast,PreTrainedTokenizerFast,RobertaTokenizerFast

# from transformers import Trainer, TrainingArguments, EvalPrediction
from transformers import DataCollatorForLanguageModeling
from transformers import get_linear_schedule_with_warmup
import random
import tqdm

from mlm_dataset import MLMDataset,MLMDatasetHF
from trainer import TrainingArgs,Trainer

# Device selection.
# NOTE(review): CUDA_VISIBLE_DEVICES is set to an empty string here, which
# presumably hides every GPU from this process so the check below falls back
# to 'cpu'. The recorded cell output says 'cuda', so this line and the output
# come from different runs - remove the override (or set it to "0") to train
# on the GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device ', device)

  from .autonotebook import tqdm as notebook_tqdm


Device  cuda


#### Dataset creation

In [120]:
# Directory holding the wikitext parquet files. Override with the WIKITEXT_DIR
# environment variable; the default is the original local path.
wikitext_dataset_pth = os.environ.get(
    "WIKITEXT_DIR",
    "C:\\Users\\Pankaj Deb Roy\\Documents\\DeepLearning\\Dataset\\wikitext-103-raw-v1",
)

# List the directory once, in sorted order, so the splits are reproducible.
# CSV files are excluded: create_dataframe writes its output into this same
# directory under names such as 'train_wiki-104.csv', which would otherwise
# match the 'train'/'test' prefixes when this cell is re-run.
_wikitext_entries = sorted(
    entry for entry in os.listdir(wikitext_dataset_pth) if not entry.endswith('.csv')
)
train_files = [entry for entry in _wikitext_entries if entry.startswith('train')]
validataion_files = [entry for entry in _wikitext_entries if entry.startswith('validation')]
test_files = [entry for entry in _wikitext_entries if entry.startswith('test')]

In [170]:
def create_dataframe(files,csv_name):
    """Flatten the ``text`` column of parquet files into a one-column CSV.

    Every value of the ``text`` column is split on newlines. Lines shorter
    than 30 characters or starting with ``'= ='`` are dropped; each remaining
    line becomes one CSV row under a ``text`` header. The CSV is written into
    ``wikitext_dataset_pth`` and the number of rows written is printed.

    Args:
        files: parquet file names, relative to ``wikitext_dataset_pth``.
        csv_name: name of the CSV file to create in ``wikitext_dataset_pth``.
    """
    ENCODING = "utf-8"
    CSV_NAME = os.path.join(wikitext_dataset_pth, csv_name)
    row_count = 0
    # The csv module asks for files opened with newline="" so it controls the
    # line endings itself.
    with open(CSV_NAME, "w", encoding=ENCODING, newline="") as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['text'])
        for file in files:
            parquet_file = pq.ParquetFile(os.path.join(wikitext_dataset_pth, file))
            # Read only the needed column, in default-sized batches, instead of
            # converting one row at a time to pandas.
            for batch in parquet_file.iter_batches(columns=['text']):
                for text in batch.to_pandas()['text']:
                    if not isinstance(text, str):
                        # Null entries carry no lines to export.
                        continue
                    for line in text.split('\n'):
                        # NOTE(review): if heading lines carry a leading space
                        # this prefix test will not match them - verify against
                        # the data.
                        if len(line) < 30 or line.startswith('= ='):
                            continue
                        row_count += 1
                        writer.writerow([line])
    print(f'Total number of rows in {csv_name}: {row_count}')

In [171]:
# Export one filtered CSV per dataset split.
for split_files, split_csv_name in (
    (train_files, 'train_wiki-104.csv'),
    (validataion_files, 'val_wiki-104.csv'),
    (test_files, 'test_wiki-104.csv'),
):
    create_dataframe(split_files, split_csv_name)

Total number of rows in train_wiki-104.csv: 823942
Total number of rows in val_wiki-104.csv: 1777
Total number of rows in test_wiki-104.csv: 2007


In [None]:
import torch

In [29]:
# Load the saved checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the file holds only tensors).
state_dict = torch.load(
    "./mlm_roberta/base_model_e900.pt",
    map_location="cpu",
)

In [31]:
from tqdm import trange, tqdm
from time import sleep
# Progress-bar smoke test: five iterations (5 through 9), one second each.
for step in trange(5, 10):
    sleep(1)

100%|██████████| 5/5 [00:05<00:00,  1.00s/it]
