<a href="https://colab.research.google.com/github/vpysmennyi/machinelearning-learning/blob/main/course_gpt2/Untitled20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.8-cp37-cp37m-linux_x86_64.whl
!pip install transformers

In [4]:
import os
import time
import datetime
import json

import pandas as pd
import numpy as np
import random

import torch
from torch import nn

import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils

from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, random_split

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup

# Set to True when running in Google Colab. When True, the script further
# down mounts Google Drive and reads its JSON config from there instead of
# from ./config.json.
in_colab = True


class ArxivDataset(Dataset):
    """Pre-tokenized text dataset yielding ``(input_ids, attention_mask)`` pairs.

    Every text in ``data_list`` is passed through ``tokenizer`` once, at
    construction time, with truncation and ``'max_length'`` padding enabled.
    The resulting ``'input_ids'`` and ``'attention_mask'`` entries are stored
    as tensors.

    Args:
        data_list: Iterable of texts to tokenize.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries for a single text.
        max_length: Maximum length forwarded to the tokenizer.
    """

    def __init__(self, data_list, tokenizer, max_length=768):
        self.tokenizer = tokenizer
        self.input_ids, self.attention_msk = [], []

        # (key in the tokenizer output, list that collects its tensors)
        targets = (
            ('input_ids', self.input_ids),
            ('attention_mask', self.attention_msk),
        )

        for text in data_list:
            encoded = tokenizer(
                text,
                truncation=True,
                max_length=max_length,
                padding='max_length',
            )
            for key, store in targets:
                store.append(torch.tensor(encoded[key]))

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attention_msk[idx]
        return ids, mask


class ArxivAbstractGen():
    """Fine-tunes a causal language model on arXiv abstracts using PyTorch/XLA.

    The class reads several module-level names defined elsewhere in this
    file: ``config`` (settings dict loaded from JSON), ``abstracts`` (the
    texts to tokenize) and ``format_time``.

    Args:
        model: Model to train; it is called with ``labels`` and
            ``attention_mask`` and must return the loss as its first output.
        tokenizer: Tokenizer used to encode the abstracts and decode samples.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

        self.dataset = ArxivDataset(abstracts, tokenizer, max_length=config['encoding_max_length'])

        self.epochs = config['num_epochs']
        self.learning_rate = config['lr']
        self.warmup_steps = config['warmup_steps']
        self.epsilon = config['epsilon']
        self.batch_size = config['train_batch_size']

        # Filled in by prepare_data_loader() once the train/val split is made.
        self.train_ds_len = 0

    def get_model(self):
        """Return the wrapped model."""
        return self.model

    def prepare_data_loader(self, ds):
        """Split ``ds`` into train/validation parts and wrap them in loaders.

        The split ratio comes from ``config['train_size_percent']``. Both
        loaders use a ``DistributedSampler`` keyed on the XLA world size and
        ordinal; only the training sampler shuffles.

        Args:
            ds: Dataset to split.

        Returns:
            Tuple ``(train_dl, val_dl)`` of ``DataLoader`` objects.
        """
        def _get_distributed_sampler(sliced_ds, shuffle=True):
            return DistributedSampler(
                sliced_ds,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal(),
                shuffle=shuffle)

        train_size = int(len(ds) * config['train_size_percent'] / 100)
        val_size = len(ds) - train_size

        train_ds, val_ds = random_split(ds, [train_size, val_size])

        self.train_ds_len = len(train_ds)

        train_dl = DataLoader(train_ds,
                              sampler=_get_distributed_sampler(train_ds, shuffle=True),
                              batch_size=self.batch_size)
        val_dl = DataLoader(val_ds,
                            sampler=_get_distributed_sampler(val_ds, shuffle=False),
                            batch_size=self.batch_size)

        return train_dl, val_dl

    def model_save(self):
        """Move the model to CPU and save it and the tokenizer to ``config['save_dir']``."""
        xm.master_print('saving the model')
        self.model.to('cpu')
        # Unwrap a distributed/parallel wrapper if one is present.
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(config['save_dir'])
        # Save the tokenizer this trainer was built with, not whatever the
        # module-level ``tokenizer`` name happens to point at.
        self.tokenizer.save_pretrained(config['save_dir'])
        xm.master_print('saved')

    def print_train_stats(self, stats):
        """Print per-epoch statistics as a table indexed by epoch.

        Args:
            stats: List of dicts, each containing at least an ``'epoch'`` key.
        """
        if not stats:
            # Nothing was recorded (e.g. zero epochs); set_index('epoch')
            # would raise KeyError on an empty frame.
            xm.master_print('No training statistics to report')
            return

        # Use the fully qualified option name; the bare 'precision' shortcut
        # is not accepted by newer pandas releases.
        pd.set_option('display.precision', 2)
        stat = pd.DataFrame(data=stats).set_index('epoch')
        xm.master_print(stat)

    def generate_sample(self, top_k=50, max_l=200, top_p=0.92, num_seq=1):
        """Sample sequences from the model on CPU and print them.

        Generation is seeded with a randomly chosen token id (passed as
        ``bos_token_id``), so each call starts from a different token.

        Args:
            top_k: Top-k sampling cutoff.
            max_l: Maximum length of each generated sequence.
            top_p: Nucleus sampling probability mass.
            num_seq: Number of sequences to generate.
        """
        xm.master_print('Generating sample:')
        xm.rendezvous('generate smpl')
        # moving to cpu for generation
        self.model.to('cpu')
        # Make sure dropout is disabled regardless of the mode the caller left
        # the model in.
        self.model.eval()
        sample_output = self.model.generate(bos_token_id=random.randint(1, 30000),
                                            do_sample=True,
                                            top_k=top_k,
                                            max_length=max_l,
                                            top_p=top_p,
                                            num_return_sequences=num_seq)
        for i, sample in enumerate(sample_output):
            xm.master_print(f'{i} : {self.tokenizer.decode(sample, skip_special_tokens=True)}')

    def _batch_loss(self, batch, tpu_dev):
        """Run one forward pass on ``batch`` and return the loss tensor."""
        b_input_ids = batch[0].to(tpu_dev)
        b_att_mask = batch[1].to(tpu_dev)

        # The input ids double as the labels.
        # NOTE(review): padded positions are included in the labels, so they
        # appear to contribute to the loss - confirm this is intended.
        output = self.model(b_input_ids,
                            labels=b_input_ids,
                            attention_mask=b_att_mask,
                            token_type_ids=None)
        return output[0]

    @staticmethod
    def _mean_loss(losses):
        """Return the mean of ``losses``, or NaN (without a warning) if empty."""
        if not losses:
            return float('nan')
        return np.array(losses).mean()

    def run_training(self, tpu_dev):
        """Train and validate the model for ``self.epochs`` epochs on ``tpu_dev``.

        After every epoch the model state dict is written to ``model_e.pt``
        inside ``config['save_dir']`` (overwriting the previous epoch's file).
        Per-epoch statistics are printed once training has finished.

        Args:
            tpu_dev: XLA device to train on.
        """
        # get data loader sets
        train_dl, val_dl = self.prepare_data_loader(self.dataset)

        self.model.to(tpu_dev)

        opt = AdamW(self.model.parameters(),
                    lr=self.learning_rate * xm.xrt_world_size(),
                    eps=self.epsilon)

        # The loader length already accounts for per-replica sharding and the
        # final partial batch, so it gives the exact optimizer step count.
        total_steps = len(train_dl) * self.epochs

        scheduler = get_linear_schedule_with_warmup(opt,
                                                    num_warmup_steps=self.warmup_steps,
                                                    num_training_steps=total_steps)

        t0 = time.time()
        xm.master_print(f'num_train_steps = {total_steps}, TPU cores={xm.xrt_world_size()}')

        train_stats = []

        # NOTE: config['mid_sample_enable'] / config['sample_every_steps'] are
        # not acted on here; mid-training sampling is not implemented.
        for epoch in range(self.epochs):
            t0_epoch = time.time()

            # Without set_epoch the shuffling sampler yields the same order
            # in every epoch.
            train_dl.sampler.set_epoch(epoch)

            p_train_dl = pl.ParallelLoader(train_dl, [tpu_dev])
            p_val_dl = pl.ParallelLoader(val_dl, [tpu_dev])

            ############ TRAINING
            train_losses = []
            self.model.train()
            for batch in p_train_dl.per_device_loader(tpu_dev):
                opt.zero_grad()

                loss = self._batch_loss(batch, tpu_dev)
                train_losses.append(loss.item())

                loss.backward()

                # need to use this for parallelism
                xm.optimizer_step(opt)

                scheduler.step()

            train_time = format_time(time.time() - t0_epoch)
            epoch_loss = self._mean_loss(train_losses)

            # Test the list, not the mean: a NaN mean is truthy and a
            # legitimate 0.0 mean is falsy.
            if train_losses:
                xm.master_print(f'TRAIN | Epoch {epoch + 1} : Loss = {epoch_loss}. Elapsed: {train_time}')
            else:
                print('Epoch loss is empty')

            ############ VALIDATION
            val_losses = []
            self.model.eval()

            val_t0 = time.time()

            # No gradients are needed for validation.
            with torch.no_grad():
                for batch in p_val_dl.per_device_loader(tpu_dev):
                    val_losses.append(self._batch_loss(batch, tpu_dev).item())

            val_time = format_time(time.time() - val_t0)
            val_epoch_loss = self._mean_loss(val_losses)

            if val_losses:
                xm.master_print(f'VAL | Epoch {epoch + 1} : Loss = {val_epoch_loss}. Elapsed: {val_time}')
            else:
                print('Empty val_epoch_loss')

            # Record all statistics from this epoch.
            train_stats.append(
                {
                    'epoch': epoch + 1,
                    'Training Loss': epoch_loss,
                    'Valid. Loss': val_epoch_loss,
                    'Training Time': train_time,
                    'Validation Time': val_time
                }
            )

            # os.path.join keeps the file inside save_dir even when the
            # configured path has no trailing separator.
            xm.save(self.model.state_dict(), os.path.join(config['save_dir'], 'model_e.pt'))

        xm.master_print(f'Total elapsed: {format_time(time.time() - t0)}')
        self.print_train_stats(train_stats)
        xm.rendezvous('leave')

def format_time(elapsed):
    """Format a duration in seconds as a ``datetime.timedelta`` string.

    The value is rounded to whole seconds first, e.g. ``3661.4`` becomes
    ``'1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def _mp_fn(rank, flags, trainer_obj):
    """Per-process entry point handed to ``xmp.spawn``.

    Args:
        rank: Process index passed in by the spawner; not used here.
        flags: First element of the ``args`` tuple given to ``xmp.spawn``
            (the config dict at the call site in this file); not used here.
        trainer_obj: Object exposing ``run_training(device)``.
    """
    torch.set_default_tensor_type('torch.FloatTensor')

    # initializing TPU device
    device = xm.xla_device()
    # Synchronization point named 'init' before training starts.
    xm.rendezvous('init')
    trainer_obj.run_training(device)

# ---- load the run configuration ----
conf_file = './config.json'

if in_colab:
    # In Colab the config lives on Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive')
    conf_file = 'drive/MyDrive/ArxivDS/colab_config.json'

with open(conf_file) as f:
    config = json.load(f)

print('Execution configuration:')
for key, value in config.items():
    # Key left-justified in a 30-character column, value right after it.
    print(f'{key:<30}{value}')

#os.environ["XRT_TPU_CONFIG"] = "tpu_worker;0;" + config['tpu_ip_address'] + ":8470"

# ---- load the dataset (first 80000 JSON-lines records) ----
df = pd.read_json(config['datafile'], lines=True, nrows=80000)
abstracts = df['abstract']

# ---- load GPT-2 and register the extra special tokens ----
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token='<BOS>',
    eos_token='<EOS>',
    pad_token='<PAD>',
)
gpt2_config = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
mdl = GPT2LMHeadModel.from_pretrained('gpt2', config=gpt2_config)
# The embedding matrix must grow to cover the tokens added above.
mdl.resize_token_embeddings(len(tokenizer))

trainer = ArxivAbstractGen(mdl, tokenizer)

# ---- train ----
# NOTE(review): with start_method='fork' and more than one core, training
# presumably happens in child processes, so the model held by `trainer` in
# this process may not receive the trained weights - confirm before relying
# on model_save() below in a multi-core run.
xmp.spawn(
    _mp_fn,
    args=(config, trainer),
    nprocs=config['num_tpu_cores'],
    start_method='fork',
)

trainer.model_save()

# ---- generate samples with the tuned model ----
trainer.generate_sample(
    top_k=config['decode_top_k'],
    max_l=config['decode_max_length'],
    top_p=config['decode_top_p'],
    num_seq=config['decode_num_test_samples'],
)

#export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470"



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Execution configuration:
encoding_max_length           256
datafile                      drive/MyDrive/ArxivDS/arxiv-abstracts.json
save_dir                      drive/MyDrive/ArxivDS/model_save/
num_tpu_cores                 1
decode_top_k                  50
decode_max_length             200
decode_top_p                  0.92
decode_num_test_samples       3
train_batch_size              2
train_size_percent            95
num_epochs                    5
lr                            5e-05
warmup_steps                  100
epsilon                       1e-08
mid_sample_enable             False
sample_every_steps            100


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


num_train_steps = 190000, TPU cores=1
TRAIN | Epoch 1 : Loss = 2.160737574042066. Elapsed: 1:31:42
VAL | Epoch 1 : Loss = 1.9624112636968494. Elapsed: 0:01:57
TRAIN | Epoch 2 : Loss = 1.9302689365407355. Elapsed: 1:30:48
VAL | Epoch 2 : Loss = 1.9046180317923427. Elapsed: 0:01:59
TRAIN | Epoch 3 : Loss = 1.841948399081434. Elapsed: 1:34:02
VAL | Epoch 3 : Loss = 1.8786731933131815. Elapsed: 0:02:00
TRAIN | Epoch 4 : Loss = 1.7830171864111173. Elapsed: 1:30:36
VAL | Epoch 4 : Loss = 1.8648985799849034. Elapsed: 0:01:58
TRAIN | Epoch 5 : Loss = 1.743164057972792. Elapsed: 1:30:17
VAL | Epoch 5 : Loss = 1.8573311190456152. Elapsed: 0:01:58
Total elapsed: 7:47:55
       Training Loss  Valid. Loss Training Time Validation Time
epoch                                                          
1               2.16         1.96       1:31:42         0:01:57
2               1.93         1.90       1:30:48         0:01:59
3               1.84         1.88       1:34:02         0:02:00
4           

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


saved
Generating sample:
0 :  planet   This is a comment on a review article by D. Cazenave, P. Geitel, J. A. Kulkarni, A. J. Shor and B. I. Bohm.  enger's papers ``Statistical analysis of the electromagnetic energy in a nonrelativistic plasma'' [arXiv:0808.4517]  enger's comments ``The case for the existence of Lorentz-invariant and Lorentz-invariant energy scales for the total electromagnetic energy in relativistic plasmas'' [arXiv:0808.5377]  enger's comments ``Nonrelativistic plasma models of the electromagnetic energy spectrum: implications of new physics models''
1 :  planet   We describe the recent development of the 3D magnetohydrodynamics code HADES for studying the evolution of magnetically-driven, rotating accretion disks. The code calculates stellar magnetospheres that are in a spin-down state, rotating rapidly, and which are highly magnetically-driven (R = 10^9 K and 3 degrees). As a first application, we calculate the vertical and radial distributions of accreting disks, 

In [None]:
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Execution configuration:
encoding_max_length           256
datafile                      drive/MyDrive/ArxivDS/arxiv-abstracts.json
save_dir                      drive/MyDrive/ArxivDS/model_save/
num_tpu_cores                 1
decode_top_k                  50
decode_max_length             200
decode_top_p                  0.92
decode_num_test_samples       3
train_batch_size              2
train_size_percent            95
num_epochs                    5
lr                            5e-05
warmup_steps                  100
epsilon                       1e-08
mid_sample_enable             False
sample_every_steps            100
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
num_train_steps = 190000, TPU cores=1
TRAIN | Epoch 1 : Loss = 2.160737574042066. Elapsed: 1:31:42
VAL | Epoch 1 : Loss = 1.9624112636968494. Elapsed: 0:01:57
TRAIN | Epoch 2 : Loss = 1.9302689365407355. Elapsed: 1:30:48
VAL | Epoch 2 : Loss = 1.9046180317923427. Elapsed: 0:01:59
TRAIN | Epoch 3 : Loss = 1.841948399081434. Elapsed: 1:34:02
VAL | Epoch 3 : Loss = 1.8786731933131815. Elapsed: 0:02:00
TRAIN | Epoch 4 : Loss = 1.7830171864111173. Elapsed: 1:30:36
VAL | Epoch 4 : Loss = 1.8648985799849034. Elapsed: 0:01:58
TRAIN | Epoch 5 : Loss = 1.743164057972792. Elapsed: 1:30:17
VAL | Epoch 5 : Loss = 1.8573311190456152. Elapsed: 0:01:58
Total elapsed: 7:47:55
       Training Loss  Valid. Loss Training Time Validation Time
epoch                                                          
1               2.16         1.96       1:31:42         0:01:57
2               1.93         1.90       1:30:48         0:01:59
3               1.84         1.88       1:34:02         0:02:00
4               1.78         1.86       1:30:36         0:01:58
5               1.74         1.86       1:30:17         0:01:58
saving the model
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
saved
Generating sample:
0 :  planet   This is a comment on a review article by D. Cazenave, P. Geitel, J. A. Kulkarni, A. J. Shor and B. I. Bohm.  enger's papers ``Statistical analysis of the electromagnetic energy in a nonrelativistic plasma'' [arXiv:0808.4517]  enger's comments ``The case for the existence of Lorentz-invariant and Lorentz-invariant energy scales for the total electromagnetic energy in relativistic plasmas'' [arXiv:0808.5377]  enger's comments ``Nonrelativistic plasma models of the electromagnetic energy spectrum: implications of new physics models''
1 :  planet   We describe the recent development of the 3D magnetohydrodynamics code HADES for studying the evolution of magnetically-driven, rotating accretion disks. The code calculates stellar magnetospheres that are in a spin-down state, rotating rapidly, and which are highly magnetically-driven (R = 10^9 K and 3 degrees). As a first application, we calculate the vertical and radial distributions of accreting disks, in the presence of their spin-down or spin-up conditions. We show that at the stellar poles the vertical and radial distributions of accretion disks are dominated by disk-jet components. Thus the horizontal and vertical distributions of these accreting disks follow a different kinematical and structural phase than those of normal accreting disks. The disk-jet components contribute substantially to the rotational magnetization, and this effect can be enhanced when the disk inclination is angle-resolved. We also show how to modify the disk dynamics to
2 :  planet   Theoretical predictions of the neutrino mass hierarchy for the first few neutrino oscillation period after leptogenesis are presented. Numerical simulations of an isotropic (0.1,0.1) neutrino oscillation with a fixed number of particles and a fixed effective temperature of 0.01 $\mu$B are performed. The numerical results show that if the initial neutrino mass hierarchy for the first one year is of order $\mu$B, the second one year should be much larger than the first one year. If the initial neutrino mass hierarchy has a constant value $\mu_{13}=2\nu_\mu^2$, then the neutrino mass hierarchy must be of order $\mu_{13}^{+2}$ for some range of $\mu \ll 1$ and $\mu \ll 6$ times larger than $\mu_{11}=1.3 \nu_\mu^2