<a href="https://colab.research.google.com/github/taoyilee/nlp_project/blob/colab/colab/Train_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Set up the dataset from BigQuery

In [0]:
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from typing import List

In [0]:
from google.colab import auth
from google.cloud import bigquery

# Authenticate the current Colab user.
# NOTE(review): presumably these are the credentials later picked up by
# bigquery.Client in BigQueryDataset — confirm.
auth.authenticate_user()
print('Authenticated')

In [0]:
%%capture
# Install the Hugging Face transformers package into the runtime.
# NOTE(review): the version is unpinned, while later cells rely on
# `transformers.AdamW` and the `lm_labels=` keyword — consider pinning the
# release this notebook was written against.
!pip install transformers

In [0]:
from transformers import GPT2DoubleHeadsModel, GPT2Tokenizer, PreTrainedTokenizer
# Tokenizer first: load the pretrained GPT-2 vocabulary and register the two
# extra special tokens used to format examples ("question [SEP] context [CLS]").
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'sep_token': '[SEP]', 'cls_token': '[CLS]'})

# Then the pretrained double-heads model, with its embedding matrix resized so
# it covers the tokens added above.
model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
!nvidia-smi

In [0]:
# Display the configuration of the loaded model.
model.config

In [0]:
%%capture
# Move the model to the selected device; the cell's %%capture hides the
# module repr that would otherwise be displayed.
model.to(device)

In [0]:

class BigQueryDataset(Dataset):
    """Map-style dataset that reads examples one row at a time from BigQuery.

    Each item is built from a row's ``question`` and ``context`` columns,
    formatted as ``"<question> [SEP] <context> [CLS]"``, tokenized, and
    returned together with the row's ``label`` column.

    NOTE(review): rows are addressed with ``LIMIT 1 OFFSET i`` and no
    ``ORDER BY``; BigQuery does not promise a stable row order in that case,
    so index ``i`` may not always map to the same row — confirm and add an
    ordering key if the table has one.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", project_name="focus-empire-270208", table_name="asnq.train", block_size=512):
        """Create a BigQuery client for ``project_name`` and remember the table.

        Args:
            tokenizer: object whose ``encode(text, max_length=...)`` returns token ids.
            project_name: project passed to ``bigquery.Client``.
            table_name: table queried for rows (e.g. ``"asnq.train"``).
            block_size: ``max_length`` passed to ``tokenizer.encode``.
        """
        print(f"Creating features from table {project_name}.{table_name}")

        self.tokenizer = tokenizer
        self.project_name = project_name
        self.table_name = table_name
        self.client = bigquery.Client(project=self.project_name)
        self.block_size = block_size
        # Row count is fetched lazily and cached; len() is called repeatedly
        # by samplers/loaders and each call used to issue a COUNT(*) query.
        self._num_rows = None

    def __len__(self):
        """Return the number of rows in the table as a plain ``int``."""
        if self._num_rows is None:
            query = f"SELECT COUNT(*) as total_rows FROM `{self.table_name}`"
            counts = self.client.query(query).result().to_dataframe()
            # Convert the numpy scalar to a builtin int.
            self._num_rows = int(counts.loc[0, "total_rows"])
        return self._num_rows

    def __getitem__(self, i):
        """Fetch row ``i`` and return ``(token_ids, label)``.

        Returns:
            A 1-D ``torch.long`` tensor of token ids and the row's ``label`` value.

        Raises:
            IndexError: if ``i`` is outside the table.
        """
        i = int(i)
        if i < 0:
            # Support Python-style negative indexing.
            i += len(self)
        if i < 0:
            raise IndexError("BigQueryDataset index out of range")

        query = f"SELECT * FROM `{self.table_name}` LIMIT 1 OFFSET {i}"
        frame = self.client.query(query).result().to_dataframe()
        if frame.empty:
            # OFFSET past the last row yields no rows.
            raise IndexError("BigQueryDataset index out of range")

        row = frame.iloc[0]
        x = f"{row['question']} [SEP] {row['context']} [CLS]"
        token_ids = self.tokenizer.encode(x, max_length=self.block_size)

        # torch.tensor accepts an explicit dtype; the legacy torch.LongTensor
        # constructor does not take a `dtype` keyword.
        return torch.tensor(token_ids, dtype=torch.long), row['label']

In [0]:
from google.colab import drive

In [0]:
# Mount Google Drive at /gdrive so later cells can write files under it.
drive.mount("/gdrive")

In [0]:
# Write a small text file to the mounted Drive folder to confirm it is writable.
drive_probe_path = '/gdrive/My Drive/UCI/06_Winter_2020/cs272_nlp/output/foo.txt'
with open(drive_probe_path, 'w') as probe_file:
    probe_file.write('Hello Google Drive!')

In [0]:
 # Training hyperparameters consumed by the optimizer/scheduler setup and the
 # training loop further down.

 # When > 0, used as the total number of optimizer steps instead of the
 # epoch-derived count.
 max_steps = 0
 # Divides the number of batches per epoch when computing the total step count.
 gradient_accumulation_steps = 1
 # Number of passes over the training dataloader.
 num_train_epochs=1
 # Weight decay for parameters other than biases and LayerNorm weights.
 weight_decay=0
 # Learning rate and epsilon passed to AdamW.
 learning_rate=5e-5
 adam_epsilon=1e-8
 # Number of warmup steps passed to the linear schedule.
 warmup_steps=0

In [0]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [0]:
# Total number of optimizer steps for the learning-rate schedule.
# NOTE(review): `train_dataloader` is created in a later cell, so this cell
# fails on a fresh kernel run top to bottom — consider moving it below the
# dataloader cell.
batches_per_epoch = len(train_dataloader) // gradient_accumulation_steps
if max_steps > 0:
    # An explicit step budget overrides the epoch count. These used to read
    # `args.max_steps`, but no `args` object exists in this notebook.
    t_total = max_steps
    num_train_epochs = max_steps // batches_per_epoch + 1
else:
    t_total = batches_per_epoch * num_train_epochs

# Prepare optimizer and schedule (linear warmup and decay).
# Biases and LayerNorm weights are excluded from weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

In [0]:
from tqdm import tqdm, trange

In [0]:
import random
import numpy as np
def set_seed(seed):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: integer seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [0]:
# Seed all random number generators for reproducibility.
set_seed(0)

# Start from epoch 0 with cleared gradients and build the epoch progress bar.
epochs_trained = 0
model.zero_grad()
train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch")

In [0]:
# Show the tokenizer's beginning- and end-of-sequence tokens, one per line.
print(tokenizer.bos_token, tokenizer.eos_token, sep="\n")

In [0]:
train_batch_size = 4


def collate(examples):
    """Combine `(token_ids, label)` examples into one padded batch.

    Args:
        examples: sequence of items whose element 0 is a 1-D token tensor and
            whose element 1 is the label.

    Returns:
        A tuple `(padded_ids, labels)`: the token tensors stacked batch-first
        and right-padded with `pad_sequence`'s default padding value, and the
        labels as a `torch.LongTensor`.
    """
    sequences = []
    labels = []
    for example in examples:
        sequences.append(example[0])
        labels.append(example[1])
    padded_ids = pad_sequence(sequences, batch_first=True)
    return padded_ids, torch.LongTensor(labels)

# Datasets backed by the train and dev tables.
train_dataset = BigQueryDataset(tokenizer, table_name="asnq.train")
dev_dataset = BigQueryDataset(tokenizer, table_name="asnq.dev")

# Randomly ordered training batches, padded by `collate`.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    sampler=train_sampler,
    collate_fn=collate,
)

# Smoke test: pull a single batch and show the token and label tensor shapes.
for i in train_dataloader:
    print(i[0].shape, i[1].shape)
    break

In [0]:
# Gradient-norm clipping threshold. The loop used to read this (and the
# accumulation step count) from an `args` object that does not exist in this
# notebook. 1.0 is a starting value — tune as needed.
max_grad_norm = 1.0

# Running totals; these were previously used without being initialized.
tr_loss = 0.0
global_step = 0

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    model.train()
    for step, (batch, batch_mc) in enumerate(epoch_iterator):
        # The same token batch serves as both the input and the LM target.
        # NOTE(review): the batch is padded by `collate` and used unmasked as
        # `lm_labels`, so padded positions presumably contribute to the LM
        # loss — confirm and mask if needed.
        inputs = batch.to(device)
        lm_labels = inputs
        mc_labels = batch_mc.to(device)

        outputs = model(inputs, lm_labels=lm_labels, mc_labels=mc_labels)
        loss = outputs[0] + outputs[1]  # model outputs are always tuple in transformers (see doc)

        if gradient_accumulation_steps > 1:
            # Scale so the accumulated gradient matches one larger batch.
            loss = loss / gradient_accumulation_steps
        loss.backward()

        tr_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

    #         if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
    #             # Log metrics
    #             if (
    #                 args.local_rank == -1 and args.evaluate_during_training
    #             ):  # Only evaluate when single GPU otherwise metrics may not average well
    #                 results = evaluate(args, model, tokenizer)
    #                 for key, value in results.items():
    #                     tb_writer.add_scalar("eval_{}".format(key), value, global_step)
    #             tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
    #             tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
    #             logging_loss = tr_loss

    #         if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
    #             checkpoint_prefix = "checkpoint"
    #             # Save model checkpoint
    #             output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
    #             os.makedirs(output_dir, exist_ok=True)
    #             model_to_save = (
    #                 model.module if hasattr(model, "module") else model
    #             )  # Take care of distributed/parallel training
    #             model_to_save.save_pretrained(output_dir)
    #             tokenizer.save_pretrained(output_dir)

    #             torch.save(args, os.path.join(output_dir, "training_args.bin"))
    #             logger.info("Saving model checkpoint to %s", output_dir)

    #             _rotate_checkpoints(args, checkpoint_prefix)

    #             torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
    #             torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
    #             logger.info("Saving optimizer and scheduler states to %s", output_dir)

    #     if args.max_steps > 0 and global_step > args.max_steps:
    #         epoch_iterator.close()
    #         break
    # if args.max_steps > 0 and global_step > args.max_steps:
    #     train_iterator.close()
    #     break