<a href="https://colab.research.google.com/github/zamandalee/lm-moral-preferences/blob/main/BATCHED_cs2592n_T5_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## T5 Preprocessing and Training with PyTorch

## 1. Install libraries

In [None]:
# Pinned because the cells below use the transformers 2.x call signatures
# (`lm_labels=`, `pad_to_max_length=`). `%pip` installs into the environment of
# the running kernel, whereas `!pip` uses whichever pip is first on the shell PATH.
%pip install transformers==2.9.0



In [None]:
# Check that a GPU is attached and inspect how much memory it has
!nvidia-smi

Tue May 10 04:08:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 2. Prepare Model

In [None]:

import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  """Seed the `random`, NumPy and PyTorch generators with the same value."""
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# Fix the seed once, up front, so repeated runs draw the same random numbers.
set_seed(42)

In [None]:
 !torch. __version__

/bin/bash: torch.: command not found


In [None]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
# so their vocabularies agree.
_T5_CHECKPOINT = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(_T5_CHECKPOINT)
t5_model = T5ForConditionalGeneration.from_pretrained(_T5_CHECKPOINT)


In [None]:
# Optimizer: split t5_model's parameters into a "decay" group and a "no decay"
# group by substring match on the parameter name, then hand both to AdamW.
# Both groups currently use weight_decay=0.0, so the split has no numerical effect.
# NOTE(review): confirm that "LayerNorm.weight" matches T5's parameter names
# before giving the first group a non-zero decay.
no_decay = ["bias", "LayerNorm.weight"]

decay_params, no_decay_params = [], []
for param_name, param in t5_model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": 0.0},
    {"params": no_decay_params, "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)



## 3. Prepare Data

In [None]:
# Mount Google Drive so the dataset files stored there are readable from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Alternative dataset locations (kept for reference):
# data_dir = ""
# data_dir = "/content/drive/MyDrive/moral_stories_datasets 2/generation/action|context/norm_distance/"
# data_dir = "/content/drive/My Drive/_CS2952N Advanced Topics in DL/cs2952n Final Project/moral_stories_datasets/generation/action|context/norm_distance/"
data_dir = "/content/drive/MyDrive/moral_stories_datasets/generation/action|context/norm_distance/"

# data_dir already ends with "/", so the split file names are appended directly.
train_data_dir, test_data_dir = (
    f"{data_dir}{split}.jsonl" for split in ("train", "test")
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the train/test JSONL splits and keep only the examples whose label is '1'.
import json


def _read_jsonl(path):
    """Return the JSON objects stored one per line in `path`, skipping blank lines."""
    # The context manager closes the file handle even if a line fails to parse.
    with open(path, 'r', encoding='utf-8') as jsonl_file:
        return [json.loads(line) for line in jsonl_file if line.strip()]


og_train_data = [ex for ex in _read_jsonl(train_data_dir) if ex['label'] == '1']
og_test_data = [ex for ex in _read_jsonl(test_data_dir) if ex['label'] == '1']

print("Train, test len: ", len(og_train_data), len(og_test_data))
print("Example OG data: \n", og_train_data[0])

FileNotFoundError: ignored

In [None]:
def preprocess(data, encoder_max_len=250, decoder_max_len=54, mask_label_padding=False):
    """Turn raw examples into tokenized T5 source inputs and target labels.

    Each example is rendered as the source text
    ``"answer_me: <intention> norm: <norm> context: <situation> </s>"`` and the
    target text ``"<moral_action> </s>"``. Both are tokenized with the
    notebook-level ``tokenizer`` and padded to a fixed length.

    Args:
        data: Iterable of dicts providing the keys ``'intention'``, ``'norm'``,
            ``'situation'`` and ``'moral_action'``.
        encoder_max_len: ``max_length`` passed to the tokenizer for the source text.
        decoder_max_len: ``max_length`` passed to the tokenizer for the target text.
        mask_label_padding: If True, label positions equal to
            ``tokenizer.pad_token_id`` are replaced by -100 (the default
            ``ignore_index`` of PyTorch's cross-entropy loss) so padding does not
            contribute to the training loss. Defaults to False, which returns
            the token ids unchanged.
            NOTE(review): confirm the pinned transformers version accepts -100
            in ``lm_labels`` before enabling this.

    Returns:
        A 6-tuple of parallel lists with one entry per example:
        ``(input_ids, masks, lm_labels, decoder_masks, question_pluses,
        answers_pluses)``. The first four hold the tensors returned by
        ``tokenizer.encode_plus(..., return_tensors="pt")``; the last two hold
        the formatted source and target strings.
    """
    input_ids, masks, lm_labels, decoder_masks = [], [], [], []
    question_pluses, answers_pluses = [], []

    for example in data:
        # For dataset_type == ACTION (action|context data):
        # pull out the intention, norm, situation and action fields.
        question = example['intention']
        norm = example['norm']
        context = example['situation']
        answer = example['moral_action']

        # Source and target strings, each terminated by the </s> marker.
        question_plus = f"answer_me: {question} norm: {norm} context: {context} </s>"
        answer_plus = f"{answer} </s>"

        question_pluses.append(question_plus)
        answers_pluses.append(answer_plus)

        # Tokenize both sides to fixed-length PyTorch tensors.
        encoder_inputs = tokenizer.encode_plus(
            question_plus, max_length=encoder_max_len,
            pad_to_max_length=True, return_tensors="pt"
        )
        decoder_inputs = tokenizer.encode_plus(
            answer_plus, max_length=decoder_max_len,
            pad_to_max_length=True, return_tensors="pt"
        )

        labels = decoder_inputs["input_ids"]
        if mask_label_padding:
            # Out-of-place fill so the tokenizer's own output tensor is untouched.
            labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

        input_ids.append(encoder_inputs["input_ids"])
        masks.append(encoder_inputs["attention_mask"])
        lm_labels.append(labels)
        decoder_masks.append(decoder_inputs["attention_mask"])

    return (input_ids, masks, lm_labels, decoder_masks, question_pluses, answers_pluses)

In [None]:
# Tokenize both splits; each call returns six parallel per-example lists.
(
    train_input_ids,
    train_masks,
    train_lm_labels,
    train_decoder_masks,
    train_question_pluses,
    train_answers_pluses,
) = preprocess(og_train_data)
(
    test_input_ids,
    test_masks,
    test_lm_labels,
    test_decoder_masks,
    test_question_pluses,
    test_answers_pluses,
) = preprocess(og_test_data)

# Show every field of the first training example as a sanity check.
print(
    "Example mapped data: \n",
    train_input_ids[0], "\n",
    train_masks[0], "\n",
    train_lm_labels[0], "\n",
    train_decoder_masks[0], "\n",
    train_question_pluses[0], "\n",
    train_answers_pluses[0],
)
# print("Example2 mapped data: \n", train_data[1])
# print("Example3 mapped data: \n", train_data[2])

## 3. Training Loop

In [None]:
# Train the model
# t5_model.train()

# epochs = 1
# batch_size = 30
# num_total_examples = 600

# for i, start_idx in enumerate(range(0, num_total_examples, batch_size)):
#     # Batch
#     end_idx = start_idx + batch_size
#     batch_train_input_ids = train_input_ids[start_idx:end_idx]
#     batch_train_masks = train_masks[start_idx:end_idx]
#     batch_train_lm_labels = train_lm_labels[start_idx:end_idx]
#     batch_train_decoder_masks = train_decoder_masks[start_idx:end_idx]
#     print(batch_train_input_ids)
#     print()
#     # Forward function automatically creates decoder_input_ids
#     output = t5_model(input_ids=torch.cat(batch_train_input_ids), lm_labels=torch.cat(batch_train_lm_labels),
#                       attention_mask=torch.cat(batch_train_masks),
#                       decoder_attention_mask=torch.cat(batch_train_decoder_masks))
#     loss = output[0]
#     loss.backward()
#     optimizer.step()
#     optimizer.zero_grad()

#     if (i % 100 == 0):
#       print("Example ", i, " ✅")

  # print ("Epoch ", epoch, " ✅")

## 4. Test model (NOT YET UPDATED — still need to measure accuracy on `test_data`)

### 3 epochs 200 examples

In [None]:
# TRY THIS!!! @WILL @MASON
# Sanity check: beam-search a few candidate answers for one preprocessed test
# example and print them below its formatted question and reference answer.

# test_input_ids, test_masks, test_lm_labels, test_decoder_masks, test_question_pluses, test_answers_pluses = preprocess(og_test_data)

# test_ex = test_data[0]
i = 0
print("Test 1: \n", test_question_pluses[i], "\n", test_answers_pluses[i])

t5_model.eval()

# Decoding settings kept together so they are easy to tweak in one place.
generation_kwargs = dict(
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=3,
    no_repeat_ngram_size=2,
)
beam_outputs = t5_model.generate(
    input_ids=test_input_ids[i],
    attention_mask=test_masks[i],
    **generation_kwargs,
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(
        beam_output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print(sent)

### 1 epoch 600 examples

In [None]:
# Fine-tune a fresh t5-base on the first 600 preprocessed training examples,
# then beam-search a few candidate answers for the first test example.
t5_model2 = T5ForConditionalGeneration.from_pretrained('t5-base')
t5_model2.train()

# The shared `optimizer` was built from t5_model's parameters, so stepping it
# would never update t5_model2. Build one over this model's own parameters,
# with the same hyperparameters.
optimizer2 = AdamW(t5_model2.parameters(), lr=3e-4, eps=1e-8, weight_decay=0.0)

epochs = 1
batch_size = 50
# Cap at the data actually available so short datasets do not yield empty batches.
num_train_examples = min(600, len(train_input_ids))

for epoch in range(epochs):
  for start_idx in range(0, num_train_examples, batch_size):
    # Batch
    end_idx = min(start_idx + batch_size, num_train_examples)

    # preprocess() stores one tensor per example (encode_plus with
    # return_tensors="pt"); torch.cat joins them along dim 0 into one batch
    # tensor, the same batching the earlier commented-out loop used.
    input_ids = torch.cat(train_input_ids[start_idx:end_idx])
    lm_labels = torch.cat(train_lm_labels[start_idx:end_idx])
    attention_mask = torch.cat(train_masks[start_idx:end_idx])
    decoder_attention_mask = torch.cat(train_decoder_masks[start_idx:end_idx])

    # Forward function automatically creates decoder_input_ids
    output = t5_model2(input_ids=input_ids, lm_labels=lm_labels,
                      attention_mask=attention_mask,
                      decoder_attention_mask=decoder_attention_mask)
    loss = output[0]
    loss.backward()
    optimizer2.step()
    optimizer2.zero_grad()

    # Report progress by example offset (every 100 examples), not batch index.
    if (start_idx % 100 == 0):
      print("Example ", start_idx, " ✅")

  # print ("Epoch ", epoch, " ✅")

# Evaluate on the first preprocessed test example.
test_idx = 0
print("Test 1: \n", test_question_pluses[test_idx], "\n", test_answers_pluses[test_idx])

t5_model2.eval()
beam_outputs = t5_model2.generate(
    input_ids=test_input_ids[test_idx],
    attention_mask=test_masks[test_idx],
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=3,
    no_repeat_ngram_size=2
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,
                            clean_up_tokenization_spaces=True)
    print(sent)

In [None]:
# Probe the model with a hand-written "falsify:" prompt and print the top
# beam-search outputs.
# NOTE(review): this queries `t5_model`, not the `t5_model2` trained in the
# cell above — confirm which model is intended.
test_sent = 'falsify: The sailor was happy and joyful. </s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

# Use prompt-specific names: `test_input_ids` is the list of preprocessed
# test-set tensors, and rebinding it here would break re-running the cells
# that index into it.
prompt_input_ids = test_tokenized["input_ids"]
prompt_attention_mask = test_tokenized["attention_mask"]

t5_model.eval()
beam_outputs = t5_model.generate(
    input_ids=prompt_input_ids,
    attention_mask=prompt_attention_mask,
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=3,
    no_repeat_ngram_size=2
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(sent)

In [None]:
# Probe the model with a hand-written "falsify:" prompt and print the top
# beam-search outputs.
# NOTE(review): this queries `t5_model`, not `t5_model2` — confirm which model
# is intended.
test_sent = 'falsify: This is a safe neighbourhood. </s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

# Use prompt-specific names: `test_input_ids` is the list of preprocessed
# test-set tensors, and rebinding it here would break re-running the cells
# that index into it.
prompt_input_ids = test_tokenized["input_ids"]
prompt_attention_mask = test_tokenized["attention_mask"]

t5_model.eval()
beam_outputs = t5_model.generate(
    input_ids=prompt_input_ids,
    attention_mask=prompt_attention_mask,
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=3,
    no_repeat_ngram_size=2
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(sent)

In [None]:
# Probe the model with a hand-written "falsify:" prompt and print the top
# beam-search outputs.
# NOTE(review): this queries `t5_model`, not `t5_model2` — confirm which model
# is intended.
test_sent = 'falsify: The tortoise was very slow. </s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

# Use prompt-specific names: `test_input_ids` is the list of preprocessed
# test-set tensors, and rebinding it here would break re-running the cells
# that index into it.
prompt_input_ids = test_tokenized["input_ids"]
prompt_attention_mask = test_tokenized["attention_mask"]

t5_model.eval()
beam_outputs = t5_model.generate(
    input_ids=prompt_input_ids,
    attention_mask=prompt_attention_mask,
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=3,
    no_repeat_ngram_size=2
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(sent)