## Assess various language models

In [14]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import copy
import openai

from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer 
from transformers import TrainingArguments, Trainer, Seq2SeqTrainer, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

In [15]:
# Pick the compute device: GPU when CUDA is available, otherwise CPU
if torch.cuda.is_available():
    device_type = 'cuda'
else:
    device_type = 'cpu'

In [16]:
# Read OpenAI API key (expected on the first line of openai_key.txt)
with open('openai_key.txt') as f:
    # strip() removes the trailing newline/whitespace that would otherwise
    # become part of the key string
    openai_key = f.readline().strip()
openai.api_key = openai_key # Set API key

In [18]:
# Prompt sent to the hosted model in the next cell
prompt_context = "hello world"

In [19]:
# LLM query - OpenAI
# Only the first choice is used below and temperature=0 makes the sampling
# greedy, so request a single completion instead of three.
response = openai.Completion.create(
    model='text-davinci-002',
    prompt=prompt_context,
    temperature=0,
    max_tokens=300,
    top_p=1,
    n=1,
    frequency_penalty=0,
    presence_penalty=0
)

agent_response = response['choices'][0]['text']

print(agent_response)



Hello, world! My name is John.


### Load datasets

In [3]:
# A notebook has no __file__, so data is located relative to the current
# working directory (resolved to its real path).
base_dir = os.path.realpath(os.getcwd())
data_dir = os.path.join(base_dir, 'data', 'text_question_answer')

print('Loading question and answer data...')
# The dataset contains non-ASCII text, so read it as UTF-8 explicitly
with open(os.path.join(data_dir, 'train-squad-v2.0.json'), 'r', encoding='utf-8') as f:
    train_qna_raw = json.load(f)


print('Place data into dataframe object...')
# Processing SQUAD data: one record per question, nested as
# data -> paragraphs -> qas in the raw JSON
squad_records = []
for article in train_qna_raw['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            # Questions without any answer get an empty answer string
            qa_answers = qa.get('answers') or []
            first_answer = qa_answers[0].get('text', '') if qa_answers else ''
            squad_records.append({
                'id': qa['id'],
                'context': paragraph['context'],
                'question': qa['question'],
                'answer': first_answer,
                'is_impossible': qa['is_impossible'],
            })

df_train = pd.DataFrame(squad_records,
                        columns=['id', 'context', 'question', 'answer', 'is_impossible'])

Loading question and answer data...
Place data into dataframe object...


In [4]:
# Preview the first five question/answer rows
df_train.head(n=5)

Unnamed: 0,id,context,question,answer,is_impossible
0,56be85543aeaaa14008c9063,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,False
1,56be85543aeaaa14008c9065,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,False
2,56be85543aeaaa14008c9066,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,False
3,56bf6b0f3aeaaa14008c9601,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",False
4,56bf6b0f3aeaaa14008c9602,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,False


### Text generation (causal language model)

In [67]:
# HuggingFace model
class HuggingFaceModelPack():
    '''
    Generic model pack class for HuggingFace Hub models.

    Wraps a model/tokenizer pair and provides:
      * embedding - normalised, fixed-length token embeddings for a text
      * predict   - text generation, optionally with per-step probabilities
      * fit       - fine-tuning in autoregressive or sequence to sequence format
    '''
    # `train_args` keys that may be omitted in `fit`, with the value used instead.
    # None for logging_dir leaves the choice to TrainingArguments.
    train_optional_args = {'output_dir': './results', 'logging_dir': None}

    # Initialize class variables
    def __init__(self, model, tokenizer, input_block_size, padding_length=100):
        '''
        Args:
            model: HuggingFace model object.
            tokenizer: tokenizer matching `model`.
            input_block_size: stored on the instance; not used by the methods below.
            padding_length: number of rows in the matrix returned by `embedding`.
        '''
        self.padding_length = padding_length
        self.tokenizer = tokenizer
        self.model = model
        self.input_block_size = input_block_size
        self.train_default_args = ['title', 'num_train_epochs', 'optimizer', 'mlm', 
                                   'per_device_train_batch_size', 'per_device_eval_batch_size',
                                   'warmup_steps', 'weight_decay', 'logging_steps', 
                                   'output_dir', 'logging_dir', 'save_model']
        
        # pad_token must be a token string, not a token id: reuse the EOS token
        # when the tokenizer has no padding token of its own.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
    
    # Embed text
    def embedding(self, text):
        '''
        Return the unit-normalised input embeddings of `text` as a
        (padding_length, embedding_dim) tensor.

        Rows beyond the number of tokens are zero; tokens beyond
        `padding_length` are dropped.
        '''
        token_ids = self.tokenizer.encode(text, return_tensors='pt')
        
        # Get embeddings through the model's generic accessor rather than
        # architecture-specific attributes (`shared` for FLAN,
        # `transformer.wte` for GPT2), so other architectures work as well.
        weight = self.model.get_input_embeddings().weight
        emb = weight[token_ids[0].to(weight.device)]
            
        emb = emb / emb.norm(dim=1, keepdim=True) # normalise embedding weights
        emb_pad = torch.zeros(self.padding_length, emb.shape[1]) # Set and apply padding to embeddings
        n_tokens = min(emb.shape[0], self.padding_length) # truncate texts longer than the padding length
        emb_pad[:n_tokens, :] = emb[:n_tokens]
        
        return emb_pad
    
    # Generate text
    def predict(self, text, max_length=100, skip_special_tokens=True, display_probability=False):
        '''
        Generate a continuation of `text`.

        Returns a dict with keys:
            'text': decoded output with newlines removed and ends stripped.
            'probability': None, or (when `display_probability` is True) a list
                with one {'token', 'probability'} entry per generation step,
                holding the highest-probability token of that step.
        '''
        output_context = {
            'text': None,
            'probability': None,
        }
        
        # Inputs must live on the same device as the model (the model may have
        # been moved by training)
        input_ids = self.tokenizer.encode(text, return_tensors='pt').to(self.model.device)
        output = self.model.generate(input_ids, 
                                     max_length=max_length,
                                     pad_token_id=self.model.config.eos_token_id,
                                     num_return_sequences=1, 
                                     output_scores=display_probability, 
                                     return_dict_in_generate=display_probability, 
                                     renormalize_logits=display_probability)

        # Decode the output sequence
        if display_probability:
            output_context['text'] = self.tokenizer.decode(output['sequences'][0], skip_special_tokens=skip_special_tokens)
            step_probabilities = []
            for step_scores in output['scores']:
                # One softmax per step; the top entry gives both token and probability
                probs = nn.Softmax(dim=-1)(step_scores)
                step_probabilities.append({
                    'token': self.tokenizer.decode(torch.argmax(probs).item()),
                    'probability': torch.max(probs).item(),
                })
            output_context['probability'] = step_probabilities
        else:
            output_context['text'] = self.tokenizer.decode(output[0], skip_special_tokens=skip_special_tokens)
        output_context['text'] = output_context['text'].replace('\n', '')
        output_context['text'] = output_context['text'].strip()
        
        return output_context
    
    # Tokenize function
    def _tokenize_function(self, examples):
        '''Tokenize the 'text' field of a dataset example (or batch of examples).'''
        return self.tokenizer(examples['text'])
    
    # Tokenize pandas dataframe feature
    def tokenize_text(self, x, batched=False):  
        '''
        Build a HuggingFace dataset with a 'text' column from `x` and add the
        tokenizer outputs to it. With `batched=True` the mapping runs in
        batches over 4 processes.
        '''
        df_sample = pd.DataFrame({'text': x})
        hf_dataset = Dataset.from_pandas(df_sample)
        if batched:
            tokenized_dataset = hf_dataset.map(self._tokenize_function, batched=batched, num_proc=4)
        else:
            tokenized_dataset = hf_dataset.map(self._tokenize_function)

        return tokenized_dataset
    
    # Model training
    def fit(self, x, y, train_args=None, instruct=False):
        '''
        Fine-tune the wrapped model.

        Args:
            x: input texts.
            y: target texts; only used when `instruct` is True.
            train_args: dict with the keys listed in `self.train_default_args`;
                'output_dir' and 'logging_dir' may be omitted.
            instruct: True trains in sequence to sequence format (targets become
                labels), False trains in autoregressive format.

        Raises:
            AssertionError: if `train_args` lacks a required key or contains
                an unknown one.
        '''
        train_args = dict(train_args or {})

        # Check for missing input arguments before doing any tokenization work
        required_args = set(self.train_default_args) - set(self.train_optional_args)
        missing_args = sorted(required_args - set(train_args))
        assert not missing_args, \
        f'Train args are not in the required format - missing: {", ".join(missing_args)}'
        unknown_args = sorted(set(train_args) - set(self.train_default_args))
        assert not unknown_args, \
        f'Train args are not in the required format - unexpected: {", ".join(unknown_args)}'

        output_dir = train_args.get('output_dir', self.train_optional_args['output_dir'])
        logging_dir = train_args.get('logging_dir', self.train_optional_args['logging_dir'])

        # Convert to tokens format from pandas dataframes
        tokenized_data = self.tokenize_text(x)
        
        if instruct:
            print('Setting up training in sequence to sequence format...')
            tokenized_target = self.tokenize_text(y)
            tokenized_data = tokenized_data.add_column('labels', tokenized_target['input_ids']) # Create target sequence labels
            data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=self.model) # Organise data for training
        else:
            print('Setting up training in autoregressive format...')
            data_collator = DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm=train_args['mlm']) # Organise data for training

        # Both formats share the same training configuration
        training_args = TrainingArguments(
            optim=train_args['optimizer'], # model optimisation function
            num_train_epochs=train_args['num_train_epochs'], # total number of training epochs
            per_device_train_batch_size=train_args['per_device_train_batch_size'],  # batch size per device during training
            per_device_eval_batch_size=train_args['per_device_eval_batch_size'],   # batch size for evaluation
            warmup_steps=train_args['warmup_steps'], # number of warmup steps for learning rate scheduler
            weight_decay=train_args['weight_decay'], # strength of weight decay
            logging_steps=train_args['logging_steps'],
            output_dir=output_dir, # output directory
            logging_dir=logging_dir, # log directory
        )

        # The plain Trainer is used for both formats: the data collator decides
        # how labels are built, and Seq2SeqTrainer is meant to be paired with
        # Seq2SeqTrainingArguments rather than TrainingArguments.
        train_dataset = tokenized_data.remove_columns(['text'])
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=train_dataset,
            data_collator=data_collator,
        )

        trainer.train() # Execute training
        
        if train_args['save_model']:
            # Save trained model inside the configured output directory
            trainer.save_model(os.path.join(output_dir, f'model_{train_args["title"]}'))
    
# OpenAI model
class OpenAIModelPack():
    '''
    OpenAI model class.

    Thin wrapper around the `openai.Completion` endpoint that returns results
    in the same dict layout as the HuggingFace model pack.
    '''
    def __init__(self, model, api_key):
        '''
        Args:
            model: name of the OpenAI completion model to query.
            api_key: OpenAI API key; surrounding whitespace (e.g. a trailing
                newline from a key file) is removed.

        Note: the key is set on the `openai` module, i.e. globally.
        '''
        self.model = model
        # The client reads `openai.api_key`; assigning `openai.key` has no effect
        openai.api_key = api_key.strip() if isinstance(api_key, str) else api_key
    
    # Generate text
    def predict(self, text):
        '''
        Complete `text` with the configured model.

        Returns a dict with keys 'text' (the first completion) and
        'probability' (always None for this backend).
        '''
        output_context = {
            'text': None,
            'probability': None,
        }
        # Only the first choice is returned and temperature=0 makes decoding
        # greedy, so a single completion is requested.
        response = openai.Completion.create(
            model=self.model,
            prompt=text,
            temperature=0,
            max_tokens=300,
            top_p=1,
            n=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        output_context['text'] = response['choices'][0]['text']

        return output_context
    
                     
class ModelPack():
    '''
    Main model pack class.

    Facade that builds the source-specific model pack (HuggingFace or OpenAI)
    and forwards every attribute it does not define itself to that pack.
    '''
    def __init__(self, 
                 model,
                 tokenizer=None, 
                 input_block_size=10, 
                 padding_length=100, 
                 source='huggingface', 
                 api_key=None):
        '''
        Args:
            model: HuggingFace model object, or OpenAI model name.
            tokenizer: required when `source` is 'huggingface'.
            input_block_size: passed through to the HuggingFace pack.
            padding_length: passed through to the HuggingFace pack.
            source: 'huggingface' or 'openai'.
            api_key: required when `source` is 'openai'.

        Raises:
            AssertionError: if a source-specific requirement is not met.
            ValueError: if `source` is not a supported source.
        '''
        
        self.padding_length = padding_length
        self.tokenizer = tokenizer
        self.model = model
        self.input_block_size = input_block_size
        self.source = source
        self.api_key = api_key
        
        # Accepted models from sources (only the openai list is enforced below)
        self.accepted_models = {
            'huggingface': [
                'distilgpt2', 
                'google/flan-t5-base'
            ],
            'openai': [
                'text-davinci-002', 
                'text-davinci-003'
            ],
        }
        
        # HuggingFace model call
        if self.source == 'huggingface':
            assert self.tokenizer is not None, 'tokenizer required for HuggingFace model'
            self.instance = HuggingFaceModelPack(self.model, 
                                                 self.tokenizer, 
                                                 self.input_block_size, 
                                                 self.padding_length)
        # OpenAI model call
        elif self.source == 'openai':
            assert self.api_key is not None, 'api key has not been specified'
            assert self.model in self.accepted_models['openai'], 'model name is not found in accepted openai models: ' + ' '.join(self.accepted_models['openai'])
            self.instance = OpenAIModelPack(model=self.model, api_key=self.api_key)
        # Fail early: without a sub model pack every forwarded call would break
        else:
            raise ValueError(f'unknown model source {self.source!r}, expected one of: '
                             + ', '.join(self.accepted_models))
                
    # Direct to the attribute of the sub model pack class (attribute not found in the main model pack class)
    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails. If `instance` itself
        # is missing (e.g. on an object created without __init__ by copy or
        # pickle), looking it up here would recurse forever.
        if name == 'instance':
            raise AttributeError(name)
        return getattr(self.instance, name)
    
    
# Calculate cosine similarity of two matrices    
def cosine_similarity(m1, m2):
    '''
    Cosine similarity between two tensors, each flattened to a single vector.

    Both tensors must hold the same number of elements. Returns a tensor of
    shape (1,).
    '''
    # reshape (unlike view) also accepts non-contiguous tensors such as
    # transposes or slices
    return F.cosine_similarity(m1.reshape(1, -1), m2.reshape(1, -1))

In [68]:
# Hosted GPT-3 model accessed through the OpenAI API
lm_gpt3 = ModelPack(
    model='text-davinci-002',
    api_key=openai_key,
    source='openai',
)

In [69]:
# Smoke test of the OpenAI-backed model pack
lm_gpt3.predict(text='hello world')

{'text': '\n\nHello, world! My name is John.', 'probability': None}

In [26]:
# GPT2 small (distilled): base model and tokenizer from the HuggingFace Hub
model_dgpt2 = AutoModelForCausalLM.from_pretrained('distilgpt2')
tokenizer_dgpt2 = AutoTokenizer.from_pretrained('distilgpt2', mirror='https://huggingface.co')

# Independent copy of the weights for fine-tuning, so the base model stays
# available for comparison
model_dgpt2_train = copy.deepcopy(model_dgpt2)

# Wrap both models (sharing one tokenizer) in HuggingFace model packs
lm_gpt2 = ModelPack(
    model_dgpt2,
    tokenizer_dgpt2,
    input_block_size=10,
    padding_length=100,
    source='huggingface',
)
lm_gpt2_train = ModelPack(
    model_dgpt2_train,
    tokenizer_dgpt2,
    input_block_size=10,
    padding_length=100,
    source='huggingface',
)

Using pad_token, but it is not set yet.


In [59]:
# Generate with the base GPT2 model and keep the per-step probabilities
output = lm_gpt2.predict(
    'hello world is',
    max_length=30,
    display_probability=True,
)
output['text']

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}



'hello world is a place where people can live and work together, and where people can live and work together, and where people can live and work together'

In [60]:
# Highest-probability token and its probability at each generation step
output.get('probability')

[{'token': ' a', 'probability': 0.05274742469191551},
 {'token': ' place', 'probability': 0.04598025605082512},
 {'token': ' where', 'probability': 0.4814596474170685},
 {'token': ' people', 'probability': 0.27657586336135864},
 {'token': ' can', 'probability': 0.2809840440750122},
 {'token': ' live', 'probability': 0.06638554483652115},
 {'token': ' and', 'probability': 0.26385536789894104},
 {'token': ' work', 'probability': 0.23955577611923218},
 {'token': ' together', 'probability': 0.2017972469329834},
 {'token': ',', 'probability': 0.24905261397361755},
 {'token': ' and', 'probability': 0.17192186415195465},
 {'token': ' where', 'probability': 0.06549273431301117},
 {'token': ' people', 'probability': 0.21751831471920013},
 {'token': ' can', 'probability': 0.6969819664955139},
 {'token': ' live', 'probability': 0.12064892798662186},
 {'token': ' and', 'probability': 0.7049790024757385},
 {'token': ' work', 'probability': 0.9612843990325928},
 {'token': ' together', 'probability':

In [45]:
# Fine-tune the GPT2 copy on the first ten SQuAD questions
df_train_sample = df_train.iloc[:10].copy()

train_args = dict(
    title='gpt2',
    num_train_epochs=50,
    mlm=False,
    optimizer='adamw_torch',
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    warmup_steps=20,
    weight_decay=0.01,
    logging_steps=10,
    output_dir='./results',
    logging_dir='./logs',
    save_model=True,
)

# Training data and targets are both the question text (autoregressive
# training learns to continue the input itself)
x = df_train_sample['question']
y = df_train_sample['question']

# Train model
lm_gpt2_train.fit(x, y, train_args, instruct=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 10
  Num Epochs = 50
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 50
  Number of trainable parameters = 81912576
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Setting up training in autoregressive format...


Step,Training Loss
10,5.2478
20,3.4191
30,1.658
40,0.9854
50,0.8517




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./results/model_gpt2
Configuration saved in ./results/model_gpt2/config.json
Configuration saved in ./results/model_gpt2/generation_config.json
Model weights saved in ./results/model_gpt2/pytorch_model.bin


In [46]:
# Baseline: continuation from the untouched GPT2 model
lm_gpt2.predict(text='When did')['text']

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}



'When did you know that the first time you saw a woman in a car, you were shocked to find that she was wearing a black dress.'

In [47]:
# Same prompt through the fine-tuned GPT2 copy
lm_gpt2_train.predict(text='When did', max_length=50)['text']

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}



'When did Beyonce start becoming popular? Beyonce was born? Beyonce was born in the U.S. and raised in the U.S. and was raised in the UK? Beyonce was a singer? Beyonce was a singer?'

In [257]:
# FLAN T5 base: sequence to sequence model and tokenizer from the HuggingFace Hub
model_flan = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base')
tokenizer_flan = AutoTokenizer.from_pretrained('google/flan-t5-base', mirror='https://huggingface.co')

# Independent copy of the weights for fine-tuning
model_flan_train = copy.deepcopy(model_flan)

# Wrap both models (sharing one tokenizer) in HuggingFace model packs
lm_flan = ModelPack(
    model_flan,
    tokenizer_flan,
    input_block_size=10,
    padding_length=200,
    source='huggingface',
)
lm_flan_train = ModelPack(
    model_flan_train,
    tokenizer_flan,
    input_block_size=10,
    padding_length=200,
    source='huggingface',
)

loading file spiece.model from cache at /Users/williamzheng/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/c782cba52f8ea6a704240578055cf1c3fc2f2ca9/spiece.model
loading file tokenizer.json from cache at /Users/williamzheng/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/c782cba52f8ea6a704240578055cf1c3fc2f2ca9/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /Users/williamzheng/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/c782cba52f8ea6a704240578055cf1c3fc2f2ca9/special_tokens_map.json
loading file tokenizer_config.json from cache at /Users/williamzheng/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/c782cba52f8ea6a704240578055cf1c3fc2f2ca9/tokenizer_config.json
loading configuration file config.json from cache at /Users/williamzheng/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/c782cba52f8ea6a704240578055cf1c3fc2f2ca9/config.json
Model config T

In [258]:
# Fine-tune the FLAN T5 copy on the first ten SQuAD question/answer pairs
df_train_sample = df_train.head(10).copy()

train_args = {
    'title': 'flan_t5',
    'num_train_epochs' : 50,
    'mlm': False,
    'optimizer': 'adamw_torch',
    'per_device_train_batch_size': 10,
    'per_device_eval_batch_size': 10,
    'warmup_steps': 20,
    'weight_decay': 0.01,
    'logging_steps': 10,
    # fit() validates that every training key is present, including the
    # output and logging directories
    'output_dir': './results',
    'logging_dir': './logs',
    'save_model': True,
}

# Set training data and targets
x = df_train_sample['question']
y = df_train_sample['answer']

# Train model
lm_flan_train.fit(x, y, train_args, instruct=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 10
  Num Epochs = 50
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 50
  Number of trainable parameters = 247577856
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Setting up training in sequence to sequence format...


Step,Training Loss
10,2.3681
20,1.6826
30,1.076
40,0.7545
50,0.6295




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./results/model_flan_t5
Configuration saved in ./results/model_flan_t5/config.json
Configuration saved in ./results/model_flan_t5/generation_config.json
Model weights saved in ./results/model_flan_t5/pytorch_model.bin


In [259]:
# Baseline: answer from the untouched FLAN T5 model
lm_flan.predict(text='What is the meaning of life?')['text']

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



'life is a cycle of life'

In [263]:
# Same question through the fine-tuned FLAN T5 copy
lm_flan_train.predict(text='What is the meaning of life?')['text']

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



'Life is the meaning of all that comes to you.'