# Preliminary Investigations on Parallel Models

## Environment preparation

### Imports

In [1]:
import sys
# NOTE(review): machine-specific absolute path added to the module search path,
# presumably so the `transformer_wrappers` package can be imported from a local
# checkout; adjust it to your own checkout before running.
sys.path.append('/home/vincenzoscotti/Projects/transformer_wrappers/src')

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformer_wrappers.wrappers import ParallelTransformerWrapper, ParallelCausalLMWrapper

In [4]:
import os
import pickle

### Constants

In [5]:
# Ten colour aliases 'C0' ... 'C9' (presumably Matplotlib colour-cycle names).
COLOURS = list(map('C{}'.format, range(10)))
# Line-style names, listed in the order they are meant to be cycled through.
STYLES = [
    'solid',
    'dotted',
    'dashed',
    'dashdot',
]

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Bare expression so the notebook displays the selected device.
DEVICE

device(type='cuda')

In [7]:
# HuggingFace token, read from the HF_TOKEN environment variable so the secret
# never has to be pasted into (and committed with) the notebook.
# TOKEN is None when the variable is unset.
# NOTE(review): with None, `from_pretrained` is expected to fall back to the
# locally stored login credentials -- confirm for gated checkpoints.
TOKEN = os.environ.get('HF_TOKEN')

In [8]:
# Checkpoint under test. Alternatives that can be swapped in:
#   'gpt2-xl', 'meta-llama/Llama-2-7b-hf', 'google/gemma-7b'
MODEL = 'mistralai/Mistral-7B-Instruct-v0.2'
# Keyword arguments forwarded when the model is loaded.
MODEL_CONFIGS = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation='eager',
    device_map=DEVICE,
    token=TOKEN,
)

# Tokenizer checkpoint; the same alternatives as for MODEL apply:
#   'gpt2-xl', 'meta-llama/Llama-2-7b-hf', 'google/gemma-7b'
TOKENIZER = 'mistralai/Mistral-7B-Instruct-v0.2'
# Keyword arguments forwarded when the tokenizer is loaded.
TOKENIZER_CONFIGS = dict(token=TOKEN)

### Global

In [9]:
# Prompt shared by the baseline and wrapper runs in the tests below.
input_string = (
    'The quick brown fox jumps over the lazy dog.'
)

## Tests

### Transformer

#### Baseline

In [10]:
# Reference (unwrapped) tokenizer and transformer, loaded with the shared configs.
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER,
    **TOKENIZER_CONFIGS,
)
model = AutoModel.from_pretrained(
    MODEL,
    **MODEL_CONFIGS,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.09s/it]


In [11]:
# Tokenise the shared prompt into PyTorch tensors, then move them to DEVICE.
input_encodings = tokenizer(input_string, return_tensors='pt')
input_encodings = input_encodings.to(DEVICE)

In [12]:
# Single forward pass of the baseline model, requesting every intermediate
# result (attention weights, cache, hidden states) needed by the comparison.
# Autograd is switched off: nothing in this notebook back-propagates through
# the outputs, so recording the graph would only waste memory.
with torch.no_grad():
    output = model(
        **input_encodings,
        return_dict=True,
        output_attentions=True,
        use_cache=True,
        output_hidden_states=True
    )

In [13]:
# Store the baseline outputs on disk; the Comparison section reloads this file.
with open('.tmp_output_transformer.pkl', mode='wb') as pkl_file:
    pickle.dump(output, pkl_file)

#### Wrapper

In [10]:
# Load the wrapped transformer; the wrapper receives both the model and the
# tokenizer settings.
model = ParallelTransformerWrapper.from_pretrained(
    MODEL,
    model_kwargs=MODEL_CONFIGS,
    tokenizer_name_or_path=TOKENIZER,
    tokenizer_kwargs=TOKENIZER_CONFIGS,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.42it/s]


In [11]:
# Tokenise the shared prompt with the wrapper's own tokenizer, then move the
# resulting tensors to DEVICE.
input_encodings = model.tokenizer(input_string, return_tensors='pt')
input_encodings = input_encodings.to(DEVICE)

In [12]:
# Single forward pass of the wrapper, requesting the same intermediates as the
# baseline plus the per-layer attention and feed-forward outputs.
# Autograd is switched off: nothing in this notebook back-propagates through
# the outputs, so recording the graph would only waste memory.
with torch.no_grad():
    output_wrapper = model(
        **input_encodings,
        return_dict=True,
        output_attentions=True,
        use_cache=True,
        output_hidden_states=True,
        return_attention_output=True,  # Self-attention layer output
        return_feed_forward_output=True
    )

In [13]:
# Store the wrapper outputs on disk; the Comparison section reloads this file.
with open('.tmp_output_wrapper_transformer.pkl', mode='wb') as pkl_file:
    pickle.dump(output_wrapper, pkl_file)

#### Comparison

In [14]:
# Reload the wrapper outputs written by the Wrapper section.
# The file is produced by this notebook; do not point this at untrusted pickles.
with open('.tmp_output_wrapper_transformer.pkl', mode='rb') as pkl_file:
    output_wrapper = pickle.load(pkl_file)

In [15]:
# Reload the baseline outputs written by the Baseline section.
# The file is produced by this notebook; do not point this at untrusted pickles.
with open('.tmp_output_transformer.pkl', mode='rb') as pkl_file:
    output = pickle.load(pkl_file)

In [16]:
# The final hidden state of the wrapper must be exactly equal to the baseline's.
last_hidden_state_matches = torch.equal(
    output.last_hidden_state,
    output_wrapper['output_hidden_state'],
)
assert last_hidden_state_matches, '`last_hidden_state` not matching.'

In [17]:
# Compare the per-layer hidden states of the baseline and the wrapper.
# Index 0 is reported as the initial embedding. The entry whose index equals
# the number of layers is compared after passing the wrapper's tensor through
# `model.norm` (presumably the baseline reports its last hidden state after the
# final normalisation -- confirm). Every other layer is compared as-is.
# The branches are mutually exclusive (`elif`), so index 0 is checked once
# instead of being re-checked by the generic branch.
n_layers = len(model.layers)
for i, (output_hidden_state, output_wrapper_hidden_state) in enumerate(zip(
        output.hidden_states, output_wrapper['hidden_states']
)):
    if i == 0:
        assert torch.equal(
            output_hidden_state, output_wrapper_hidden_state
        ), 'Initial embedding tensors not matching.'
    elif i == n_layers:
        assert torch.equal(
            output_hidden_state, model.norm(output_wrapper_hidden_state)
        ), f'`hidden_state` tensors at layer {i} not matching.'
    else:
        assert torch.equal(
            output_hidden_state, output_wrapper_hidden_state
        ), f'`hidden_state` tensors at layer {i} not matching.'

In [18]:
# Rebuild each layer's hidden state from the wrapper's pieces (previous hidden
# state + attention output + feed-forward output) and compare it with the
# baseline hidden state of the same layer. For the last layer the rebuilt
# tensor goes through `model.norm` before the comparison.
composed_inputs = zip(
    output.hidden_states[1:],
    output_wrapper['hidden_states'][:-1],
    output_wrapper['attention_outputs'],
    output_wrapper['feed_forward_outputs']
)
for i, (reference_state, previous_state, attn_part, ffnn_part) in enumerate(composed_inputs, start=1):
    rebuilt_state = previous_state + attn_part + ffnn_part
    if i == len(model.layers):
        rebuilt_state = model.norm(rebuilt_state)
    assert torch.equal(
        reference_state, rebuilt_state
    ), f'Composed `hidden_state` tensors at layer {i} not matching.'

In [19]:
# Compare the cached key and value tensors layer by layer (layers are numbered
# from 1 in the error messages).
cache_pairs = zip(output.past_key_values, output_wrapper['cache'])
for i, ((reference_keys, reference_values), (wrapper_keys, wrapper_values)) in enumerate(cache_pairs, start=1):
    keys_match = torch.equal(reference_keys, wrapper_keys)
    assert keys_match, f'`key` tensors at layer {i} not matching.'
    values_match = torch.equal(reference_values, wrapper_values)
    assert values_match, f'`value` tensors at layer {i} not matching.'

In [20]:
# Compare the attention weights layer by layer (layers are numbered from 1 in
# the error messages).
attention_pairs = zip(output.attentions, output_wrapper['attention_weights'])
for i, (reference_attentions, wrapper_attentions) in enumerate(attention_pairs, start=1):
    attentions_match = torch.equal(reference_attentions, wrapper_attentions)
    assert attentions_match, f'`attentions` tensors at layer {i} not matching.'

### Language Model

#### Baseline

In [10]:
# Reference (unwrapped) tokenizer and causal language model, loaded with the
# shared configs.
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER,
    **TOKENIZER_CONFIGS,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    **MODEL_CONFIGS,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.43it/s]


In [11]:
# Tokenise the shared prompt into PyTorch tensors, then move them to DEVICE.
input_encodings = tokenizer(input_string, return_tensors='pt')
input_encodings = input_encodings.to(DEVICE)

In [12]:
# Deterministic generation (sampling disabled) from the prompt ids, with
# `max_length` set to 16.
output = model.generate(
    input_encodings.input_ids,
    do_sample=False,
    max_length=16,
)

In [13]:
# Store the baseline generation on disk; the Comparison section reloads this file.
with open('.tmp_output_lm.pkl', mode='wb') as pkl_file:
    pickle.dump(output, pkl_file)

#### Wrapper

In [10]:
# Load the wrapped causal language model; the wrapper receives both the model
# and the tokenizer settings.
model = ParallelCausalLMWrapper.from_pretrained(
    MODEL,
    model_kwargs=MODEL_CONFIGS,
    tokenizer_name_or_path=TOKENIZER,
    tokenizer_kwargs=TOKENIZER_CONFIGS,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.45it/s]


In [11]:
# Tokenise the shared prompt with the wrapper's own tokenizer, then move the
# resulting tensors to DEVICE.
input_encodings = model.tokenizer(input_string, return_tensors='pt')
input_encodings = input_encodings.to(DEVICE)

In [None]:
# Same generation call as the baseline (sampling disabled, `max_length` of 16),
# this time through the wrapper.
output_wrapper = model.generate(
    input_encodings.input_ids,
    do_sample=False,
    max_length=16,
)

In [None]:
# Store the wrapper generation on disk; the Comparison section reloads this file.
with open('.tmp_output_wrapper_lm.pkl', mode='wb') as pkl_file:
    pickle.dump(output_wrapper, pkl_file)

#### Comparison

In [14]:
# Reload the wrapper generation written by the Wrapper section.
# The file is produced by this notebook; do not point this at untrusted pickles.
with open('.tmp_output_wrapper_lm.pkl', mode='rb') as pkl_file:
    output_wrapper = pickle.load(pkl_file)

In [15]:
# Reload the baseline generation written by the Baseline section.
# The file is produced by this notebook; do not point this at untrusted pickles.
with open('.tmp_output_lm.pkl', mode='rb') as pkl_file:
    output = pickle.load(pkl_file)

In [16]:
# The wrapper's generated ids must be exactly equal to the baseline's.
sequences_match = torch.equal(output, output_wrapper['input_ids'])
assert sequences_match, 'Generated sequences not matching.'

### Generated text

In [9]:
# Load the wrapped causal language model used for the text-generation sweeps.
model = ParallelCausalLMWrapper.from_pretrained(
    MODEL,
    model_kwargs=MODEL_CONFIGS,
    tokenizer_name_or_path=TOKENIZER,
    tokenizer_kwargs=TOKENIZER_CONFIGS,
)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.12it/s]


In [18]:
# Tokenise the question prompt, then move the resulting tensors to DEVICE.
input_encodings = model.tokenizer("Q: Was Max Born the grandfather of Olivia Newton-John? A:", return_tensors='pt')
input_encodings = input_encodings.to(DEVICE)
# Alternative prompt kept for reference:
# input_encodings = model.tokenizer("Q: Son of an actor, this American guitarist and rock singer released many songs and albums and toured with his band. His name is \"Elvis\" what? A:", return_tensors='pt').to(DEVICE)

In [19]:
# Sweep the rates 1..8, keeping only those that evenly divide the number of
# layers, and print the text generated with `recursive=True` at each rate.
# The layer count is loop-invariant, so it is read once up front.
# NOTE(review): the exact meaning of `rate` and `recursive` is defined by the
# wrapper's `generate` -- confirm against its documentation.
n_layers = len(model.transformer_wrapper.layers)
for i in range(1, 9):  # n_layers + 1):
    if n_layers % i == 0:
        output_wrapper = model.generate(input_encodings.input_ids, rate=i, do_sample=False, max_length=64, recursive=True)
        output_text = model.tokenizer.decode(output_wrapper['input_ids'][0])
        # Label spelling fixed (was 'Prallelization').
        print(f'Parallelization rate: {i:2d} - Generated text: {repr(output_text)}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  1 - Generated text: '<s> Q: Son of an actor, this American guitarist and rock singer released many songs and albums and toured with his band. His name is "Elvis" what? A: Presley is the correct answer. Elvis Presley was an American actor and musician, a major cultural icon of the 2'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  2 - Generated text: '<s> Q: Son of an actor, this American guitarist and rock singer released many songs and albums and toured with his band. His name is "Elvis" what? A: The name is not "Elvis" - it\'s "Stevie" - the correct answer is "Stevie'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  4 - Generated text: '<s> Q: Son of an actor, this American guitarist and rock singer released many songs and albums and toured with his band. His name is "Elvis" what? A: Elvis\n````\n\n``````````````````````````````````````'
Prallelization rate:  8 - Generated text: '<s> Q: Son of an actor, this American guitarist and rock singer released many songs and albums and toured with his band. His name is "Elvis" what? A: helly, man, and thespand, . . . . . . . . . . . . . . . .'


In [15]:
# Sweep the rates 1..8, keeping only those that evenly divide the number of
# layers, and print the text generated with `recursive=False` at each rate.
# The layer count is loop-invariant, so it is read once up front.
# NOTE(review): the exact meaning of `rate` and `recursive` is defined by the
# wrapper's `generate` -- confirm against its documentation.
n_layers = len(model.transformer_wrapper.layers)
for i in range(1, 9):  # n_layers + 1):
    if n_layers % i == 0:
        output_wrapper = model.generate(input_encodings.input_ids, rate=i, do_sample=False, max_length=64, recursive=False)
        output_text = model.tokenizer.decode(output_wrapper['input_ids'][0])
        # Label spelling fixed (was 'Prallelization').
        print(f'Parallelization rate: {i:2d} - Generated text: {repr(output_text)}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  1 - Generated text: '<s> Q: Was Max Born the grandfather of Olivia Newton-John? A: No, Max Born was not the grandfather of Olivia Newton-John. Max Born was a German-born, British-Jewish physicist, and Olivia Newton-John is an Australian-born singer and actress.'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  2 - Generated text: '<s> Q: Was Max Born the grandfather of Olivia Newton-John? A: 1. S. A.\n\n1. A. D. A. A. D. A. A. D. A. A. D. A. A. D. A. A. D. A.'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  4 - Generated text: '<s> Q: Was Max Born the grandfather of Olivia Newton-John? A:,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
Prallelization rate:  8 - Generated text: '<s> Q: Was Max Born the grandfather of Olivia Newton-John? A: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #'


In [16]:
# Tokenise the reversed question prompt, then move the resulting tensors to DEVICE.
input_encodings = model.tokenizer("Q: Was Olivia Newton-John the granddaughter of Max Born? A:", return_tensors='pt')
input_encodings = input_encodings.to(DEVICE)

In [13]:
# Sweep the rates 1..8, keeping only those that evenly divide the number of
# layers, and print the text generated with `recursive=True` at each rate.
# The layer count is loop-invariant, so it is read once up front.
# NOTE(review): the exact meaning of `rate` and `recursive` is defined by the
# wrapper's `generate` -- confirm against its documentation.
n_layers = len(model.transformer_wrapper.layers)
for i in range(1, 9):  # n_layers + 1):
    if n_layers % i == 0:
        output_wrapper = model.generate(input_encodings.input_ids, rate=i, do_sample=False, max_length=64, recursive=True)
        output_text = model.tokenizer.decode(output_wrapper['input_ids'][0])
        # Label spelling fixed (was 'Prallelization').
        print(f'Parallelization rate: {i:2d} - Generated text: {repr(output_text)}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  1 - Generated text: '<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A: No, Olivia Newton-John is not the granddaughter of Max Born. Max Born was a German-born physicist, and Olivia Newton-John is an English-Australian singer and actress.</s>'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  2 - Generated text: '<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A: No. The two are unrelated. The late Max Born was a German-British physicist, mathematician, and philosopher of science, best-known for his work in the field of quantum'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  4 - Generated text: "<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A: O Ekwe-E, I meanwhile? A: Good dayn'nènkènènùkụnụnùkụnùkụnùkụnù\n\n"
Prallelization rate:  8 - Generated text: '<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A:Aheadousadaysaboutabanquabanquabakaard 20 20 20 20 20 20 20 20 20dayday  20 '


In [17]:
# Sweep the rates 1..8, keeping only those that evenly divide the number of
# layers, and print the text generated with `recursive=False` at each rate.
# The layer count is loop-invariant, so it is read once up front.
# NOTE(review): the exact meaning of `rate` and `recursive` is defined by the
# wrapper's `generate` -- confirm against its documentation.
n_layers = len(model.transformer_wrapper.layers)
for i in range(1, 9):  # n_layers + 1):
    if n_layers % i == 0:
        output_wrapper = model.generate(input_encodings.input_ids, rate=i, do_sample=False, max_length=64, recursive=False)
        output_text = model.tokenizer.decode(output_wrapper['input_ids'][0])
        # Label spelling fixed (was 'Prallelization').
        print(f'Parallelization rate: {i:2d} - Generated text: {repr(output_text)}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  1 - Generated text: '<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A: No, Olivia Newton-John is not the granddaughter of Max Born. Max Born was a German-born physicist, and Olivia Newton-John is an English-Australian singer and actress.</s>'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  2 - Generated text: '<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A: 1. S.\n\n1. A: 1. A: 1. A: 1. A: 1. A: 1. A: 1. A: 1. A'


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prallelization rate:  4 - Generated text: '<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A:,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
Prallelization rate:  8 - Generated text: '<s> Q: Was Olivia Newton-John the granddaughter of Max Born? A: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #'
