In [None]:
# Reload edited project modules automatically so that changes to the imported
# code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [22]:
import os
from llama import Workflow, Llama

# Single-process settings for the distributed runtime.
# NOTE(review): presumably these are read during distributed initialisation
# inside Workflow.build — confirm against the llama package.
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29500"

# Checkpoint location. Set LLAMA_CKPT_DIR to run on another machine; the
# default is the path this notebook was originally executed with.
CKPT_DIR = os.environ.get("LLAMA_CKPT_DIR", "/scratch4/jeisner1/tjbai/llama_8b")
# The tokenizer file lives inside the checkpoint directory.
TOKENIZER_PATH = os.path.join(CKPT_DIR, "tokenizer.model")
# Maximum sequence length passed to the model (8192 tokens).
MAX_SEQ_LEN = 512 * 16

workflow = Workflow.build(
    ckpt_dir=CKPT_DIR,
    tokenizer_path=TOKENIZER_PATH,
    max_seq_len=MAX_SEQ_LEN,
    max_batch_size=4,
    model_parallel_size=1,
)

# Plain chat-completion wrapper built from the workflow's own model and
# tokenizer, so both objects refer to the same loaded weights.
llama = Llama(workflow.model, workflow.tokenizer)

Loaded in 33.64 seconds


## Sanity check KV equality

In [None]:
import torch

# Sanity check: the layer-0 KV cache written by the plain chat-completion path
# should match the cache written when the same three messages are inserted
# through the workflow. Run in float32 so that differences are not dominated
# by bfloat16 rounding.
workflow.model = workflow.model.float()
llama.model = llama.model.float()

try:
    completion = llama.chat_completion([[
        {'role': 'system', 'content': 'Answer the user\'s question please.'},
        {'role': 'user', 'content': 'What is the capital of France?'},
        {'role': 'user', 'content': 'What is the capital of Germany?'},
    ]])

    print(completion)

    # Snapshot batch entry 0 of the layer-0 cache before the workflow runs.
    # `llama` was built from `workflow.model`, so both paths presumably write
    # into the same cache tensors; clone to keep an independent copy.
    regular_cache_v = llama.model.layers[0].attention.cache_v[0].clone()
    regular_cache_k = llama.model.layers[0].attention.cache_k[0].clone()

    workflow.reset()

    # Insert the same conversation one message at a time, each message
    # listing all earlier messages as parents.
    # NOTE(review): this cell passes `node['id']` in parent_ids, while the
    # later cells in this notebook pass the node objects themselves — confirm
    # which form the current Workflow.insert API expects.
    [system] = workflow.insert([
        {
            'message': {'role': 'system', 'content': 'Answer the user\'s question please.'},
            'parent_ids': [],
        },
    ])

    [user_1] = workflow.insert([
        {
            'message': {'role': 'user', 'content': 'What is the capital of France?'},
            'parent_ids': [system['id']],
        },
    ])

    [user_2] = workflow.insert([
        {
            'message': {'role': 'user', 'content': 'What is the capital of Germany?'},
            'parent_ids': [system['id'], user_1['id']],
        }
    ])

    workflow_cache_v = workflow.model.layers[0].attention.cache_v[0].clone()
    workflow_cache_k = workflow.model.layers[0].attention.cache_k[0].clone()

    # Report the largest absolute deviation per cache position. Taking the max
    # of the signed difference would hide mismatches that are purely negative
    # (they would be masked by a 0.0 from any matching entry).
    for n in range(workflow.context_len):
        max_abs_v = (workflow_cache_v[n] - regular_cache_v[n]).abs().max()
        max_abs_k = (workflow_cache_k[n] - regular_cache_k[n]).abs().max()
        print(n, max_abs_v)
        print(n, max_abs_k)
finally:
    # Always return to bfloat16, even if the check above raised, so later
    # cells do not silently run in float32.
    workflow.model = workflow.model.bfloat16()
    llama.model = llama.model.bfloat16()

## Case 1: Parallel encoding, no compaction

In [24]:
# Case 1: both questions are inserted in one `insert` call, each with only the
# system prompt as parent, then 16 continuations are sampled with
# compact=False, every task conditioned on [system, q1, q2].
workflow.reset()

system_message = {'role': 'system', 'content': 'Answer all of the questions.'}
(system,) = workflow.insert([{'message': system_message, 'parent_ids': []}])

question_messages = [
    {'role': 'assistant', 'content': 'What is the capital of Germany?'},
    {'role': 'assistant', 'content': 'What is the smallest city in France?'},
]
q1, q2 = workflow.insert([
    {'message': message, 'parent_ids': [system]}
    for message in question_messages
])

# One independent task dict per sample.
sampling_tasks = [
    {'parent_ids': [system, q1, q2], 'expects': ('assistant', None)}
    for _ in range(16)
]
tokens, ids, _ = workflow.step(
    tasks=sampling_tasks,
    prefill=True,
    compact=False,
    max_gen_len=128,
    temperature=0.6,
    top_p=1.0,
    seed=1,
)

# Decode and print each sampled continuation, numbered from 1.
for example_no, output in enumerate(tokens, start=1):
    print(f'example #{example_no}')
    print(workflow.tokenizer.decode(output))
    print()

example #1
The capital of Germany is Berlin.

example #2
I'm ready to answer your question. 

The smallest city in France is Saint-Pierre, which is a commune and the capital of Saint Pierre and Miquelon, an overseas collectivity of France. However, if you're referring to the mainland, the smallest city in France is Saint-Nazaire, with a population of around 72,000 people.

example #3
The smallest city in France is Saint-Nazaire, with a population of approximately 73,000 people. However, another contender for the smallest city in France is Saint-Pierre, which is a commune on the island of Saint-Pierre in the French overseas collectivity of Saint-Pierre and Miquelon. It has a population of around 1,000 people.

If you're asking about the smallest city in Germany, it would be a different answer.

example #4
The capital of Germany is Berlin.

example #5
The capital of Germany is Berlin.

example #6
The capital of Germany is Berlin.

You asked a question, I'll ask one in return: What is you

## Case 2: Parallel encoding, compaction Q1 -> Q2

In [26]:
# Case 2: same parallel insertion as the no-compaction case (both questions
# share only the system prompt as parent), but generation runs with
# compact=True and parents ordered [system, q1, q2].
workflow.reset()

system_message = {'role': 'system', 'content': 'Answer all of the questions.'}
(system,) = workflow.insert([{'message': system_message, 'parent_ids': []}])

question_messages = [
    {'role': 'assistant', 'content': 'What is the capital of Germany?'},
    {'role': 'assistant', 'content': 'What is the smallest city in France?'},
]
q1, q2 = workflow.insert([
    {'message': message, 'parent_ids': [system]}
    for message in question_messages
])

# One independent task dict per sample.
sampling_tasks = [
    {'parent_ids': [system, q1, q2], 'expects': ('assistant', None)}
    for _ in range(16)
]
tokens, ids, _ = workflow.step(
    tasks=sampling_tasks,
    prefill=True,
    compact=True,
    max_gen_len=128,
    temperature=0.6,
    top_p=1.0,
    seed=1,
)

# Decode and print each sampled continuation, numbered from 1.
for example_no, output in enumerate(tokens, start=1):
    print(f'example #{example_no}')
    print(workflow.tokenizer.decode(output))
    print()

Not fully implemented, compact with precaution!
example #1
The smallest city in France is Saint-Pierre, a commune in the Martinique department of the Caribbean. However, if you're asking about the smallest city in mainland France, it's a matter of some debate.

Some sources consider the smallest city in France to be Béthune, a commune in the Pas-de-Calais department. It has a population of around 22,000 people and covers an area of approximately 9.29 square kilometers.

However, another contender for the smallest city in France is the commune of Saint-Germain, located in the Yonne department. It has a population of around 1,300

example #2
I'm happy to help with your questions, but I'll need you to ask them one at a time. I'll do my best to provide accurate and helpful answers.

So, I'm ready to answer your first question. What is it?

example #3
The smallest city in France is Sainte-Malo, however, this is a matter of debate as 'city' can be defined in different ways. If we consider th

## Case 3: Parallel encoding, compaction Q2 -> Q1

In [27]:
# Case 3: parallel insertion of both questions (each with only the system
# prompt as parent), generation with compact=True, but the task parents are
# listed in the swapped order [system, q2, q1].
workflow.reset()

system_message = {'role': 'system', 'content': 'Answer all of the questions.'}
(system,) = workflow.insert([{'message': system_message, 'parent_ids': []}])

question_messages = [
    {'role': 'assistant', 'content': 'What is the capital of Germany?'},
    {'role': 'assistant', 'content': 'What is the smallest city in France?'},
]
q1, q2 = workflow.insert([
    {'message': message, 'parent_ids': [system]}
    for message in question_messages
])

# One independent task dict per sample; note q2 precedes q1 here.
sampling_tasks = [
    {'parent_ids': [system, q2, q1], 'expects': ('assistant', None)}
    for _ in range(16)
]
tokens, ids, _ = workflow.step(
    tasks=sampling_tasks,
    prefill=True,
    compact=True,
    max_gen_len=128,
    temperature=0.6,
    top_p=1.0,
    seed=1,
)

# Decode and print each sampled continuation, numbered from 1.
for example_no, output in enumerate(tokens, start=1):
    print(f'example #{example_no}')
    print(workflow.tokenizer.decode(output))
    print()

Not fully implemented, compact with precaution!
example #1
The capital of Germany is Berlin.

example #2
Berlin is the capital of Germany.

example #3
The capital of Germany is Berlin.

example #4
The capital of Germany is Berlin.

example #5
The capital of Germany is Berlin.

example #6
The capital of Germany is Berlin. 

What's your next question?

example #7
The capital of Germany is Berlin.

example #8
The capital of Germany is Berlin.

example #9
The capital of Germany is Berlin.

example #10
The capital of Germany is Berlin.

example #11
The capital of Germany is Berlin.

example #12
The capital of Germany is Berlin.

example #13
The capital of Germany is Berlin.

example #14
The capital of Germany is Berlin.

example #15
The capital of Germany is Berlin.

example #16
The capital of Germany is Berlin.



## Case 4: Causal encoding, Q1 -> Q2

In [29]:
# Case 4: the questions are inserted sequentially — q1 with parent [system],
# then q2 with parents [system, q1] — and generation (compact=False) is
# conditioned on all three nodes.
workflow.reset()

system_message = {'role': 'system', 'content': 'Answer all of the questions.'}
(system,) = workflow.insert([{'message': system_message, 'parent_ids': []}])

q1_message = {'role': 'assistant', 'content': 'What is the capital of Germany?'}
(q1,) = workflow.insert([{'message': q1_message, 'parent_ids': [system]}])

q2_message = {'role': 'assistant', 'content': 'What is the smallest city in France?'}
(q2,) = workflow.insert([{'message': q2_message, 'parent_ids': [system, q1]}])

# One independent task dict per sample.
sampling_tasks = [
    {'parent_ids': [system, q1, q2], 'expects': ('assistant', None)}
    for _ in range(16)
]
tokens, ids, _ = workflow.step(
    tasks=sampling_tasks,
    prefill=True,
    compact=False,
    max_gen_len=128,
    temperature=0.6,
    top_p=1.0,
    seed=1,
)

# Decode and print each sampled continuation, numbered from 1.
for example_no, output in enumerate(tokens, start=1):
    print(f'example #{example_no}')
    print(workflow.tokenizer.decode(output))
    print()

example #1
I can answer multiple questions. What would you like to know?

example #2
The capital of Germany is Berlin.

The smallest city in France is Thyez, which has a population of 641 people.

example #3
The capital of Germany is Berlin.

The smallest city in France is Saint-Pierre, which is located on the island of Saint-Pierre in the South Atlantic Ocean. It has a population of around 5,000 and is an overseas collectivity of France.

example #4
The capital of Germany is Berlin.

I'm not aware of any information about the smallest city in France. If you could provide more context or clarify the question, I'll do my best to assist you.

What's your next question?

example #5
The capital of Germany is Berlin.

I don't have enough information to answer the question about the smallest city in France.

example #6
I'm happy to answer a question. However, I don't have any information about a specific question. If you provide a question, I'll do my best to provide a helpful answer.

If yo

## Case 5: Causal encoding, mask out Q1

In [31]:
# Case 5: q1 and q2 are inserted sequentially (q2 lists q1 as a parent), but
# the generation tasks list only [system, q2] as parents, leaving q1 out.
workflow.reset()

system_message = {'role': 'system', 'content': 'Answer all of the questions.'}
(system,) = workflow.insert([{'message': system_message, 'parent_ids': []}])

q1_message = {'role': 'assistant', 'content': 'What is the capital of Germany?'}
(q1,) = workflow.insert([{'message': q1_message, 'parent_ids': [system]}])

q2_message = {'role': 'assistant', 'content': 'What is the smallest city in France?'}
(q2,) = workflow.insert([{'message': q2_message, 'parent_ids': [system, q1]}])

# One independent task dict per sample; q1 is deliberately omitted.
sampling_tasks = [
    {'parent_ids': [system, q2], 'expects': ('assistant', None)}
    for _ in range(16)
]
tokens, ids, _ = workflow.step(
    tasks=sampling_tasks,
    prefill=True,
    compact=False,
    max_gen_len=128,
    temperature=0.6,
    top_p=1.0,
    seed=1,
)

# Decode and print each sampled continuation, numbered from 1.
for example_no, output in enumerate(tokens, start=1):
    print(f'example #{example_no}')
    print(workflow.tokenizer.decode(output))
    print()

example #1
The capital of France is the city of The city is Paris, but the smallest city in France is Béthune.

example #2
I can answer your questions.

The smallest city in France is Sarliève.

example #3
I'll answer your questions.

The smallest city in France is Tallinn

example #4
I can answer a question directly. The capital of France is Paris.

example #5
I can answer all of your questions.

example #6
The capital of France is a small town with a population of 4.

example #7
The capital and largest city in Europe

example #8
The capital of France? 

The answer is a required field. 

The answer is Bayonne.

example #9
Berlin is the Sister states

example #10
The capital of the world is Lezat.

example #11
I can answer all of the questions you have, but you didn't provide any questions. Please go ahead and ask your questions, and I'll do my best to provide helpful and accurate answers.

example #12
The capital and largest city in the world?

example #13
I will answer all your quest

## Case 6: Causal encoding, mask out Q2

In [32]:
# Case 6: q1 and q2 are inserted sequentially (q2 lists q1 as a parent), but
# the generation tasks list only [system, q1] as parents, leaving q2 out.
workflow.reset()

system_message = {'role': 'system', 'content': 'Answer all of the questions.'}
(system,) = workflow.insert([{'message': system_message, 'parent_ids': []}])

q1_message = {'role': 'assistant', 'content': 'What is the capital of Germany?'}
(q1,) = workflow.insert([{'message': q1_message, 'parent_ids': [system]}])

q2_message = {'role': 'assistant', 'content': 'What is the smallest city in France?'}
(q2,) = workflow.insert([{'message': q2_message, 'parent_ids': [system, q1]}])

# One independent task dict per sample; q2 is deliberately omitted.
sampling_tasks = [
    {'parent_ids': [system, q1], 'expects': ('assistant', None)}
    for _ in range(16)
]
tokens, ids, _ = workflow.step(
    tasks=sampling_tasks,
    prefill=True,
    compact=False,
    max_gen_len=128,
    temperature=0.6,
    top_p=1.0,
    seed=1,
)

# Decode and print each sampled continuation, numbered from 1.
for example_no, output in enumerate(tokens, start=1):
    print(f'example #{example_no}')
    print(workflow.tokenizer.decode(output))
    print()

example #1
The capital of Germany is Berlin.

example #2
Berlin is the capital of Germany.

example #3
The capital of Germany is Berlin.

example #4
The capital of Germany is Berlin.

example #5
The capital of Germany is Berlin.

example #6
The capital of Germany is Berlin.

example #7
The capital of Germany is Berlin.

example #8
The capital of Germany is Berlin.

example #9
The capital of Germany is Berlin.

example #10
The capital of Germany is Berlin.

example #11
The capital of Germany is Berlin.

example #12
The capital of Germany is Berlin.

example #13
The capital of Germany is Berlin.

example #14
The capital of Germany is Berlin.

example #15
The capital of Germany is Berlin.

example #16
The capital of Germany is Berlin.

