In [20]:
import torch
from models.common import RangeWeight
# from models.modeling_mistral import MistralForCausalLM
# from models.modeling_llama_442 import LlamaForCausalLM
from models.modeling_llama import LlamaForCausalLM

from transformers import AutoTokenizer

In [21]:
# Checkpoint under test. To run against a local copy instead, swap in the
# commented-out path below.
# model_name = "/data/hf_models/Llama-3.1-8B-Instruct"
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# The tokenizer is the stock Hugging Face one; the model class is the
# project-local LlamaForCausalLM imported from models.modeling_llama.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = LlamaForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.bfloat16
)

# Dump the resolved configuration so the run records exactly what was loaded.
print(model.config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 128256
}



In [80]:
# Two documents that contradict each other, followed by a question that forces
# the model to choose between them.
prompt = """Answer the following question based on the documents provided.
Document 1: Today, the weather is raining.
Document 2: Today, the weather is sunny.
Question: What is the weather like today?"""
messages = [{"role": "user", "content": prompt}]

# Render the conversation to plain text with the chat template (ending with the
# generation prompt), then tokenize it. The rendered text already contains the
# template's special tokens, hence add_special_tokens=False.
chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(chat_prompt, add_special_tokens=False, return_tensors="pt")
inputs = inputs.to(model.device)

In [82]:
# Compare the prompt length in characters with the templated input length in
# tokens, then show the raw token ids and their decoded text. Positions in this
# tensor are what the RangeWeight start/end values index into.
prompt_token_ids = inputs["input_ids"][0]
print(len(prompt))
print(len(prompt_token_ids))
print(prompt_token_ids)
print(tokenizer.decode(prompt_token_ids))

188
51
tensor([128000, 128006,    882, 128007,    271,  16533,    279,   2768,   3488,
          3196,    389,    279,   9477,   3984,    627,   7676,    220,     16,
            25,  11450,     11,    279,   9282,    374,  84353,    627,   7676,
           220,     17,     25,  11450,     11,    279,   9282,    374,  40798,
           627,  14924,     25,   3639,    374,    279,   9282,   1093,   3432,
            30, 128009, 128006,  78191, 128007,    271], device='cuda:0')
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Answer the following question based on the documents provided.
Document 1: Today, the weather is raining.
Document 2: Today, the weather is sunny.
Question: What is the weather like today?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [24]:
# Baseline: deterministic (do_sample=False) generation with no range_weights, for
# comparison against the attention-scaled runs.
outputs = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=140,
    pad_token_id=tokenizer.eos_token_id,
)

# Keep only the newly generated tokens: the returned sequence starts with the
# prompt tokens.
prompt_length = inputs["input_ids"].shape[-1]
response = outputs[0][prompt_length:]

print("Model output WITHOUT attention scaling:")
# Remove the end-of-turn marker with skip_special_tokens instead of slicing off
# the last token unconditionally: when generation stops at max_new_tokens there
# is no trailing marker, and a blind [:-1] would cut a real token off the answer.
print(tokenizer.decode(response, skip_special_tokens=True))



Model output WITHOUT attention scaling:
A classic example of a conflicting information problem!

Since we have two documents with different information about the weather, we can't determine the correct answer based on these documents alone. We need more information or a way to resolve the conflict.

In this case, I would say that the answer is "Unknown" or "Inconclusive" because we have contradictory information.


In [25]:
# Token spans (indices into inputs["input_ids"][0]) and the weight assigned to
# each span.
range_weights = [
    RangeWeight(15, 26, 1),  # decodes to the "Document 1" line
    RangeWeight(26, 37, 0),  # decodes to the "Document 2" line
    # Alternative boundaries, currently disabled:
    # RangeWeight(15, 28, 1),
    # RangeWeight(28, 41, 0),
]

# Decode every span so the boundaries can be checked by eye before generating.
for span in range_weights:
    print(
        f"Giving the following text a weight of {span.weight}:",
        tokenizer.decode(inputs["input_ids"][0][span.start : span.end]),
        "---",
        sep="\n",
    )

Document
Giving the following text a weight of 1:
Document 1: Today, the weather is raining.

---
Giving the following text a weight of 0:
Document 2: Today, the weather is sunny.

---


In [19]:
# Same deterministic decoding as the baseline, this time passing range_weights
# through generate() (presumably consumed by the project-local model class to
# scale attention over those spans — see models.modeling_llama).
outputs = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=120,
    pad_token_id=tokenizer.eos_token_id,
    range_weights=range_weights,
)

# Only the final token is dropped; the prompt tokens are kept, so the decoded
# text below shows the prompt followed by the answer.
full_sequence = outputs[0]
response = full_sequence[:-1]
print("Model output WITH attention scaling:", tokenizer.decode(response), sep="\n")

Model output WITH attention scaling:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Answer the following question based on the documents provided.
Document 1: Today, the weather is raining.
Document 2: Today, the weather is sunny.
Question: What is the weather like today?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A simple question!

Since there are only two documents, and one of them says "Today, the weather is raining", I would conclude that the weather is indeed raining today.

So, the answer is: The weather is raining.


### New Test Case

In [67]:
# Second scenario: the same two contradictory documents, but the instruction is
# to repeat them rather than answer a question.
prompt = """
Document 1: Today, the weather is raining.
Document 2: Today, the weather is sunny.
Repeat the above content"""
messages = [{"role": "user", "content": prompt}]

# Render with the chat template, tokenize without adding further special tokens,
# and move the tensors to the model's device.
chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(chat_prompt, add_special_tokens=False, return_tensors="pt")
inputs = inputs.to(model.device)

# Character count of the prompt, token count of the templated input, and the raw
# token ids.
prompt_token_ids = inputs["input_ids"][0]
print(len(prompt))
print(len(prompt_token_ids))
print(prompt_token_ids)
# Decodes only the single token at position 3, not the whole sequence.
print(tokenizer.decode(prompt_token_ids[3]))

109
36
tensor([128000, 128006,    882, 128007,    271,   7676,    220,     16,     25,
         11450,     11,    279,   9282,    374,  84353,    627,   7676,    220,
            17,     25,  11450,     11,    279,   9282,    374,  40798,    627,
         39818,    279,   3485,   2262, 128009, 128006,  78191, 128007,    271],
       device='cuda:0')
<|end_header_id|>


In [78]:
# Token spans for the repeat prompt and the weight given to each span.
# NOTE(review): the recorded output shows the boundary at 17 falls one token
# late — the first span ends with "Document" and the second starts at " 2:".
# The commented-out (5, 16) / (16, 27) pair looks like the document-aligned
# split; confirm which one is intended.
range_weights = [
    # RangeWeight(5, 16, 1),
    # RangeWeight(16, 27, 0),
    RangeWeight(5, 17, 1),
    RangeWeight(17, 27, 0),
]

# Decode every span so the boundaries can be checked by eye before generating.
for span in range_weights:
    print(
        f"Giving the following text a weight of {span.weight}:",
        tokenizer.decode(inputs["input_ids"][0][span.start : span.end]),
        "---",
        sep="\n",
    )

Giving the following text a weight of 1:
Document 1: Today, the weather is raining.
Document
---
Giving the following text a weight of 0:
 2: Today, the weather is sunny.

---


In [79]:
# Deterministic decoding of the repeat prompt with the range_weights defined for
# it passed through generate().
outputs = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=120,
    pad_token_id=tokenizer.eos_token_id,
    range_weights=range_weights,
)

# Drop only the last token; the prompt stays in, so the printout contains the
# prompt followed by the model's answer.
full_sequence = outputs[0]
response = full_sequence[:-1]
print("Model output WITH attention scaling:", tokenizer.decode(response), sep="\n")

Model output WITH attention scaling:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Document 1: Today, the weather is raining.
Document 2: Today, the weather is sunny.
Repeat the above content<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here is the repeated content:

Document 1: Today, the weather is raining.
Document 1: Today, the weather is raining.


In [68]:
# Control run for the repeat prompt: range_weights is explicitly None, so no
# spans are passed to generate().
outputs = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=120,
    pad_token_id=tokenizer.eos_token_id,
    range_weights=None,
)

# Drop only the last token; the prompt stays in, so the printout contains the
# prompt followed by the model's answer.
response = outputs[0][:-1]

# This run passes no range weights, so it is the WITHOUT-scaling case; the label
# previously said "WITH", which made the two repeat-prompt printouts
# indistinguishable.
print("Model output WITHOUT attention scaling:")
print(tokenizer.decode(response))

Model output WITH attention scaling:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Document 1: Today, the weather is raining.
Document 2: Today, the weather is sunny.
Repeat the above content<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here are the repeated documents:

Document 1: Today, the weather is raining.

Document 2: Today, the weather is sunny.
