In [1]:
import pprint
import os

# Load Model

In [28]:
from torch import cuda, bfloat16
import transformers

# Checkpoint to load from the Hugging Face Hub.
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Device label: used in the status message below and, in a later cell,
# to place the stop-token tensors.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, you need an access token
# The token is read from the environment so that it is never stored in the
# notebook. Set HF_TOKEN (or HUGGING_FACE_HUB_TOKEN) before starting the kernel.
# If neither is set, hf_auth is None and the Hub client is expected to fall
# back to locally cached credentials (e.g. from `huggingface-cli login`).
# NOTE: a token that was previously committed in this cell should be revoked.
hf_auth = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


In [29]:
# Fetch the tokenizer for the same checkpoint, reusing the same credentials
# that were used to load the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [30]:
# Text markers that should terminate a generation.
stop_list = ['\nHuman:', '\n```\n']

# Token ids for each marker, one list per marker.
# NOTE(review): the recorded output shows every list starting with id 1, which
# looks like the tokenizer's BOS token. If so, these sequences cannot match the
# tail of generated text and the stopping criteria never fire. Consider
# tokenizing with add_special_tokens=False and confirm the resulting ids.
stop_token_ids = []
for marker in stop_list:
    stop_token_ids.append(tokenizer(marker)['input_ids'])
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

In [31]:
import torch

# Turn each list of stop ids into a LongTensor on `device`.
# The name stop_token_ids is rebound here, so this cell expects the plain id
# lists produced by the previous cell as its input.
stop_tensors = []
for ids in stop_token_ids:
    stop_tensors.append(torch.LongTensor(ids).to(device))
stop_token_ids = stop_tensors
stop_token_ids

[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [32]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stop generation when the sequence ends with one of the stop sequences.

    The stop sequences are read from the module-level ``stop_token_ids`` list
    of 1-D tensors. Only the first sequence in ``input_ids`` is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the first sequence ends with any stop sequence.

        Args:
            input_ids: token ids generated so far; row 0 is the one checked.
            scores: unused; accepted to match the StoppingCriteria call signature.

        Returns:
            True when the tail of ``input_ids[0]`` equals a stop sequence,
            otherwise False.
        """
        sequence = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # A sequence shorter than the stop sequence cannot end with it.
            # Without this guard the slice below is shorter than stop_ids and
            # torch.eq raises a shape-mismatch error. Empty stop sequences are
            # skipped for the same reason ([-0:] would select everything).
            if stop_len == 0 or len(sequence) < stop_len:
                continue
            if torch.eq(sequence[-stop_len:], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [33]:
# Text-generation pipeline around the quantized model and its tokenizer.
# NOTE(review): temperature normally only takes effect when sampling is
# enabled (do_sample=True); confirm whether sampling was intended here.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # the returned text includes the prompt
    task='text-generation',
    stopping_criteria=stopping_criteria,  # custom stop-sequence check
    temperature=0.1,
    max_new_tokens=512,  # upper bound on generated tokens per call
    repetition_penalty=1.1
)

In [34]:
# Smoke-test the pipeline with a short prompt; the result is a list that is
# indexed as res[0]["generated_text"] in the next cell.
res = generate_text("The quick brown fox")

In [35]:
# Show the text for the first result; the pipeline was built with
# return_full_text=True, so the prompt is part of the printed text.
print(res[0]["generated_text"])

The quick brown fox jumps over the lazy dog. Hinweis: This is a 50-letter word, so it's a good one to practice!

Answer: The quick brown fox jumps over the lazy dog.


In [40]:
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline so it can be called through LangChain.
llm = HuggingFacePipeline(pipeline=generate_text)

# Run a single completion through the wrapper.
completion_prompt = "Complete the following text: \nThe quick brown fox ..."
out = llm(prompt=completion_prompt)


In [41]:
# Display the completion returned by the LangChain wrapper.
print(out)


 nobody knows the exact origin of this pangram, but it has been used as a demonstration of a language's range and complexity since at least the 19th century. It is often cited as an example of a sentence that uses all the letters of the alphabet at least once.

The word "pangram" comes from the Greek words "pan" meaning "all" and "gramma" meaning "letter". A pangram is a sentence or phrase that uses every letter of the alphabet at least once. Pangrams are often used in linguistics and language learning to demonstrate the range and complexity of a language.

Here are some examples of pangrams in different languages:

* English: The quick brown fox jumps over the lazy dog.
* Spanish: El rápido perro gris saltó sobre el perro lento.
* French: Le chien rapide brun saute sur le chien lent.
* German: Der schnelle braune Hund springt über den langen Hund.
* Chinese (Mandarin): 快速的灰色狗跳过懒犬。
* Japanese: 速い灰色犬は、懶犬を跳び越します。

Pangrams can be found in many languages, and they are often used as a too

# Zero-shot Summarization

In [42]:
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

In [43]:
# Zero-shot summarization prompt; the text to summarize replaces {document}.
prompt_template = """Write a concise summary of the following:
"{document}"
CONCISE SUMMARY:"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["document"],
)


In [45]:
# Token count of the fully formatted summarization prompt.
# NOTE(review): validation_sample is not defined in any cell visible in this
# notebook; it must be loaded (with a 'document' field) before this cell runs.
llm.get_num_tokens(prompt.format(document=validation_sample['document']))

506

In [46]:
# Pretty-print the formatted prompt to inspect exactly what the model will see.
# NOTE(review): validation_sample is not defined in any cell visible in this
# notebook; it must be loaded (with a 'document' field) before this cell runs.
pprint.pprint(prompt.format(document=validation_sample['document']))

('Write a concise summary of the following:\n'
 '"This is very important, as it will protect your hair from heat damage. '
 'Heat-damaged hair can often look dry, frizzy, or ratted. To prevent this '
 'from happening, apply a heat protectant spray to your hair, focusing on the '
 'ends. You should have the bottom third or bottom fourth of your hair loose. '
 'The thicker you hair is, the more sections you will need to work with. Take '
 'a small section of hair, and place a round brush under it, as close to your '
 'roots as you can. Turn on your hairdryer, and place it right over the brush. '
 'Keep the hairdryer directly over the brush at all times. Do not pull the '
 'brush completely past the ends of your hair. If your hair is not smooth, '
 'pull the brush completely past your hair and start again. When your hair is '
 'smooth, rotate the brush back towards your scalp, rolling the hair around '
 'it. Leave the brush in your hair as it cools. Once it is cool to the touch, '
 'caref

In [47]:
# Generate the summary for the sample document.
# NOTE(review): validation_sample is not defined in any cell visible in this
# notebook; it must be loaded (with a 'document' field) before this cell runs.
summary_prompt = prompt.format(document=validation_sample['document'])
out = llm(prompt=summary_prompt)

In [48]:
# Pretty-print the generated summary (wrapped across lines for readability).
pprint.pprint(out)

('\n'
 'To prevent heat damage when straightening your hair, apply a heat protectant '
 'spray to the ends of your hair and focus on the lower sections. Use a round '
 'brush and a hairdryer to straighten your hair in small sections, keeping the '
 'brush close to your roots. Repeat the process until all of your hair is '
 'straightened, then use a hair straightener to add extra volume.')
