In [1]:
# Install the needed packages.
!pip install -q -U transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q -U bitsandbytes einops sentencepiece

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
# PEFT (Parameter-Efficient Fine-Tuning) is a package used for fine-tuning models.
# PEFT methods train only a subset of the pre-trained model's parameters while freezing the rest, which saves GPU VRAM.
from peft import PeftModel
# Import torch which is an open source ML library used for creating deep neural networks.
import torch
# The transformers library is used to download a pretrained model and a pretrained tokenizer.
# BitsAndBytesConfig holds the quantization settings that allow the model to be loaded into memory in 4-bit or 8-bit precision.
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, BitsAndBytesConfig

In [3]:
# Log in to the Hugging Face Hub. The CLI prompts interactively for an access
# token and asks whether to store it as a git credential.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /roo

In [4]:
# Hugging Face Hub id of the pretrained chat model to load.
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Quantization settings passed to bitsandbytes: 4-bit loading with the "nf4"
# quantization type, double quantization enabled, and float16 as compute dtype.
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)


# Load the pretrained model from huggingface and apply the quantization method from bitsandbytes config.
# 4-bit loading is requested only through `quantization_config`: passing
# `load_in_4bit=True` to `from_pretrained` as well duplicates the setting, and
# recent transformers releases reject the combination with a ValueError.
model = LlamaForCausalLM.from_pretrained(model_id,
                                         torch_dtype=torch.float16,
                                         quantization_config=config,
                                         device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Display the device that the loaded model reports.
model.device

device(type='cuda', index=0)

In [6]:
# Load the pretrained tokenizer published under the same `model_id` as the model.
tokenizer = LlamaTokenizer.from_pretrained(model_id)

In [7]:
# Add your question.
# This text is substituted into the instruction template stored in `PROMPT`.
custom_prompt = "What are the touristic places i could visit in Mexico?"

In [8]:
# Instruction-style template: a fixed preamble, the user's question under
# "### Instruction:", and an empty "### Response:" section for the model to fill.
PROMPT = (
    "Below is an instruction that describes a task. Write a short response that appropriately completes the request.\n"
    "\n"
    "\n"
    "### Instruction:\n"
    f"{custom_prompt}\n"
    "### Response:\n"
)
print("prompt has been initialised as:\n", PROMPT)

prompt has been initialised as:
 Below is an instruction that describes a task. Write a short response that appropriately completes the request.


### Instruction:
What are the touristic places i could visit in Mexico?
### Response:



In [9]:
# The Generation function.
%%time

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
print(input_ids)
print("now printing inputs instead:\n")
print(inputs)

tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  3273,  2933,   393,  7128,  2486,  1614,  2167,   278,
          2009, 29889,    13,    13,    13,  2277, 29937,  2799,  4080, 29901,
            13,  5618,   526,   278,  6282,  4695,  7600,   474,  1033,  6493,
           297, 12568, 29973,    13,  2277, 29937, 13291, 29901,    13]],
       device='cuda:0')
now printing inputs instead:

{'input_ids': tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  3273,  2933,   393,  7128,  2486,  1614,  2167,   278,
          2009, 29889,    13,    13,    13,  2277, 29937,  2799,  4080, 29901,
            13,  5618,   526,   278,  6282,  4695,  7600,   474,  1033,  6493,
           297, 12568, 29973,    13,  2277, 29937, 13291, 29901,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [10]:
# Decoding settings handed to `model.generate` further down.
# NOTE(review): `do_sample` is not enabled here and the generate output is a
# greedy-search result, so the sampling knobs below (temperature, top_p,
# typical_p, top_k) presumably have no effect -- confirm whether sampling was intended.
# Options left disabled: max_length, renormalize_logits, do_sample, num_beams,
# num_return_sequences, remove_invalid_values.
decoding_options = dict(
    temperature=1.0,
    top_p=1.0,
    typical_p=1.0,
    repetition_penalty=1.0,
    encoder_repetition_penalty=1.0,
    top_k=40,
)
generation_config = GenerationConfig(**decoding_options)
print("GENERATION CONFIG INITIALIZED...")


GENERATION CONFIG INITIALIZED...


In [11]:
# Generate up to 256 new tokens for the tokenized prompt.
# The tokenizer's attention mask is passed explicitly (moved to the same device
# as `input_ids`) so `generate` does not have to infer it from the input ids.
generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=inputs["attention_mask"].to(input_ids.device),
    generation_config=generation_config,
    return_dict_in_generate=True,
    output_scores=False,
    max_new_tokens=256,
)

# Show the raw output object, then each generated sequence decoded back to text
# (the decoded text includes the prompt, since the sequences start with it).
print(generation_output)
for s in generation_output.sequences:
    print(tokenizer.decode(s))

GreedySearchDecoderOnlyOutput(sequences=tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  3273,  2933,   393,  7128,  2486,  1614,  2167,   278,
          2009, 29889,    13,    13,    13,  2277, 29937,  2799,  4080, 29901,
            13,  5618,   526,   278,  6282,  4695,  7600,   474,  1033,  6493,
           297, 12568, 29973,    13,  2277, 29937, 13291, 29901,    13, 29924,
           735,  1417,   338,   263,  4234,   411,   263,  8261, 16375,   902,
         16639,   322,   263, 16984,  1737,  5275, 29892, 27032,   263,  9377,
          3464,   310,  6282,  4695,  7600,   304,  6493, 29889,  3834,   310,
           278,  1556,  5972, 15422,   800,  3160,   278, 12297,  2610,   273,
          5796,  1144,   310, 27415,   398,   322,   678, 14487,   739,  1362,
         29892,   278,   325,  4626,   424,  4272,   310, 12568,  4412,   411,
           967, 22879,  4818,   322,  3186, 29899,  1990, 19133, 29879, 29892,
           3

In [12]:
# Follow-up question; replaces the earlier value of `custom_prompt`.
# NOTE(review): the question refers to "attractions that you listed", but the
# prompt template built from `custom_prompt` contains only this text -- the
# earlier answer is not part of it. Confirm this is intended.
custom_prompt = "Out of these attractions that you listed out for Mexico, which are the most peaceful ones, meaning less crowded ones?"

In [13]:
# Rebuild the instruction-style template around the current `custom_prompt`:
# fixed preamble, the question under "### Instruction:", then an empty
# "### Response:" section for the model to complete.
PROMPT = (
    "Below is an instruction that describes a task. Write a short response that appropriately completes the request.\n"
    "\n"
    "\n"
    "### Instruction:\n"
    f"{custom_prompt}\n"
    "### Response:\n"
)
print("prompt has been initialised as:\n", PROMPT)

prompt has been initialised as:
 Below is an instruction that describes a task. Write a short response that appropriately completes the request.


### Instruction:
Out of these attractions that you listed out for Mexico, which are the most peaceful ones, meaning less crowded ones?
### Response:



In [14]:
# The Generation function.
%%time

inputs = tokenizer(
    PROMPT,
    return_tensors="pt",
)
input_ids = inputs["input_ids"].cuda()
print(input_ids)
print("now printing inputs instead:\n")
print(inputs)

tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  3273,  2933,   393,  7128,  2486,  1614,  2167,   278,
          2009, 29889,    13,    13,    13,  2277, 29937,  2799,  4080, 29901,
            13,  3744,   310,  1438, 19650,  1953,   393,   366,  9904,   714,
           363, 12568, 29892,   607,   526,   278,  1556, 10776,  1319,  6743,
         29892,  6593,  3109, 11660,  7176,  6743, 29973,    13,  2277, 29937,
         13291, 29901,    13]], device='cuda:0')
now printing inputs instead:

{'input_ids': tensor([[    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  3273,  2933,   393,  7128,  2486,  1614,  2167,   278,
          2009, 29889,    13,    13,    13,  2277, 29937,  2799,  4080, 29901,
            13,  3744,   310,  1438, 19650,  1953,   393,   366,  9904,   714,
           363, 12568, 29892,   607,   526,   278,  1556, 10776,  1319,  6743,
         29892,  6593,  3109, 11660,  

In [16]:
# Generate up to 512 new tokens for the tokenized follow-up prompt.
# The tokenizer's attention mask is passed explicitly (moved to the same device
# as `input_ids`) so `generate` does not have to infer it from the input ids.
generation_output = model.generate(
    input_ids=input_ids,
    attention_mask=inputs["attention_mask"].to(input_ids.device),
    generation_config=generation_config,
    return_dict_in_generate=True,
    output_scores=False,
    max_new_tokens=512,
)


# Decode each generated sequence back to text (the prompt is included, since
# the sequences start with it).
for s in generation_output.sequences:
    print(tokenizer.decode(s))

<s> Below is an instruction that describes a task. Write a short response that appropriately completes the request.


### Instruction:
Out of these attractions that you listed out for Mexico, which are the most peaceful ones, meaning less crowded ones?
### Response:
Based on my research, the most peaceful and less crowded attractions in Mexico are the ancient Mayan ruins of Tulum and Coba. Both of these sites offer a serene and tranquil atmosphere, with fewer tourists compared to more popular destinations like Cancun and Playa del Carmen. Tulum is located on the Caribbean coast and features a stunning beach and well-preserved ruins, while Coba is a remote jungle site with a vast network of ancient roads and temples. Both of these destinations offer a more authentic and less crowded experience for travelers looking to explore Mexico's rich cultural heritage.</s>
