# Llama 3 Inference Using Hugging Face and LangChain

In [19]:
import locale

# Force the reported preferred encoding to UTF-8 for the rest of the session.
# NOTE(review): presumably the usual Colab workaround for shell/pip commands
# failing when the locale is not UTF-8 -- confirm it is still needed.
# The replacement accepts the same optional `do_setlocale` argument as the
# real function so callers using `locale.getpreferredencoding(False)` keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [56]:
pip -q install accelerate transformers bitsandbytes peft langchain-huggingface langchain langchain-community tavily-python

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

### Using the regular Hugging Face LangChain pipeline

In [23]:
from datasets import *
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          pipeline)
import torch
from langchain_huggingface import HuggingFacePipeline

In [3]:
# Authenticate against the Hugging Face Hub from inside the notebook.
# The call renders an interactive login widget as the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Single source of truth for the checkpoint so the model and tokenizer
# can never be loaded from different repositories by accident.
MODEL_ID = "sun000090/Llama_3_8B"

# 8-bit weight quantization via bitsandbytes. Passing `load_in_8bit=True`
# directly to `from_pretrained` is deprecated in favour of an explicit
# `BitsAndBytesConfig` handed over through `quantization_config`.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # dtype for the modules that are not quantized
    quantization_config=quantization_config,
    device_map="auto",  # let accelerate place the weights on the available devices
)

# NOTE(review): `trust_remote_code=True` allows Python code shipped in the
# model repository to run locally. It is kept to preserve behaviour, but the
# download log only shows standard tokenizer files, so it is probably
# unnecessary -- consider removing it after confirming.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [29]:
# Options for the text-generation pipeline, gathered in one place so they are
# easy to tweak: output length cap, sampling controls, and whether the prompt
# is echoed back in the returned text.
pipeline_options = dict(
    max_new_tokens=128,
    do_sample=True,
    return_full_text=False,
    temperature=0.5,
    top_p=0.5,
    top_k=20,
)

# Build the pipeline around the model and tokenizer loaded above.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **pipeline_options,
)

In [30]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be called with `.invoke(...)` and composed with prompts using `|`.
llm = HuggingFacePipeline(pipeline=pipe)

In [31]:
# Smoke test: send a raw prompt string straight to the wrapped pipeline.
# NOTE(review): `<s>[INST] ... [/INST]` looks like Llama 2 / Mistral style
# instruction markup; the recorded output for this cell only continues with
# more `[INST]` turns, which suggests this checkpoint does not treat it as a
# chat format. TODO confirm the expected prompt format for this model (e.g.
# via the tokenizer's chat template) before relying on this cell.
llm.invoke("<s>[INST] Hi How are you? [/INST]")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Both `max_new_tokens` (=128) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


" <s>\n<s>[INST] I'm doing well, thanks. How about you? [/INST] <s>\n<s>[INST] I'm good, thanks. So, what brings you here today? [/INST] <s>\n<s>[INST] I'm just browsing around. I saw your store and thought I'd come in and take a look. [/INST] <s>\n<s>[INST] Ah, great! We have a lot of interesting items in here. Let me show you some of our most popular products. [/INST] <s>\n<s>[INST] That sounds great, thank you. [/INST]"

### Chaining the conversation

In [32]:
from langchain_core.prompts import PromptTemplate

In [40]:
# Prompt for the plain (no retrieval) chain. `{question}` is the only
# placeholder and is filled in when the chain is invoked.
template = """System: You're a helpful conversation chatbot. Keep your response to a limit of 100 words only.
Question: {question}
Answer:"""

In [43]:
# Turn the raw template string into a LangChain PromptTemplate; its input
# variables are taken from the `{...}` placeholders in the string.
prompt = PromptTemplate.from_template(template)

In [44]:
# Compose prompt and model: the rendered prompt is piped into the LLM.
chain = prompt | llm

In [45]:
# Question sent to the prompt -> LLM chain (the original text was missing its verb).
question = "Do you know about yourself?"

In [46]:
# Streaming: print each piece of the answer as soon as the chain yields it,
# without newlines in between, flushing so the text appears immediately.
answer_stream = chain.stream(question)
for piece in answer_stream:
    print(piece, end="", flush=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Both `max_new_tokens` (=128) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


 Hi there! I'm a helpful conversation chatbot, designed to assist and provide information on a wide range of topics. I'm constantly learning and improving my responses to better serve users like you. I don't have personal experiences or emotions, but I'm here to help answer your questions and provide helpful insights. What can I help you with today?

### Using Tavily

In [52]:
import os
from getpass import getpass

# Never write the API key into the notebook itself: reuse the value already
# present in the environment, otherwise ask for it interactively (the input
# is not echoed and does not end up in the saved notebook).
if not os.environ.get("TAVILY_API_KEY"):
    os.environ["TAVILY_API_KEY"] = getpass("Tavily API key: ")

In [60]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.retrievers import TavilySearchAPIRetriever

In [61]:
# Tavily-backed web search retriever used to fetch context for the question.
# NOTE(review): `k=3` presumably limits the number of results returned per
# query -- confirm against the retriever documentation.
retriever = TavilySearchAPIRetriever(k=3)

In [70]:
# Prompt for the retrieval chain. `{context}` receives the search results and
# `{question}` the user's question; both are filled in at invoke time.
template = """System: You're a helpful conversation chatbot. Keep your response to a limit of 100 words only.
Context: {context}
Question: {question}
Answer:"""

In [71]:
# Build a chat prompt from the template string; this rebinds `prompt`, which
# previously held the PromptTemplate of the plain chain.
prompt = ChatPromptTemplate.from_template(template)

In [72]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt would receive the raw retriever result, so
    the `{context}` slot would be filled with the object repr (including
    metadata) instead of just the page text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieval chain:
#   1. keep the incoming dict and add a `context` entry, obtained by sending
#      the question to the retriever and flattening the hits to plain text;
#   2. render the prompt from `context` and `question`;
#   3. run the LLM and parse its output to a plain string.
chain = (
    RunnablePassthrough.assign(
        context=(lambda x: x["question"]) | retriever | _format_docs
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [73]:
# Question for the retrieval chain; it is used both as the search query and
# in the prompt's `{question}` slot.
question = "How is overall weather?"

In [76]:
# Run the retrieval chain once. The input must be a dict with a "question"
# key because the chain's first step reads x["question"].
chain.invoke({'question':question})

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Both `max_new_tokens` (=128) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


' The overall weather is mostly cloudy with a high temperature of 79.7°F (26.5°C) and a low of 67°F (19.4°C). There is a chance of patchy rain nearby and a UV index of 6.0. The wind is blowing at 4.3 km/h (2.7 mph) from the east-northeast direction. The humidity is 88% and the cloud cover is 82%. The weather conditions are mostly cloudy with a chance of patchy rain nearby. The UV index is 6.0, indicating a moderate risk of sunburn. The wind is'