#### Building a chain from a local LLM

In [19]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, \
    BitsAndBytesConfig, GenerationConfig
from accelerate.test_utils.testing import get_backend
from langchain_huggingface.llms import HuggingFacePipeline

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [20]:
# Build a LangChain chain (prompt -> local Hugging Face LLM -> string parser)
# around a Llama 3.2 1B Instruct checkpoint stored on local disk.

# Detect the underlying device type (CUDA, CPU, XPU, MPS, etc.).
# NOTE(review): `device` is not referenced again in this cell; placement is
# delegated to device_map="auto" below.
device, _, _ = get_backend()

# Base model directory and model ID, joined into the full checkpoint path.
MODEL_DIR = '/home/zerothweek/llm/models'
model_id = 'Llama-3.2-1B-Instruct'
model_path = os.path.join(MODEL_DIR, model_id)

# 4-bit NF4 quantization settings with double quantization.
# The keyword is `bnb_4bit_compute_dtype`; the previous spelling
# (`bnb_4bit_comput_dtype`) was not a recognised option, so the intended
# bfloat16 compute dtype was never applied.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the model in float16. Quantization is currently disabled: the
# `quantization_config` line stays commented out, so the config above is
# built but not passed to the model.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.float16,
    #quantization_config=quantization_config,
    trust_remote_code=False,
)

# Load the tokenizer. No padding_side is set because the chain is invoked
# with a single input at a time.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    #padding_side="left",
)
tokenizer.pad_token = tokenizer.eos_token  # Most LLMs don't have a pad token by default

# Generation parameters that would otherwise live in a GenerationConfig are
# passed straight to the pipeline.
# TODO: find where tokenizer() call arguments (add_special_tokens, padding,
# truncation, ...) should be supplied.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    do_sample=True,
    bos_token_id=[128000],
    eos_token_id=[128001, 128008, 128009],
    temperature=0.6,
    top_p=0.9,
)

# TODO: check which layer actually consumes batch_size.
llm = HuggingFacePipeline(pipeline=pipe, batch_size=4)

# Hand-written Llama 3 chat prompt with a fixed system message and a single
# `user_text` placeholder.
# NOTE(review): the reference Llama 3 prompt format puts a blank line after the
# final assistant header; the template is left unchanged here - confirm
# whether it should be added.
template = \
'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 28 Mar 2025

You are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'''
prompt = PromptTemplate.from_template(template)

# skip_prompt=True is bound onto the LLM call so it applies on every invoke.
# NOTE(review): `output_key` does not appear to be a documented StrOutputParser
# field and is presumably ignored - confirm and drop it if so.
chain = prompt | llm.bind(skip_prompt=True) | StrOutputParser(output_key='content')

Device set to use cpu


In [21]:
# Ask the chain whether it reports itself as LangChain-serializable;
# the result is displayed as the cell output.
chain.is_lc_serializable()

True

### InMemoryCache

In [13]:
%%time
from langchain.globals import set_llm_cache
from langchain.cache import InMemoryCache

# 인메모리 캐시를 사용합니다.
set_llm_cache(InMemoryCache())

# 체인을 실행합니다.
response = chain.invoke("hi cap")
print(response)




Yer lookin' fer some swashbucklin' chat, eh? Well, matey, I be ready fer some treasure-filled conversations! What be bringin' ye to these fair waters?
CPU times: user 41.5 s, sys: 28.9 ms, total: 41.5 s
Wall time: 5.19 s


In [18]:
%%time
# Run the chain again with the same input as the InMemoryCache cell.
# NOTE(review): the recorded output for this cell is a fresh generation rather
# than a cache hit; the execution counters suggest it ran after the SQLite
# cache cell replaced the global cache - re-run top to bottom to confirm.
response = chain.invoke("hi cap")
print(response)




Arrr, ye be lookin' fer a chat, eh? Alright then, matey! What be bringin' ye to these fair waters?
CPU times: user 31.9 s, sys: 15 ms, total: 31.9 s
Wall time: 4.03 s


### SQLite Cache

In [15]:
from langchain_community.cache import SQLiteCache
from langchain_core.globals import set_llm_cache
import os

# Location of the on-disk LLM cache.
CACHE_DIR = "cache"
CACHE_DB_PATH = os.path.join(CACHE_DIR, "llm_cache_example.db")

# Create the cache directory if needed. exist_ok=True makes this idempotent
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(CACHE_DIR, exist_ok=True)

# Install a SQLite-backed LLM cache as the global cache.
set_llm_cache(SQLiteCache(database_path=CACHE_DB_PATH))


In [16]:
%%time 
# Run the chain; %%time reports how long this call takes.
response = chain.invoke("hi")
print(response)




Arrr, ye be lookin' fer some chat, eh? Well, I be happy to be havin' a swashbucklin' conversation wit ye! What be bringin' ye to these fair waters?
CPU times: user 42.4 s, sys: 25 ms, total: 42.4 s
Wall time: 5.31 s


In [17]:
%%time 
# Run the chain again with the same input as the previous cell; the recorded
# output below shows the identical response returned in milliseconds.
response = chain.invoke("hi")
print(response)



Arrr, ye be lookin' fer some chat, eh? Well, I be happy to be havin' a swashbucklin' conversation wit ye! What be bringin' ye to these fair waters?
CPU times: user 2.32 ms, sys: 11 μs, total: 2.33 ms
Wall time: 1.88 ms
