### `Import`

In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, \
    BitsAndBytesConfig, GenerationConfig
from accelerate.test_utils.testing import get_backend
from langchain_huggingface.llms import HuggingFacePipeline

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

## `Way1: through pipeline method`

### `Loading and Setting Model & Tokenizer`

In [3]:
# Automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
device, _, _ = get_backend()
# Define the base model directory and model ID
MODEL_DIR = '/home/zerothweek/llm/models'
model_id = 'Llama-3.2-1B-Instruct'
# Construct the full path to the model
model_path = os.path.join(MODEL_DIR, model_id)


quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_comput_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto",
    torch_dtype=torch.float16,
    #quantization_config = quantization_config,
    trust_remote_code=False,

)
# Loading and Setting tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_path, 
    #padding_side="left", #!Don't need a padding_side since it's a single input
)
tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default

### `Making a HuggingFacePipeline`

In [None]:
#!We set all the parameters that used to belong from "generation_config"
#!TODO: can't find where to put the parameters that belong to tokenizer() eg. add_special_tokens, padding, truncation etc..

pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
    
    max_new_tokens=200, 
    do_sample=True,
    bos_token_id=[128000], 
    eos_token_id=[128001, 128008, 128009], 
    temperature=0.6, 
    top_p=0.9)
llm = HuggingFacePipeline(pipeline=pipe, batch_size=4)#!TODO: need to check where the batch size method belongs

Device set to use cpu


### `making Template & Prompt`

In [5]:
template = \
'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 28 Mar 2025

You are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>'''
prompt = PromptTemplate.from_template(template)

### ` defining the chain`

In [1]:
chain.invoke('hi').content

NameError: name 'chain' is not defined

## `Way2: `

### `Settings`

In [10]:
# Automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
device, _, _ = get_backend()
# Define the base model directory and model ID
MODEL_DIR = '/home/zerothweek/llm/models'
model_id = 'Llama-3.2-1B-Instruct'
# Construct the full path to the model
model_path = os.path.join(MODEL_DIR, model_id)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_comput_dtype=torch.bfloat16
)


In [11]:

model = AutoModelForCausalLM.from_pretrained(
    model_path, 
    device_map="auto",
    torch_dtype=torch.float16,
    #quantization_config = quantization_config,
    trust_remote_code=False,

)
# Loading and Setting tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_path, 
    padding_side="left", # Actually Don't need a padding_side since it's a single input
)
tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default

In [12]:
tokenizer.eos_token_id

128009

In [13]:
llm = HuggingFacePipeline.from_model_id(
    model_id=model_path,
    task="text-generation",
    #back_end=
    #device=
    device_map="auto",  
    batch_size=4,

    #!MOB: handles the parmeters that belong to model
    model_kwargs=dict(
        # model parameters(excep model_id and device_map)
        trust_remote_code=False,
        torch_dtype=torch.float16,
            #quantization_config = quantization_config,

    ),
    #!TODO: can't handle parameters used when initializing a tokenizer eg. padding_side=True(THIS WASN'T A PROBLEM FOR WAY1)
    #!TODO: can't find where to put the parameters that belong to tokenizer() eg. add_special_tokens, padding, truncation etc..(THIS WAS A PROBLEM ALSO FOR WAY1)
    # generation parameters use to belong in the generation_config file from the basic pipeline 
    pipeline_kwargs=dict(
        max_new_tokens=200, 
        do_sample=True,
        bos_token_id=[128000], 
        eos_token_id=[128001, 128008, 128009], 
        temperature=0.6, 
        top_p=0.9,
            #repetition_penalty=1.03,
    ),
    
)


Device set to use cpu


In [14]:
chain = prompt | llm

In [15]:
chain.invoke('hi')

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 28 Mar 2025\n\nYou are a pirate chatbot who always responds in pirate speak!<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nhi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nArrr, ye be wantin' to have a swashbucklin' conversation, eh? Alright then, matey! What be bringin' ye to these fair waters? Treasure huntin', or just lookin' fer a good tale to tell?"

<b>MANUALLY FIXED A PROBLEM FROM. 
https://github.com/langchain-ai/langchain/commit/aeb42dc9004cd650fe348ffa78f6f3a0a8a6f15e