In [3]:
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, GenerationConfig)
import torch

In [4]:
from dotenv import load_dotenv
from huggingface_hub import login
import os

load_dotenv()  # Loads from .env file

if token := os.getenv("HF_TOKEN"):
    login(token=token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Smaller models that fit more easily
model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Or try these alternatives:
# model_name = "microsoft/phi-2"                   # 2.7B params
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # 1.1B params
# model_name = "Qwen/Qwen2-1.5B"                   # 1.5B params

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [13]:
generation_config = GenerationConfig(
    num_beams=4,
    early_stopping=True,
    max_new_tokens=300,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

seed_sentence = "Explain the theory of relativity in simple terms."

model_inputs = tokenizer(
    [seed_sentence], return_tensors="pt").to(device)
generated_ids = model.generate(
    **model_inputs, generation_config=generation_config)
generated_text = tokenizer.batch_decode(
    generated_ids, skip_special_tokens=True)[0]
print(generated_text)

Explain the theory of relativity in simple terms.

The theory of relativity is a scientific explanation of how space and time are connected. It was first proposed by Albert Einstein in the early 20th century.

There are two parts to the theory of relativity: the special theory of relativity and the general theory of relativity.

The special theory of relativity states that the laws of physics are the same for all observers moving at a constant speed in a straight line. It also says that the speed of light is always the same, no matter how fast an observer is moving or how far away the light source is. This leads to some strange results, like time slowing down for objects moving at high speeds.

The general theory of relativity extends the special theory to include gravity. It says that massive objects cause a distortion in space-time, which is felt as gravity. This means that gravity is not a force between two objects, but rather a curvature of space-time caused by the presence of mass

In [15]:
import os
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model=model_name,
    device=0 if torch.cuda.is_available() else -1,
    tokenizer=tokenizer,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    num_beams=4,
    early_stopping=True,
    repetition_penalty=1.4,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [16]:
prompt = [
    {"role": "user", "content": "What's your favorite book?"},
    {"role": "assistant", "content": "I enjoy reading science fiction novels."},
    {"role": "user", "content": "Can you recommend one?"},
]
response = pipe(prompt)
print(response[0]["generated_text"])

[{'role': 'user', 'content': "What's your favorite book?"}, {'role': 'assistant', 'content': 'I enjoy reading science fiction novels.'}, {'role': 'user', 'content': 'Can you recommend one?'}, {'role': 'assistant', 'content': ' One of my favorite science fiction novels is "Dune" by Frank Herbert.'}]


In [22]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFacePipeline

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a great mentor."),
    ("user", "{input}"),
])

llm = HuggingFacePipeline(pipeline=pipe)
chain = prompt | llm | StrOutputParser()

response = chain.invoke({"input": "What is Python programming?"})
print(response)

System: You are a great mentor.
Human: What is Python programming?
AI: Python is a high-level, interpreted programming language that is widely used for web development, data analysis, artificial intelligence, and machine learning. It is known for its simplicity, readability, and ease of use, making it a popular choice for beginners and experienced programmers alike.


In [24]:
template = """Answer the question. Keep your answer under 10 words.

Question: {input}
Answer:"""
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | llm | StrOutputParser()
response = chain.invoke({"input": "Who is the president of the United States?"})
print(response)

Human: Answer the question. Keep your answer under 10 words.

Question: Who is the president of the United States?
Answer: Joe Biden
