# Hugging Face

## Get started
### Set up

In [None]:
# Hub client library (provides HfApi and the inference clients imported below).
%pip install huggingface_hub
# After installing, authenticate by running `huggingface-cli login` with a
# Hugging Face write-access token.
%pip install transformers
# NOTE(review): presumably needed for the chat-template rendering later in
# this notebook — confirm.
%pip install jinja2

In [None]:
from huggingface_hub import HfApi, InferenceClient, AsyncInferenceClient
from transformers import AutoTokenizer
from pprint import pprint

### Manage a repo

In [None]:
api = HfApi()
# Create the private dataset repo. exist_ok=True keeps this cell re-runnable:
# without it, running the cell a second time fails because the repo already
# exists on the Hub.
api.create_repo(
    "biaotang/test-dataset",
    repo_type="dataset",
    private=True,
    exist_ok=True,
)

In [None]:
# Fetch the git refs of the dataset repo and pretty-print them.
refs = api.list_repo_refs(
    repo_id="biaotang/test-dataset",
    repo_type="dataset",
)
pprint(refs)

### Inference

In [None]:
# Single-turn chat history reused by the inference cells below.
messages = [
    {"role": "user", "content": "What is the capital of France?"},
]

In [None]:
# Chat with a hosted Llama model, capping the reply at 100 tokens.
client = InferenceClient(model="meta-llama/Llama-3.2-3B-Instruct")
output = client.chat_completion(
    messages=messages,
    max_tokens=100,
)
pprint(output)

In [None]:
# Same conversation against Gemma; the call is left as the cell's last
# expression so the notebook displays the returned completion.
gemma = InferenceClient(model="google/gemma-3-27b-it")
gemma.chat_completion(messages=messages)

### Async client

In [None]:
async_client = AsyncInferenceClient()
async for token in await async_client.text_generation("The Huggingface Hub is", stream=True):
  print(token, end="")

### Inference endpoints

In [None]:
from huggingface_hub import create_inference_endpoint
# Create an endpoint programmatically with create_inference_endpoint(...),
# or through the web UI: https://ui.endpoints.huggingface.co/new

### Messages to prompt

In [None]:
# Multi-turn conversation (system, user, assistant) used to demonstrate how a
# chat template renders messages into a single prompt string.
messages = [
    dict(role="system", content="You are an AI assistant with access to various tools."),
    dict(role="user", content="Hi !"),
    dict(role="assistant", content="Hi human, what can help you with ?"),
]

In [None]:
# Render the conversation through the model's chat template and show the
# resulting prompt.
tokenizer = AutoTokenizer.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
)
rendered_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

pprint(rendered_prompt)

## Agent
### Smolagents

In [None]:
# smolagents with its `transformers` and `litellm` extras.
%pip install 'smolagents[transformers,litellm]'
# NOTE(review): presumably required for the `vertex_ai/...` LiteLLM model used
# at the end of this notebook — confirm.
%pip install -U 'google-cloud-aiplatform>=1.38'

In [None]:
from huggingface_hub import InferenceClient

# Requires a local server started beforehand with:
#   llama-server -m model.gguf --port 8080
client = InferenceClient(
    model="http://localhost:8080/v1/chat/completions",
)

# Immutable default for `custom_model`; a tuple avoids sharing one mutable
# list object across every call that relies on the default.
_DEFAULT_STOP_SEQUENCES = ("Task",)


def custom_model(messages, stop_sequences=_DEFAULT_STOP_SEQUENCES):
    """Run a chat completion on the module-level ``client`` and return the reply.

    Args:
        messages: Chat messages forwarded unchanged to
            ``client.chat_completion``.
        stop_sequences: Stop strings forwarded as ``stop``. When omitted,
            ``["Task"]`` is used. Any explicitly passed value (including
            ``None``) is forwarded as-is.

    Returns:
        The ``message`` of the first choice in the completion response.
    """
    if stop_sequences is _DEFAULT_STOP_SEQUENCES:
        # Build a fresh list per call so the client receives the same value
        # the previous list default supplied, without any shared state.
        stop_sequences = list(_DEFAULT_STOP_SEQUENCES)
    response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
    return response.choices[0].message

In [None]:
from smolagents import TransformersModel

# Load a small instruct model locally and send it a one-message smoke test,
# passing "great" as a stop sequence.
model = TransformersModel(model_id="HuggingFaceTB/SmolLM-135M-Instruct")

print(
    model(
        [{"role": "user", "content": [{"type": "text", "text": "Ok!"}]}],
        stop_sequences=["great"],
    )
)

In [None]:
from smolagents import HfApiModel

# One user turn in the structured content format (a list of typed parts).
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "Hello, how are you?"}],
    },
]

# Default-configured HfApiModel; print its reply to the message above.
model = HfApiModel()
print(model(messages))

In [None]:
from dotenv import load_dotenv, find_dotenv
# Load the Gemini API key and Vertex credential settings from the .env file
# located by find_dotenv().
load_dotenv(dotenv_path=find_dotenv())

In [None]:
from smolagents import LiteLLMModel

messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "what is the capital of France?"}],
    },
]
# Gemini API access authenticated with an API key (see the .env file).
model = LiteLLMModel(
    model_id="gemini/gemini-2.0-flash",
    temperature=0.2,
    max_tokens=10,
)
print(model(messages))

In [None]:
from smolagents import LiteLLMModel

messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "what is the capital of France?"}],
    },
]
# Vertex AI access authenticated with Application Default Credentials (ADC).
model = LiteLLMModel(
    model_id="vertex_ai/gemini-2.0-flash",
    temperature=0.2,
    max_tokens=10,
)
print(model(messages))