In [1]:
!pip list | grep torch

In [None]:
# Notebook dependencies, installed through %pip so they land in the running kernel's environment.
# - gradio: in the visible cells it is only referenced by the commented-out chat UI.
# - llama-index-llms-huggingface: provides the HuggingFaceLLM class imported for the local model.
# - llama-index-llms-nvidia: provides the NVIDIA class imported for the NIM examples.
# NOTE(review): nothing here is version-pinned, and the huggingface-api, embeddings-nvidia and
# readers-file packages are not imported in the visible cells — confirm they are still needed.
%pip install gradio
%pip install llama-index-llms-huggingface
%pip install llama-index-llms-huggingface-api
%pip install --upgrade --quiet llama-index-llms-nvidia llama-index-embeddings-nvidia llama-index-readers-file

Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<0.24.0,>=0.23.0 (from llama-index-llms-huggingface)
  Downloading huggingface_hub-0.23.5-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-core<0.12.0,>=0.11.0 (from llama-index-llms-huggingface)
  Downloading llama_index_core-0.11.22-py3-none-any.whl.metadata (2.4 kB)
Collecting text-generation<0.8.0,>=0.7.0 (from llama-index-llms-huggingface)
  Downloading text_generation-0.7.0-py3-none-any.whl.metadata (8.5 kB)
Collecting torch<3.0.0,>=2.1.2 (from llama-index-llms-huggingface)
  Downloading torch-2.5.1-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting transformers<5.0.0,>=4.37.0 (from transformers[torch]<5.0.0,>=4.37.0->llama-index-llms-huggingface)
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from huggingface-hub<0.24.0,>=0.23.0->llama-index-llms-huggingface)
  Downloadin

In [None]:
!pip install "transformers[torch]" "huggingface_hub[inference]"

In [None]:
!pip install llama-index

In [None]:
import getpass
import os

# Make a Hugging Face access token available both as the HF_TOKEN environment
# variable and as the notebook variable `hf_token`.
#
# del os.environ['HF_TOKEN']  ## delete key and reset

# Start from whatever is already exported. Keeping `hf_token` in sync with the
# environment means code that reads the variable directly never sees an empty
# string when a valid token was set before the notebook started.
hf_token = os.environ.get("HF_TOKEN", "")

if hf_token.startswith("hf_"):
    print("Valid HF_TOKEN already in environment. Delete to reset")
else:
    # Nothing usable in the environment: prompt without echoing the input.
    hf_token = getpass.getpass("HF TOKEN (starts with hf_): ")
    assert hf_token.startswith(
        "hf_"
    ), f"{hf_token[:5]}... is not a valid key"
    os.environ["HF_TOKEN"] = hf_token

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the Llama 3.1 8B Instruct checkpoint, authenticating
# with the token stored in the HF_TOKEN environment variable.
llama_checkpoint = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    llama_checkpoint, token=os.environ.get("HF_TOKEN", "")
)

# Ids passed to the LLM wrapper as stop markers: the tokenizer's EOS id
# followed by the id the tokenizer assigns to "<|eot_id|>".
eos_id = tokenizer.eos_token_id
eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
stopping_ids = [eos_id, eot_id]

In [None]:
# generate_kwargs parameters are taken from https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct

import os

import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Optional quantization to 4bit (torch is already imported above)
# from transformers import BitsAndBytesConfig

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )

# Read the access token from the HF_TOKEN environment variable (the same source
# the tokenizer cell uses) rather than from a notebook variable, so this cell
# works whether the token was typed in during this session or exported beforehand.
hf_access_token = os.environ.get("HF_TOKEN", "")

# Model and tokenizer are both loaded from the same Llama 3.1 8B Instruct checkpoint.
local_llm = HuggingFaceLLM(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    model_kwargs={
        "token": hf_access_token,
        "torch_dtype": torch.bfloat16,  # comment this line and uncomment below to use 4bit
        # "quantization_config": quantization_config
    },
    generate_kwargs={
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    },
    tokenizer_name="meta-llama/Llama-3.1-8B-Instruct",
    tokenizer_kwargs={"token": hf_access_token},
    # `stopping_ids` is built in the tokenizer cell and must exist before this runs.
    stopping_ids=stopping_ids,
)

In [None]:
# One-shot (non-streaming) completion from the locally loaded model.
demo_prompt = "What is Dell Technologies?"
response = local_llm.complete(demo_prompt)
print(response)

In [None]:
# Stream the same prompt from the local model: echo each delta as it arrives
# and accumulate the full text in `content`.
content = ""
local_stream = local_llm.stream_complete("What is Dell Technologies?")
for completion in local_stream:
    content += completion.delta
    print(completion.delta, end="")

In [None]:
# import gradio as gr

# def stream_response_local(message, history):
#     response = local_llm.stream_complete(message)
#     res = ""
#     for token in response:
#         # print(token, end="")
#         res = str(res) + str(token.delta)
#         yield res

# with gr.Blocks() as demo1:
#     gr.Markdown(
#     """
#     <h1 style="text-align: center;">Local HuggingFaceLLM Chatbot 💻📑✨</h1>
#     """)
#     with gr.Row(equal_height=False):
#         with gr.Column():
#             test = gr.ChatInterface(fn=stream_response_local)
            

# # demo.launch(server_name="0.0.0.0", ssl_verify=False)
# demo1.launch()



In [None]:
import getpass
import os

# Ensure NVIDIA_API_KEY is present in the environment, prompting for it only
# when no value with the expected "nvapi-" prefix is already set.
#
# del os.environ['NVIDIA_API_KEY']  ## delete key and reset
exported_nvapi_key = os.environ.get("NVIDIA_API_KEY", "")

if not exported_nvapi_key.startswith("nvapi-"):
    # Prompt without echoing the input, then validate the prefix before exporting.
    nvapi_key = getpass.getpass("NVAPI Key (starts with nvapi-): ")
    assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key"
    os.environ["NVIDIA_API_KEY"] = nvapi_key
else:
    print("Valid NVIDIA_API_KEY already in environment. Delete to reset")

In [None]:
# Apply nest_asyncio so async-first libraries can run their async code inside the notebook
# (the original note cites llama-parse as the library that requires this).
# NOTE(review): llama-parse is not imported in any of the visible cells — confirm this is still needed.
import nest_asyncio

nest_asyncio.apply()

In [None]:
from llama_index.llms.nvidia import NVIDIA

# No base_url is given here, so the connector's default endpoint is used rather
# than a NIM on localhost (presumably NVIDIA's hosted API, as the variable name
# suggests — TODO confirm). Only the model is specified explicitly.
hosted_nim_llm = NVIDIA(model="meta/llama-3.1-8b-instruct")

# Stream the answer: echo each delta as it arrives and keep the full text in `content`.
content = ""
hosted_stream = hosted_nim_llm.stream_complete("What is Dell Technologies?")
for completion in hosted_stream:
    content += completion.delta
    print(completion.delta, end="")

In [None]:
from llama_index.llms.nvidia import NVIDIA

# Connect to a chat NIM served at http://localhost:8000/v1, specifying the model explicitly.
local_nim_llm = NVIDIA(
    model="meta/llama-3.1-8b-instruct",
    base_url="http://localhost:8000/v1",
)

# Stream the answer: echo each delta as it arrives and keep the full text in `content`.
content = ""
local_nim_stream = local_nim_llm.stream_complete("What is Dell Technologies?")
for completion in local_nim_stream:
    content += completion.delta
    print(completion.delta, end="")