In [1]:
from model_connection import *

In [3]:
model = get_model("qwen2.5-coder-14b-instruct")

In [7]:
from datasets import load_dataset
from pathlib import Path

dataset = load_dataset(path="dvilasuero/finepersonas-v0.1-tiny", split="train")

Path("data").mkdir(parents=True, exist_ok=True)
for i, persona in enumerate(dataset):
    with open(Path("data") / f"persona_{i}.txt", "w") as f:
        f.write(persona["persona"])

README.md:   0%|          | 0.00/618 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/35.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [8]:
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_dir="data")
documents = reader.load_data()
len(documents)

5000

In [9]:
from llama_index.core import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

# create the pipeline with transformations
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_overlap=0),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ]
)

nodes = await pipeline.arun(documents=documents[:10])
nodes

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[TextNode(id_='0b451584-da6c-4cb3-87de-84855e019e6e', embedding=[-0.03748531639575958, 0.00568846520036459, 0.02225431241095066, -0.0030931083019822836, 0.0032950674649327993, -0.016623185947537422, 0.005074694287031889, -0.01685536839067936, -0.06162302568554878, -0.05901915207505226, -0.013532478362321854, -0.02623092010617256, -0.03660596162080765, 0.03902430459856987, 0.00803881511092186, -0.015227528288960457, -0.018760960549116135, 0.08722469955682755, 0.02620762772858143, -0.0029367434326559305, -0.0015396439703181386, -0.07997753471136093, 0.0482991486787796, -0.05014357715845108, -0.017223769798874855, 0.02913813106715679, 0.03335736691951752, -0.008212074637413025, 0.019969943910837173, -0.12402865290641785, -0.05826646462082863, 0.014211226254701614, -0.022131800651550293, 0.0015001685824245214, 0.07136521488428116, 0.01773827336728573, -0.00013314653187990189, 0.05202193930745125, 0.004050330258905888, 0.018365373834967613, 0.003953855484724045, 0.014227177016437054, -0.036

In [10]:
!pip install llama-index-vector-stores-chroma -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

db = chromadb.PersistentClient(path="./alfred_chroma_db")
chroma_collection = db.get_or_create_collection("alfred")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

pipeline = IngestionPipeline(
    transformations=[
        # SentenceSplitter(chunk_size=25, chunk_overlap=0),
        SentenceSplitter(),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ],
    vector_store=vector_store,
)

In [14]:
nodes = await pipeline.arun(documents=documents[:10])
len(nodes)

10

In [15]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, embed_model=embed_model
)

Types of Response mode

refine: create and refine an answer by sequentially going through each retrieved text chunk. This makes a separate LLM call per Node/retrieved chunk.

compact (default): similar to refining but concatenating the chunks beforehand, resulting in fewer LLM calls.

tree_summarize: create a detailed answer by going through each retrieved text chunk and creating a tree structure of the answer.

In [21]:
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
import nest_asyncio

nest_asyncio.apply()  # This is needed to run the query engine
llm = HuggingFaceInferenceAPI(
    # model_name="Qwen/Qwen2.5-Coder-32B-Instruct"
    model="http://127.0.0.1:1234/v1",
    )
query_engine = index.as_query_engine(
    llm=llm,
    response_mode="tree_summarize",
)
response = query_engine.query(
    "Respond using a persona that describes author and travel experiences?"
)
response

Response(response='I am an anthropologist with a deep interest in Cypriot culture, history, and society. My research has been extensive, involving significant time spent living in Cyprus to fully understand its people, customs, and way of life. This immersive approach allows me to provide insights into the rich tapestry of Cypriot traditions and societal dynamics.', source_nodes=[NodeWithScore(node=TextNode(id_='5bf41f81-bcc3-4ba8-ae25-9adf78b2566e', embedding=None, metadata={'file_path': '/Users/sunilthapa/Desktop/projects/Huggingface/data/persona_1.txt', 'file_name': 'persona_1.txt', 'file_type': 'text/plain', 'file_size': 266, 'creation_date': '2025-07-23', 'last_modified_date': '2025-07-23'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.S

In [22]:
from llama_index.core.evaluation import FaithfulnessEvaluator

# query index
evaluator = FaithfulnessEvaluator(llm=llm)
eval_result = evaluator.evaluate_response(response=response)
eval_result.passing

True

In [None]:
import llama_index
import os

PHOENIX_API_KEY = "<PHOENIX_API_KEY>"
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"api_key={PHOENIX_API_KEY}"
llama_index.core.set_global_handler(
    "arize_phoenix", endpoint="https://llamatrace.com/v1/traces"
)

In [None]:
response = query_engine.query(
    "What is the name of the someone that is interested in AI and techhnology?"
)
response