In [1]:
# --- Imports ---
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline
import torch
import json
import os


  from .autonotebook import tqdm as notebook_tqdm


### Debug

In [None]:
# Debug cell: load the stove manual with PyPDFLoader and keep the result in `documents`.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
pdf_path = "C:/wajahat/personal/learning/AI_agents_service_providers/stove.pdf"

loader = PyPDFLoader(pdf_path)
# Presumably one entry per PDF page (the recorded output reports 7 pages) — TODO confirm.
documents = loader.load()

# Sanity checks used while debugging (currently disabled): number of loaded
# entries and the first 500 characters of the first one.
# print(f"Loaded {len(documents)} pages")
# print(documents[0].page_content[:500])

Loaded 7 pages
THE SCIENCE 
BEHIND HEAT
Large Electric Stove with 
Flame Effect 
Instruction Manual


In [9]:
# Spot-check extraction: show the first 50 characters of the entry at index 5 of `documents`.
print(documents[5].page_content[:50])




In [None]:
import fitz  # PyMuPDF
from tqdm.auto import tqdm
from typing import List, Dict

def extract_text_from_pdf(text: str) -> str:
    """Flatten the text of one page onto a single line.

    Args:
        text: Raw text of a page.

    Returns:
        ``text`` with every newline character replaced by a space and with
        leading/trailing whitespace removed.
    """
    # The newline escape is "\n". The previous code replaced the two-character
    # string "/n", which does not occur in extracted text, so line breaks were
    # left untouched.
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text

def open_read_pdf(pdf_path: str) -> List[Dict]:
    """Read a PDF with PyMuPDF and collect each page's text plus simple size statistics.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per page, in page order, with the keys ``page_num`` (0-based
        index), ``page_char_count``, ``page_word_count`` (text split on single
        spaces), ``page_sentences_count`` (text split on ". "),
        ``page_tokens_count`` (character count divided by 4) and ``text``
        (the page text after ``extract_text_from_pdf``).
    """
    doc = fitz.open(pdf_path)
    pages_and_text = []
    try:
        # Wrap the document itself, not enumerate(doc): enumerate has no length,
        # so tqdm could not display a total or an ETA.
        for page_num, page in enumerate(tqdm(doc, total=len(doc))):
            text = extract_text_from_pdf(text=page.get_text())
            pages_and_text.append({
                "page_num": page_num,
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                "page_sentences_count": len(text.split(". ")),
                "page_tokens_count": len(text) / 4,
                "text": text,
            })
    finally:
        # Release the file handle even if extraction fails part-way; only plain
        # strings and numbers are kept, so nothing refers to the document afterwards.
        doc.close()

    return pages_and_text

# Run the extraction on the PDF configured in `pdf_path`, then display the
# first five page records as the cell output.
pages_and_text = open_read_pdf(pdf_path=pdf_path)
pages_and_text[:5]

### Load Docs 

In [1]:
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import docx2txt

# NOTE(review): absolute, user-specific path — this cell only runs where that file exists.
docs_path = "C:/Users/LT/Downloads/BEL0197_FAQs (1).docx"

# Alternative loader, currently disabled.
# loader = UnstructuredWordDocumentLoader(docs_path)
# docs_load = loader.load()

# Extract the .docx content with docx2txt. `text` is the value the
# "Vector Database" cell passes to the splitter.
text = docx2txt.process(docs_path)

### Load PDF

In [None]:
# Load the stove manual PDF into `documents`.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
# NOTE(review): the "Vector Database" cell currently splits `text` (from the .docx)
# and its PDF branch is commented out, so `documents` looks unused there — confirm.
pdf_path = "C:/wajahat/personal/learning/AI_agents_service_providers/stove.pdf"

loader = PyPDFLoader(pdf_path)
documents = loader.load()

### Vector Database

In [None]:

# Build (or refresh) the persisted Chroma collection from the text in `text`.

# Folder to persist embeddings.
chroma_dir = "C:/wajahat/personal/learning/AI_agents_service_providers/chroma_db/stitching"
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# === LOAD & SPLIT DOCUMENT ===

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# docs = splitter.split_documents(documents)  # to parse from the PDF (takes the loaded documents, not `text`)
docs = splitter.split_text(text)              # to parse from the docs file

# Fail early with a clear message instead of handing an empty batch to the store.
if not docs:
    raise ValueError("No text chunks were produced; check that the source document is not empty.")

# === EMBEDDINGS ===
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# === VECTOR STORE (Chroma) ===
# Position-based ids make re-running this cell overwrite the stored chunks.
# With the default random ids, every run appended another full copy of the
# document to the persisted collection, so retrieval returned duplicates.
chunk_ids = [f"chunk-{i}" for i in range(len(docs))]

# vectorstore = Chroma.from_documents(docs, embedding_model, persist_directory=chroma_dir)
vectorstore = Chroma.from_texts(
    docs,
    embedding_model,
    ids=chunk_ids,
    persist_directory=chroma_dir,
)
vectorstore.persist()

print(f"✅ Stored {len(docs)} chunks into Chroma DB at: {chroma_dir}")


### Debugging

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig
import torch
import ollama

# Step 1 - quantization settings: load in 4-bit, compute in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Use FlashAttention-2 when transformers reports it as available, otherwise "sdpa".
if is_flash_attn_2_available():
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"

# Step 2 - checkpoint to load.
# Alternatives that were tried and are disabled here:
#   model_id = "openai-community/gpt2"
#   model_path = "ollama/llama2" through client = ollama.Client()
model_id = "Qwen/Qwen2.5-1.5B-Instruct"

# Step 3 - tokenizer for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Step 4 - the causal LM itself, using the settings prepared above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    attn_implementation=attn_implementation,
    low_cpu_mem_usage=False,
)

### Load LLM

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Load the persisted vector store and the LLM, then wire them into a RetrievalQA chain.
chroma_dir = "C:/wajahat/personal/learning/AI_agents_service_providers/chroma_db/stitching"

# === CONFIG ===
model_path = "ibm-granite/granite-4.0-micro"  # or "mistralai/Mistral-7B-v0.1"
# model_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # or "mistralai/Mistral-7B-v0.1"

# Defined here so this cell does not depend on a name leaked from another cell
# in the same kernel session. Same value as the cell that builds the store.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# === LOAD EMBEDDINGS & VECTORSTORE ===
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
vectorstore = Chroma(persist_directory=chroma_dir, embedding_function=embedding_model)

# === LOAD MODEL ===
# device = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)

generator = pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    tokenizer=tokenizer,
    # device=device,
    max_new_tokens=512,
    # temperature is only used by sampling-based decoding. Request sampling
    # explicitly so the setting takes effect whatever the checkpoint's default
    # generation config says.
    do_sample=True,
    temperature=0.1,
)
llm = HuggingFacePipeline(pipeline=generator)

# === CREATE RETRIEVAL CHAIN ===
# Retrieves the 3 nearest chunks per question; only the answer text is returned.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=False,
)

print("✅ RAG pipeline ready!")


  vectorstore = Chroma(persist_directory=chroma_dir, embedding_function=embedding_model)
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.58s/it]
Device set to use cuda:0


✅ RAG pipeline ready!


  llm = HuggingFacePipeline(pipeline=generator)


### Inference Pipeline

In [5]:
from sentence_transformers import SentenceTransformer, util

# === CONFIG ===

# Filesystem-safe tag for the LLM. It has its own name so that `model` (the
# loaded LLM object) is not overwritten with a string.
model_tag = model_path.replace("/", "_")
quries = "stiching"
queries_path = f"{quries}.json"       # JSON file containing { "questions": [...], "answers": [...] }
output_folder = "results/llm_rag_responses"
output = f"{model_tag}_{quries}_responses.txt"
output_file = os.path.join(output_folder, output)

# Create the results folder up front: open(output_file, "w") below raises
# FileNotFoundError when the folder does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Example format for queries.json:
# {
#   "questions": [
#       "What is the main objective of the project?",
#       "How long is the duration?"
#   ],
#   "answers": [
#       "The main objective is to monitor and analyze energy usage patterns.",
#       "The duration is one year."
#   ]
# }

# === LOAD QUESTIONS & ANSWERS ===
with open(queries_path, "r", encoding="utf-8") as f:
    data = json.load(f)

questions = data["questions"]
ground_truths = data["answers"]

# zip() below would silently drop the unmatched tail, so reject a mismatched file.
if len(questions) != len(ground_truths):
    raise ValueError(
        f"{queries_path}: {len(questions)} questions but {len(ground_truths)} answers"
    )

# === EMBEDDING MODEL FOR SIMILARITY ===
sim_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# === RUN INFERENCE LOOP ===
results = []
for i, (q, ref) in enumerate(zip(questions, ground_truths)):
    print(f"\n[{i+1}/{len(questions)}] Question: {q}")
    # Chain.run is deprecated (the recorded output of this cell shows the
    # warning). invoke() takes the question under "query" and returns a dict
    # whose "result" entry is the answer text.
    model_answer = qa_chain.invoke({"query": q})["result"]

    # Cosine similarity between the reference answer and the model answer.
    emb_ref = sim_model.encode(ref, convert_to_tensor=True)
    emb_ans = sim_model.encode(model_answer, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(emb_ref, emb_ans).item()

    # Save to list
    results.append({
        "question": q,
        "reference_answer": ref,
        "model_answer": model_answer,
        "similarity": similarity
    })

# === SAVE TO TXT ===
with open(output_file, "w", encoding="utf-8") as f:
    for r in results:
        f.write(f"Question: {r['question']}\n")
        f.write(f"Reference: {r['reference_answer']}\n")
        f.write(f"Model: {r['model_answer']}\n")
        f.write(f"Similarity: {r['similarity']:.4f}\n")
        f.write("="*80 + "\n")

print(f"\n✅ Done! Responses and similarities saved in {output_file}")



[1/10] Question: How do I thread the Beldray BEL0197 sewing machine?


  model_answer = qa_chain.run(q)



[2/10] Question: What should I do if the stitches are skipping?

[3/10] Question: How can I adjust the tension on my sewing machine?

[4/10] Question: Why is the sewing machine making a loud noise?

[5/10] Question:  How do I change the presser foot on the BEL0197?

[6/10] Question: What steps should I follow to clean the sewing machine?

[7/10] Question: How can I fix the bobbin winding issue?

[8/10] Question: Why does the fabric bunch up under the needle?

[9/10] Question: What causes the needle to break frequently?

[10/10] Question: How do I select different stitch patterns on the BEL0197?

✅ Done! Responses and similarities saved in results/llm_rag_responses\ibm-granite_granite-4.0-micro_stiching_responses.txt


### Complete Pipeline

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Settings marked "#change" are the ones to edit between runs.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

quries = "stove"  #change
# NOTE(review): Chroma is opened on this folder as-is; confirm a store for
# this topic has actually been built there before trusting the answers.
chroma_dir = f"C:/wajahat/personal/learning/AI_agents_service_providers/chroma_db/{quries}" #change
temperature = 0.1 #change

# === CONFIG ===
# The meta-llama repository is gated on the Hugging Face Hub: the account in
# use must have been granted access on the model page, otherwise
# from_pretrained fails with the "gated repo" OSError (403) shown in this
# cell's recorded output.
model_path = "meta-llama/Llama-3.2-1B-Instruct"# "mistralai/Mistral-7B-v0.1" # "microsoft/phi-2"  # "ibm-granite/granite-4.0-micro" # change

# === LOAD EMBEDDINGS & VECTORSTORE ===
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
vectorstore = Chroma(persist_directory=chroma_dir, embedding_function=embedding_model)


tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)

generator = pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    tokenizer=tokenizer,
    # device=device,
    max_new_tokens=512,
    # temperature is only used by sampling-based decoding. Sample whenever a
    # positive temperature is configured so the value (which is also encoded
    # in the results filename) really influences generation; 0 means greedy.
    do_sample=temperature > 0,
    temperature=temperature,
)
llm = HuggingFacePipeline(pipeline=generator)

# === CREATE RETRIEVAL CHAIN ===
# Retrieves the 3 nearest chunks per question; only the answer text is returned.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=False,
)

print("✅ RAG pipeline ready!")

from sentence_transformers import SentenceTransformer, util

# === CONFIG ===

# Filesystem-safe tag for the LLM. It has its own name so that `model` (the
# loaded LLM object) is not overwritten with a string.
model_tag = model_path.replace("/", "_")
queries_path = f"{quries}.json"       # JSON file containing { "questions": [...], "answers": [...] }
output_folder = "results/llm_rag_responses"
output = f"{model_tag}_{quries}_responses_t{temperature}.txt"
output_file = os.path.join(output_folder, output)

# Create the results folder up front: open(output_file, "w") below raises
# FileNotFoundError when the folder does not exist yet.
os.makedirs(output_folder, exist_ok=True)

with open(queries_path, "r", encoding="utf-8") as f:
    data = json.load(f)

questions = data["questions"]
ground_truths = data["answers"]

# zip() below would silently drop the unmatched tail, so reject a mismatched file.
if len(questions) != len(ground_truths):
    raise ValueError(
        f"{queries_path}: {len(questions)} questions but {len(ground_truths)} answers"
    )

# === EMBEDDING MODEL FOR SIMILARITY ===
sim_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# === RUN INFERENCE LOOP ===
results = []
for i, (q, ref) in enumerate(zip(questions, ground_truths)):
    print(f"\n[{i+1}/{len(questions)}] Question: {q}")
    # Chain.run is deprecated. invoke() takes the question under "query" and
    # returns a dict whose "result" entry is the answer text.
    model_answer = qa_chain.invoke({"query": q})["result"]

    # Cosine similarity between the reference answer and the model answer.
    emb_ref = sim_model.encode(ref, convert_to_tensor=True)
    emb_ans = sim_model.encode(model_answer, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(emb_ref, emb_ans).item()

    # Save to list
    results.append({
        "question": q,
        "reference_answer": ref,
        "model_answer": model_answer,
        "similarity": similarity
    })

# === SAVE TO TXT ===
with open(output_file, "w", encoding="utf-8") as f:
    for r in results:
        f.write(f"Question: {r['question']}\n")
        f.write(f"Reference: {r['reference_answer']}\n")
        f.write(f"Model: {r['model_answer']}\n")
        f.write(f"Similarity: {r['similarity']:.4f}\n")
        f.write("="*80 + "\n")

print(f"\n✅ Done! Responses and similarities saved in {output_file}")

  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
  vectorstore = Chroma(persist_directory=chroma_dir, embedding_function=embedding_model)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct.
403 Client Error. (Request ID: Root=1-6937ccbd-490782fa627215b93ef2114e;aac05187-9cb9-447d-8b89-ed765f54a14a)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-1B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct to ask for access.