In [29]:
# Import necessary modules
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter
from PyPDF2 import PdfReader
from langchain.docstore.document import Document
import os
import threading
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [30]:
# Notebook configuration, read by the cells below.
# NOTE(review): "/" is the filesystem root — confirm this is the intended PDF folder.
data_dir = "/"                   # folder whose *.pdf files are ingested
chroma_db = "vectors/"           # folder passed to Chroma as persist_directory
model_name = 'Qwen/Qwen3-0.6B'   # checkpoint id given to the tokenizer/model loaders

In [31]:


# Build (or refresh) the Chroma vector store from the PDFs found in data_dir.
# Produces the notebook globals: documents, text_splitter, texts, embeddings, db.

# Make sure the input folder and the vector-store folder exist.
os.makedirs(data_dir, exist_ok=True)
os.makedirs(chroma_db, exist_ok=True)

# Read every PDF directly under data_dir into one Document per file.
# Sorting makes the load order reproducible; lower() also matches "*.PDF".
documents = []
for filename in sorted(os.listdir(data_dir)):
    if not filename.lower().endswith(".pdf"):
        continue
    filepath = os.path.join(data_dir, filename)
    try:
        reader = PdfReader(filepath)
        page_texts = [page.extract_text() for page in reader.pages]
        # Join pages with a newline so the last word of one page is not glued
        # to the first word of the next. Pages without extractable text are skipped.
        full_text = "\n".join(page_text for page_text in page_texts if page_text)
        documents.append(Document(page_content=full_text, metadata={"source": filename}))
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"Error reading {filename}: {e}")
print('.....document_loaded.....')

# Initialize a text splitter to divide documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", " ", ""]
)
print('.....document_splitter.....')

texts = text_splitter.split_documents(documents)
print('.....document_splitted.....')

# Only the embedding model is created here; the chunks are embedded when they
# are written to the store below.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L12-v2',
                                  model_kwargs={'device': 'cpu'})
print('.....document_embedded.....')

# Give every chunk a stable id ("<source file>:<position within that file>").
# Without explicit ids each run of this cell appends the same chunks again to
# the persisted store, so similarity_search starts returning duplicates.
chunk_ids = []
chunks_seen_per_source = {}
for chunk in texts:
    source = chunk.metadata["source"]
    position = chunks_seen_per_source.get(source, 0)
    chunks_seen_per_source[source] = position + 1
    chunk_ids.append(f"{source}:{position}")

if texts:
    db = Chroma.from_documents(texts, embeddings, ids=chunk_ids, persist_directory=chroma_db)
else:
    # Nothing to add: open whatever is already persisted instead of passing an
    # empty batch to the vector store.
    print(f"No PDF text found in {data_dir!r}; opening existing store at {chroma_db!r} without adding documents.")
    db = Chroma(persist_directory=chroma_db, embedding_function=embeddings)
print('.....document_loaded_at_db.....')

.....document_loaded.....
.....document_splitter.....
.....document_splitted.....
.....document_embedded.....
.....document_loaded_at_db.....


In [32]:
# Load the tokenizer and the causal LM for `model_name`; the model is created
# with float32 weights and then moved to the CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
)
model = model.to("cpu")

In [33]:
# Read the user's question from standard input.
question_prompt = "Input question to LLM:"
question = input(question_prompt)

In [34]:
# Fetch the 3 stored chunks most similar to the question and merge their text
# into a single space-separated context string.
results = db.similarity_search(question, k=3)
passages = [hit.page_content for hit in results]
context = " ".join(passages)


In [35]:
# System message: fixes the assistant's role for every request.
SYSTEM_PROMPT = """
You are an AI assistant in Astree and static testing. You are able to find answers to the questions from the contextual passage snippets provided and from your knowledge.
"""

# User message layout; the retrieved context and the question are substituted
# into the two placeholders when this cell runs.
USER_PROMPT_TEMPLATE = """
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.

<context>
{context}
</context>

<question>
{question}
</question>
"""

# Re-run this cell after changing `question` or `context` to refresh the prompt.
USER_PROMPT = USER_PROMPT_TEMPLATE.format(context=context, question=question)


In [36]:
# Build the model input with the tokenizer's own chat template.
# The system and user prompts are passed as separate messages: hand-writing the
# <|im_start|>/<|im_end|> markers and then templating that string again would
# nest the whole conversation (system prompt included) inside a single user turn.
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
)
# Kept as an alias so any cell that still reads `prompt` gets the templated string.
prompt = text

In [37]:
# Tokenize the templated prompt and sample a completion on the CPU.
inputs = tokenizer(text, return_tensors="pt").to("cpu")
generated_ids = model.generate(
    **inputs,
    max_new_tokens=1200,
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
    eos_token_id=tokenizer.eos_token_id
)

# Keep only the newly generated tokens (generate() returns prompt + completion).
output_ids = generated_ids[0][len(inputs.input_ids[0]):].tolist()

# Split the completion at the LAST </think> token: everything up to and
# including it is the reasoning trace, the remainder is the answer.
# The id is looked up from the tokenizer rather than hard-coded.
think_end_id = tokenizer.convert_tokens_to_ids("</think>")
try:
    # Reverse search gives the position of the last occurrence.
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
except ValueError:
    # No </think> in the output: treat the whole completion as the answer.
    index = 0

# strip("\n") removes the blank lines surrounding each part; strip("") would
# remove nothing because its character set is empty.
thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

thinking content: <think>
Okay, let's see. The user asked about "overflow deference" and I need to use the provided context to answer. 

Looking at the context, there's a part where it mentions that in earlier versions of Astrée, overflow deference was handled by case analysis, which involved unfolding the array or partitioning the loop. More recently, there's a symbolic memory predicate domain that automatically resolves this without those methods. 

The question is about overflow deference, so I need to connect the context to this. The key point here is that earlier versions required specific handling, but newer versions use a symbolic domain to avoid it. The answer should mention that the symbolic domain solves the issue without manual handling, which is the overflow deference.
</think>
content: 

The overflow deference in Astrée is resolved by the symbolic memory predicate domain in more recent versions, which automatically handles the issue without requiring manual case analysis o