In [2]:
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import TextStreamer
from unstructured.partition.pdf import partition_pdf

```mermaid
flowchart TD
    subgraph PDF_Processing
        A[Load PDF Document] --> B[Partition PDF]
        B --> C[Clean Text Chunks]
    end

    subgraph Vector_DB
        C --> D[Generate Embeddings]
        D --> E[Create FAISS Vector DB]
        E --> F[Save Index Locally]
    end

    subgraph Model_Setup
        G[Load Tokenizer] --> H[Load LLM with FP16]
        H --> I[Set Model Parameters]
    end

    subgraph RAG_Chain
        J[Define Prompt Template] --> K[Create Text Generation Pipeline]
        F --> L[Setup Vector DB Retriever]
        K --> M[Create HuggingFacePipeline]
        L & M & J --> N[Build RetrievalQA Chain]
    end

    subgraph Inference
        O[User Query] --> N
        N --> P[Context Retrieval]
        P --> Q[Generate Answer]
        Q --> R[Return Response]
    end

    PDF_Processing --> Vector_DB
    Vector_DB & Model_Setup --> RAG_Chain
    RAG_Chain --> Inference

# Processing PDF

In [3]:
file_path = 'dataset/PDF/resnet.pdf'

# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,
    strategy="hi_res",
    extract_image_block_types=["Image"],
    extract_image_block_to_payload=True,
    chunking_strategy="by_title",
    max_characters=10000,
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000,
    extract_images_in_pdf=True,
)

In [4]:
len(chunks)

15

In [5]:
cleaned_chunks = []
for chunk in chunks:
    text = chunk.text.strip()  # Loại bỏ khoảng trắng thừa
    if len(text) > 50:  # Giữ lại chỉ các chunk có nội dung đủ dài
        cleaned_chunks.append(text)

In [6]:
# 1. Khởi tạo embedding model (vd: BAAI/bge-small)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1",  # Hoặc "BAAI/bge-base-en" cho tiếng Anh
    model_kwargs={"device": "cpu"}  # Chạy trên GPU nếu có
)

# 2. Tạo vector database từ chunks
vector_db = FAISS.from_texts(cleaned_chunks, embedding_model)

# 3. Lưu database để tái sử dụng
vector_db.save_local("bert_faiss_index")  # Lưu vào thư mục local

  embedding_model = HuggingFaceEmbeddings(


# Load Model

In [7]:
# Step 2: Cấu hình thiết bị
device = torch.device("cpu")  # Force CPU

# Step 3: Load tokenizer và model for CPU use
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model without BitsAndBytes quantization (for CPU)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use regular precision for CPU
    do_sample=True,
    temperature=0.6,
    top_p=0.7,
    top_k=20,
    repetition_penalty=1.2,
    low_cpu_mem_usage=True  # Helps with CPU memory management
)

# Chuyển mô hình sang chế độ đánh giá (eval)
model.eval()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotary_emb): Qw

## Prompt engineering

In [12]:
prompt_template = """
<s>[INST] <<SYS>>
You are an AI expert researcher who has thoroughly studied the provided document. Your task is to answer the question based solely on the information in the document using a Chain of Thought (CoT) approach. Follow these guidelines:
1. Reason step-by-step to ensure accuracy and completeness, explaining each step clearly.
2. Provide a concise, accurate, and detailed answer within 800 tokens, focusing only on components or information explicitly described in the document.
3. For each key element (e.g., component, method, or concept), include a brief description (1-2 sentences) of its role and functionality, referencing the specific page or section (e.g., Page X, Section Y).
4. Verify that all technical terms and references are accurate and match the document exactly.
5. Include all primary elements relevant to the question (e.g., components, methods, or metrics) as specified in the document.
6. If the answer exceeds 800 tokens, indicate it is partial and suggest follow-up questions.
7. If the document does not provide enough information, state: "The document does not provide sufficient information."

**Chain of Thought Steps**:
1. **Understand the Question**: Identify what the question asks and its scope (e.g., components, functionality, or metrics).
2. **Identify Relevant Sections**: Determine which parts of the document contain relevant information for the question.
3. **List Key Elements**: Extract and list all primary elements (e.g., components, methods) explicitly mentioned in the document.
4. **Summarize Details**: For each element, summarize its role and functionality in 1-2 sentences, ensuring technical accuracy and page references. Then explain why use like that
5. **Verify Accuracy**: Double-check that elements, terms, and references are accurate and avoid assumptions or errors.
</SYS>>

Context: {context}

Question: {question}
</INST>
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template
)

## Set up

In [15]:
streamer = TextStreamer(tokenizer)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2024,
    truncation=True,
    device_map="cpu",
    streamer=streamer
)
retriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)
# Bọc pipeline vào LangChain
llm = HuggingFacePipeline(pipeline=pipe)

from langchain.prompts import PromptTemplate

# Prompt cho map (context->question)
question_prompt = PromptTemplate(
    input_variables=["context","question"],
    template="Context: {context}\nQuestion: {question}\nAnswer:"
)

# Prompt cho reduce, giờ dùng context thay vì summaries
combine_prompt = PromptTemplate(
    input_variables=["context","question"],
    template="Merged Context: {context}\nQuestion: {question}\nAnswer:"
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="map_reduce",
    return_source_documents=True,
    chain_type_kwargs={
        "question_prompt": question_prompt,
        "combine_prompt": combine_prompt,
        "combine_document_variable_name": "context"
    }
)

Device set to use cpu


## Inference

In [None]:
query="Who is the author of the paper?"
result = qa_chain({"query": query})
print(result["result"])

it
Answer:<｜begin▁of▁sentence｜>Context: 4.2. CIFAR-10 and Analysis

We conducted more studies on the CIFAR-10 dataset [20], which consists of 50k training images and 10k test- ing images in 10 classes. We present experiments trained on the training set and evaluated on the test set. Our focus is on the behaviors of extremely deep networks, but not on pushing the state-of-the-art results, so we intentionally use simple architectures as follows.

The plain/residual architectures follow the form in Fig. 3 (middle/right). The network inputs are 32 32 images, with × the per-pixel mean subtracted. The ﬁrst layer is 3 3 convo- × lutions. Then we use a stack of 6n layers with 3 3 convo- × 32,16,8 lutions on the feature maps of sizes respectively, { } with 2n layers for each feature map size. The numbers of ﬁlters are 16,32,64 respectively. The subsampling is per- { } formed by convolutions with a stride of 2. The network ends with a global average pooling, a 10-way fully-connected layer, and s