In [8]:
!pip install "langchain>=0.2.16" "langchain-aws>=0.2.2" boto3 requests beautifulsoup4 psycopg2-binary tqdm sentence-transformers




In [1]:
from dotenv import load_dotenv
# bedrock_pgvector_demo.py
import os, re, json, hashlib, math
import requests
from bs4 import BeautifulSoup
from datetime import datetime

load_dotenv()
from typing import List, Tuple
from tqdm import tqdm

import boto3
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_aws import BedrockEmbeddings

import psycopg2
from psycopg2.extras import execute_values

# Source article to ingest (percent-encoded URL on Vietnamese Wikipedia).
WIKI_URL = "https://vi.wikipedia.org/wiki/Chi%E1%BA%BFn_tranh_Vi%E1%BB%87t_Nam"
# PostgreSQL connection settings and AWS region, read from the environment.
# os.getenv returns None for any variable that is not set.
PG_HOST = os.getenv("PG_HOST")
PG_PORT = os.getenv("PG_PORT")
PG_DATABASE = os.getenv("PG_DATABASE")
PG_USER = os.getenv("PG_USER")
PG_PASSWORD = os.getenv("PG_PASSWORD")
REGION = os.getenv("AWS_REGION")

# Connection URI assembled from the settings above.
# NOTE(review): values are interpolated without URL-escaping, so a password
# containing characters such as '@', '/' or ':' will likely produce a URI that
# does not parse as intended, and unset variables appear as the text "None".
PG_DSN   = f"postgresql://{PG_USER}:{PG_PASSWORD}@{PG_HOST}:{PG_PORT}/{PG_DATABASE}"

# Bedrock embedding model and the options sent with each embedding request.
MODEL_ID = "amazon.titan-embed-text-v2:0"
DIMS     = 1024        # 256 / 512 / 1024 (supported by v2)
NORMALIZE= True


In [2]:
# ---------- 1) LOAD WIKI ----------
def fetch_wiki(url: str) -> Tuple[str, str, str]:
    """Download a wiki page and return ``(title, lang, plain_text)``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A tuple of:
        - the page title (the ``h1#firstHeading`` text, else ``<title>``,
          else ``url`` itself);
        - the ``lang`` attribute of the ``<html>`` element, or ``"vi"`` when
          the element or the attribute is missing or empty;
        - the text of ``div#mw-content-text`` (or of the whole page when that
          div is absent) with ``[n]`` citation markers removed and whitespace
          collapsed.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Bedrock-Pgvector-Demo)"}
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Title: prefer the article heading, then <title>, then the URL.
    heading = soup.find("h1", id="firstHeading")
    if heading:
        title = heading.get_text(strip=True)
    elif soup.title:
        title = soup.title.get_text(strip=True)
    else:
        title = url

    # Language: look the element up once and fall back to "vi" both when the
    # element is missing and when it has no (or an empty) lang attribute, so
    # the function never returns None for the language.
    html_tag = soup.find("html")
    html_lang = (html_tag.get("lang") if html_tag else None) or "vi"

    # Main article body; fall back to the whole page if the div is absent.
    content_div = soup.find("div", id="mw-content-text")
    if content_div:
        text = content_div.get_text(separator="\n", strip=True)
    else:
        text = soup.get_text("\n", strip=True)

    # Drop numeric citation markers such as [1], [2], ...
    text = re.sub(r"\[\d+\]", "", text)
    # Collapse runs of spaces/tabs and runs of blank lines.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{2,}", "\n\n", text).strip()
    return title, html_lang, text

In [3]:
def split_text(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    min_chars: int = 50,
) -> List[str]:
    """Split ``text`` into overlapping chunks and drop the very short ones.

    Args:
        text: The document text to split.
        chunk_size: Maximum chunk length passed to the splitter, measured
            with ``len`` (i.e. in characters).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter, in characters.
        min_chars: Chunks shorter than this many characters are discarded.

    Returns:
        The chunk strings, in the order the splitter produced them.
    """
    splitter = RecursiveCharacterTextSplitter(
        # Separators tried in order: paragraph, line, sentence, word, character.
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=False,
    )
    documents = splitter.create_documents([text])
    # Filter out fragments too short to be meaningful.
    return [
        doc.page_content
        for doc in documents
        if len(doc.page_content) >= min_chars
    ]

# ---------- 3) EMBEDDINGS ----------
def bedrock_embedder():
    """Build a ``BedrockEmbeddings`` instance from the notebook settings.

    A ``bedrock-runtime`` boto3 client is created for ``REGION`` and handed to
    ``BedrockEmbeddings`` together with ``MODEL_ID`` and the request options
    ``dimensions`` (``DIMS``) and ``normalize`` (``NORMALIZE``).

    NOTE(review): the recorded run of this notebook failed with a Bedrock
    "model identifier is invalid" error; possibly the model is not available
    in the configured region -- confirm.
    """
    runtime_client = boto3.client("bedrock-runtime", region_name=REGION)
    request_options = {
        "dimensions": DIMS,      # output vector size
        "normalize": NORMALIZE,  # normalised vectors suit cosine / inner product
    }
    return BedrockEmbeddings(
        client=runtime_client,
        model_id=MODEL_ID,
        model_kwargs=request_options,
    )

def embed_texts(embedder, texts: List[str]) -> List[List[float]]:
    """Return one embedding vector per entry of ``texts``.

    The work is delegated to ``embedder.embed_documents``.
    """
    vectors = embedder.embed_documents(texts)
    return vectors

def embed_query(embedder, text: str) -> List[float]:
    """Return the embedding vector of a single query string.

    The work is delegated to ``embedder.embed_query``.
    """
    query_vector = embedder.embed_query(text)
    return query_vector

In [4]:
def db_connect():
    """Open a new PostgreSQL connection from the notebook's ``PG_*`` settings.

    The settings are passed to ``psycopg2.connect`` as keyword parameters
    instead of through a hand-built URI, so credentials containing characters
    such as ``@``, ``/`` or ``:`` do not need manual escaping. Settings that
    are ``None`` (environment variable not set) are left to the driver's
    defaults rather than being sent as the text ``"None"``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    return psycopg2.connect(
        host=PG_HOST,
        port=PG_PORT,
        dbname=PG_DATABASE,
        user=PG_USER,
        password=PG_PASSWORD,
    )

def upsert_document(conn, url: str, title: str, lang: str, checksum: bytes) -> int:
    """Insert or refresh the ``rag.document`` row for ``url`` and return its id.

    A row keyed by ``source_url`` is inserted; on conflict its title, language
    and checksum are overwritten. The statement runs inside ``with conn``, so
    the connection's transaction is finished when the block exits.

    Args:
        conn: Open database connection.
        url: Source URL identifying the document.
        title: Document title.
        lang: Language code of the document.
        checksum: Raw digest bytes, wrapped with ``psycopg2.Binary``.

    Returns:
        The ``id`` value returned by the statement.
    """
    statement = """
            INSERT INTO rag.document (source_url, title, language, checksum)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (source_url) DO UPDATE
              SET title = EXCLUDED.title, language = EXCLUDED.language, checksum = EXCLUDED.checksum
            RETURNING id
        """
    with conn:
        with conn.cursor() as cursor:
            params = (url, title, lang, psycopg2.Binary(checksum))
            cursor.execute(statement, params)
            returned_row = cursor.fetchone()
            return returned_row[0]

def insert_chunks(conn, doc_id: int, chunks: List[str], vectors: List[List[float]]):
    """Bulk-insert the chunks of one document together with their embeddings.

    One ``rag.chunk`` row is written per chunk: the document id, the chunk's
    position, its text, ``NULL`` for ``n_tokens``, its character count, and
    the embedding rendered as the text literal ``[v1,v2,...]`` with seven
    decimal places per component. The insert runs inside ``with conn``.

    Args:
        conn: Open database connection.
        doc_id: Id of the owning ``rag.document`` row.
        chunks: Chunk texts, in document order.
        vectors: Embeddings; ``vectors[i]`` belongs to ``chunks[i]``.

    Raises:
        ValueError: If ``chunks`` and ``vectors`` differ in length. Without
            this check ``zip`` would silently drop the unmatched tail.
    """
    if len(chunks) != len(vectors):
        raise ValueError(
            "chunks and vectors must have the same length "
            f"(got {len(chunks)} chunks and {len(vectors)} vectors)"
        )

    rows = [
        (
            doc_id,
            ordinal,
            txt,
            None,  # n_tokens is not computed here
            len(txt),
            "[" + ",".join(f"{x:.7f}" for x in vec) + "]",
        )
        for ordinal, (txt, vec) in enumerate(zip(chunks, vectors))
    ]
    with conn, conn.cursor() as cur:
        execute_values(cur, """
            INSERT INTO rag.chunk (document_id, ordinal, content, n_tokens, char_count, embedding)
            VALUES %s
        """, rows)

# ---------- 5) SEARCH ----------
def semantic_search(conn, query_vec: List[float], top_k=5):
    """Return the ``top_k`` chunks nearest to ``query_vec``.

    The query vector is rendered as the text literal ``[v1,v2,...]`` (seven
    decimal places per component) and cast to ``vector`` in SQL. Rows are
    ordered by ascending ``<=>`` distance to the stored embedding.

    Returns:
        A list of ``(id, content, cosine_similarity)`` rows, where the third
        column is computed as ``1 - (embedding <=> query)``.
    """
    components = [f"{value:.7f}" for value in query_vec]
    vector_literal = "[" + ",".join(components) + "]"
    statement = """
        SELECT id, content,
               1 - (embedding <=> %s::vector) AS cosine_similarity
        FROM rag.chunk
        ORDER BY embedding <=> %s::vector ASC
        LIMIT %s
    """
    # The literal appears twice: once in the projection, once in the ordering.
    params = (vector_literal, vector_literal, top_k)
    with conn.cursor() as cursor:
        cursor.execute(statement, params)
        matches = cursor.fetchall()
        return matches


def hybrid_search(conn, query_text: str, query_vec: List[float], top_k=5, rrf_k=60, candidate_k=200):
    """Combine vector and full-text ranking with Reciprocal Rank Fusion (RRF).

    Two candidate lists are built from ``rag.chunk``: one ranked by ``<=>``
    distance between ``embedding`` and ``query_vec``, one ranked by
    ``ts_rank_cd`` of ``content_tsv`` against ``query_text`` (rows that do not
    match the text query are excluded). Each list contributes
    ``1 / (rrf_k + rank)`` to a chunk's score; a chunk missing from a list
    contributes 0 for it.

    Args:
        conn: Open database connection.
        query_text: Text used for the full-text part of the search.
        query_vec: Embedding used for the vector part of the search.
        top_k: Number of fused results to return.
        rrf_k: The constant K in ``1 / (K + rank)``; 60 is a commonly used
            empirical value.
        candidate_k: How many top-ranked rows each list keeps before fusion.

    Returns:
        A list of ``(id, content)`` rows ordered by descending fused score.
    """
    vec_str = "[" + ",".join(f"{x:.7f}" for x in query_vec) + "]"
    # Each candidate CTE orders by its rank before LIMIT: without an ORDER BY
    # the database is free to keep any `candidate_k` rows, not the best ones.
    # The final ordering breaks score ties by id so results are deterministic.
    sql = """
    WITH sem AS (
      SELECT id, content,
             ROW_NUMBER() OVER (ORDER BY embedding <=> %s::vector ASC) AS rnk_sem
      FROM rag.chunk
      ORDER BY rnk_sem
      LIMIT %s
    ),
    fts AS (
      SELECT id, content,
             ROW_NUMBER() OVER (ORDER BY ts_rank_cd(content_tsv, plainto_tsquery('simple', %s)) DESC) AS rnk_fts
      FROM rag.chunk
      WHERE content_tsv @@ plainto_tsquery('simple', %s)
      ORDER BY rnk_fts
      LIMIT %s
    ),
    combo AS (
      SELECT COALESCE(sem.id, fts.id) AS id,
             COALESCE(sem.content, fts.content) AS content,
             (CASE WHEN sem.rnk_sem IS NULL THEN 0 ELSE 1.0/(%s + sem.rnk_sem) END) +
             (CASE WHEN fts.rnk_fts IS NULL THEN 0 ELSE 1.0/(%s + fts.rnk_fts) END) AS rrf
      FROM sem FULL OUTER JOIN fts USING (id)
    )
    SELECT id, content
    FROM combo
    ORDER BY rrf DESC, id
    LIMIT %s;
    """
    params = (
        vec_str, candidate_k,              # sem
        query_text, query_text, candidate_k,  # fts
        rrf_k, rrf_k,                      # combo
        top_k,                             # final limit
    )
    with conn.cursor() as cur:
        cur.execute(sql, params)
        return cur.fetchall()

In [5]:
# Download the article and report its title, language and cleaned length.
print("== Tải Wikipedia ==")
title, lang, text = fetch_wiki(WIKI_URL)
# SHA-256 digest of the cleaned text; passed to upsert_document in a later cell.
checksum = hashlib.sha256(text.encode("utf-8")).digest()
print(f"Title: {title} | Lang: {lang} | Length: {len(text)} chars")

# Split the cleaned text into chunks for embedding.
print("== Chia chunks bằng LangChain ==")
chunks = split_text(text)
print(f"Chunks: {len(chunks)}")

== Tải Wikipedia ==
Title: Chiến tranh Việt Nam | Lang: vi | Length: 364953 chars
== Chia chunks bằng LangChain ==
Chunks: 458


In [7]:
# Build the Bedrock embedding client.
print("== Khởi tạo Bedrock Embeddings ==")
embedder = bedrock_embedder()

# Embed every chunk.
# NOTE(review): the recorded run failed at this step with a Bedrock
# ValidationException ("The provided model identifier is invalid"); presumably
# the model is not available in the configured AWS_REGION -- confirm.
print("== Tạo embeddings ==")
vectors = embed_texts(embedder, chunks)
print("== Kết nối Postgres ==")
conn = db_connect()

# Store the document row, then its chunks and embeddings.
# upsert_document and insert_chunks each run inside `with conn`, so the final
# commit is presumably only a safeguard -- TODO confirm it is still needed.
print("== Upsert document & insert chunks ==")
doc_id = upsert_document(conn, WIKI_URL, title, lang, checksum)
insert_chunks(conn, doc_id, chunks, vectors)
conn.commit()

== Khởi tạo Bedrock Embeddings ==
== Tạo embeddings ==


Error raised by inference endpoint
Traceback (most recent call last):
  File "c:\Users\thanh\anaconda3\envs\python312\Lib\site-packages\langchain_aws\embeddings\bedrock.py", line 198, in _invoke_model
    response = self.client.invoke_model(
               ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thanh\anaconda3\envs\python312\Lib\site-packages\botocore\client.py", line 602, in _api_call
    return self._make_api_call(operation_name, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thanh\anaconda3\envs\python312\Lib\site-packages\botocore\context.py", line 123, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\thanh\anaconda3\envs\python312\Lib\site-packages\botocore\client.py", line 1078, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.errorfactory.ValidationException: An error occurred (ValidationException) when calling the InvokeModel operation: The provided model identifie

ValidationException: An error occurred (ValidationException) when calling the InvokeModel operation: The provided model identifier is invalid.

In [None]:
# Semantic search demo: embed one question and print the 5 nearest chunks
# with their cosine similarity (first 300 characters of each).
print("== Truy vấn thử (semantic) ==")
q = "Nguyên nhân và diễn biến chính của Chiến tranh Việt Nam là gì?"
qvec = embed_query(embedder, q)
rows = semantic_search(conn, qvec, top_k=5)
for rid, content, sim in rows:
    print(f"\n[#{rid}] cos_sim={sim:.4f}\n{content[:300]}...")

# Hybrid search demo: same question, vector and full-text ranks fused by RRF.
print("\n== Truy vấn thử (hybrid RRF) ==")
rows = hybrid_search(conn, q, qvec, top_k=5)
for rid, content in rows:
    print(f"\n[#{rid}] {content[:300]}...")

# Close the connection opened in the ingestion cell; that cell must be re-run
# before this one can be executed again.
conn.close()