# RAG pipeline

In [117]:
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import FAISS
import pandas as pd
import psycopg2
from langchain.embeddings import HuggingFaceEmbeddings



In [118]:
# Step 1: Load Data from CSV
csv_path = "rag_data.csv"  # Replace with your CSV file path
loader = CSVLoader(csv_path)
data = loader.load()

In [119]:
print(data)

[Document(metadata={'source': 'rag_data.csv', 'row': 0}, page_content='id: 1\ntext: Artificial Intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think and act like humans.'), Document(metadata={'source': 'rag_data.csv', 'row': 1}, page_content='id: 2\ntext: Machine Learning is a subset of AI that involves training algorithms to learn from and make predictions or decisions based on data.'), Document(metadata={'source': 'rag_data.csv', 'row': 2}, page_content="id: 3\ntext: Deep Learning is a specialized form of Machine Learning that uses neural networks with many layers (hence 'deep') to analyze various data types."), Document(metadata={'source': 'rag_data.csv', 'row': 3}, page_content='id: 4\ntext: Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and human language.'), Document(metadata={'source': 'rag_data.csv', 'row': 4}, page_content='id: 5\ntext: Reinforcement Learning is a type of 

In [120]:
# Step 2: Split Data into Documents
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(data)

In [121]:
print(documents)

[Document(metadata={'source': 'rag_data.csv', 'row': 0}, page_content='id: 1\ntext: Artificial Intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think and act like humans.'), Document(metadata={'source': 'rag_data.csv', 'row': 1}, page_content='id: 2\ntext: Machine Learning is a subset of AI that involves training algorithms to learn from and make predictions or decisions based on data.'), Document(metadata={'source': 'rag_data.csv', 'row': 2}, page_content="id: 3\ntext: Deep Learning is a specialized form of Machine Learning that uses neural networks with many layers (hence 'deep') to analyze various data types."), Document(metadata={'source': 'rag_data.csv', 'row': 3}, page_content='id: 4\ntext: Natural Language Processing (NLP) is a field of AI that focuses on the interaction between computers and human language.'), Document(metadata={'source': 'rag_data.csv', 'row': 4}, page_content='id: 5\ntext: Reinforcement Learning is a type of 

In [122]:
# Step 3: Use Sentence-Transformers for Embeddings
# Load a pre-trained sentence transformer model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create embeddings for documents
# embeddings = embedding_model.encode([doc.page_content for doc in documents])

# Create FAISS vector store
vectorstore = FAISS.from_documents(documents, embeddings)




In [123]:
# Step 4: Initialize LLaMA LLM via Ollama
llm = Ollama(model="llama3.1:latest")


In [124]:
# Step 5: Create the RAG Pipeline with LangChain's RetrievalQA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # You can also use "map_reduce" for larger datasets
    retriever=vectorstore.as_retriever()
)


In [125]:

# Step 6: Run a Q&A Test
query = "FPT Smart Cloud là gì"
result = qa_chain.run(query)
print(result)

FPT Smart Cloud là công ty con của FPT.


# Database integrate

In [126]:
import pandas as pd
df = pd.read_parquet('Văn Bản Pháp Luật_p100000-100019.parquet')
# Đọc file Parquet bằng pandas
parquet_file_path = 'Văn Bản Pháp Luật_p100000-100019.parquet'  # Đường dẫn đến file Parquet
df = pd.read_parquet(parquet_file_path)
df['created_date'] = pd.to_datetime(df['created_date'], format='%d/%m/%Y')
df['updated_date'] = pd.to_datetime(df['updated_date'], format='%d/%m/%Y')
df['content_raw'] =  df['content'].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
df.head()


Unnamed: 0,id,url,title,created_date,updated_date,content,content_raw
0,100000,https://thuvienphapluat.vn/van-ban/Quyen-dan-s...,Quyết định 720/QĐ-CTN năm 2020 về cho thôi quố...,2020-05-18,2020-06-03,"b'<div class=""content1"">\n <div>\n <div>\n ...","<div class=""content1"">\n <div>\n <div>\n <t..."
1,100001,https://thuvienphapluat.vn/van-ban/Quyen-dan-s...,Quyết định 719/QĐ-CTN năm 2020 về cho thôi quố...,2020-05-18,2020-06-03,"b'<div class=""content1"">\n <div>\n <div>\n ...","<div class=""content1"">\n <div>\n <div>\n <t..."
2,100002,https://thuvienphapluat.vn/van-ban/Xay-dung-Do...,Quyết định 648/QĐ-TTg năm 2020 về phê duyệt nh...,2020-05-18,2020-05-18,"b'<div class=""content1"">\n <div>\n <div>\n ...","<div class=""content1"">\n <div>\n <div>\n <t..."
3,100003,https://thuvienphapluat.vn/van-ban/Thuong-mai/...,Quyết định 1000/QĐ-UBND năm 2020 công bố danh ...,2020-05-18,2020-08-24,"b'<div class=""content1"">\n <div>\n <div>\n ...","<div class=""content1"">\n <div>\n <div>\n <t..."
4,100004,https://thuvienphapluat.vn/van-ban/Bo-may-hanh...,Quyết định 1517/QĐ-UBND năm 2020 về công bố Da...,2020-05-18,2020-09-30,"b'<div class=""content1"">\n <div>\n <div>\n ...","<div class=""content1"">\n <div>\n <div>\n <t..."


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            20 non-null     int64         
 1   url           20 non-null     object        
 2   title         20 non-null     object        
 3   created_date  20 non-null     datetime64[ns]
 4   updated_date  20 non-null     datetime64[ns]
 5   content       20 non-null     object        
 6   content_raw   20 non-null     object        
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 1.2+ KB


In [128]:
# Kết nối tới PostgreSQL
def create_connection():
    connection = psycopg2.connect(
        host="localhost",
        port="5432",
        user="postgres",
        password="123456",
        database="db_llm"
    )
    return connection

In [129]:


connection = create_connection()

cursor = connection.cursor()

try: 
    # Chèn dữ liệu từ DataFrame vào PostgreSQL
    for index, row in df.iterrows():
        cursor.execute(
            """
            INSERT INTO page_content (type, id, url, title, created_date, updated_date, content, content_raw)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            """,
            ("van_ban_phap_luat",row['id'], row['url'], row['title'], row['created_date'], row['updated_date'], row['content'], row['content_raw'])
        )

    # Xác nhận các thay đổi vào cơ sở dữ liệu
    connection.commit()

except Exception as e:
    print(e)

# Đóng kết nối
cursor.close()
connection.close()

print("Data inserted successfully!")


duplicate key value violates unique constraint "page_content_pkey"
DETAIL:  Key (type, id)=(van_ban_phap_luat, 100000) already exists.

Data inserted successfully!


In [130]:
connection = create_connection()
# Query to fetch data
query = "SELECT * FROM page_content;"

# Load data into a DataFrame
df = pd.read_sql_query(query, connection)

# Close the connection
connection.close()

df.head(2)

  df = pd.read_sql_query(query, connection)


Unnamed: 0,type,id,url,title,created_date,updated_date,content,content_raw
0,van_ban_phap_luat,100000,https://thuvienphapluat.vn/van-ban/Quyen-dan-s...,Quyết định 720/QĐ-CTN năm 2020 về cho thôi quố...,2020-05-18,2020-06-03,\x3c64697620636c6173733d22636f6e74656e7431223e...,"<div class=""content1"">\n <div>\n <div>\n <t..."
1,van_ban_phap_luat,100001,https://thuvienphapluat.vn/van-ban/Quyen-dan-s...,Quyết định 719/QĐ-CTN năm 2020 về cho thôi quố...,2020-05-18,2020-06-03,\x3c64697620636c6173733d22636f6e74656e7431223e...,"<div class=""content1"">\n <div>\n <div>\n <t..."


In [131]:

# Define the Document class
class Document:
    def __init__(self,page_content,metadata):
        self.page_content = page_content
        self.metadata = metadata
# Connect to PostgreSQL and fetch data
def fetch_data_from_postgres():
    connection = create_connection()
    query = "SELECT * FROM page_content;"
    df = pd.read_sql_query(query, connection)
    connection.close()
    return df

# Convert DataFrame to Document objects
def convert_df_to_documents(df):
    documents = [
        Document(
            page_content=f"ID: {row.id}, Title: {row.title}, Content: {row.content_raw}",  # or combine other fields if needed
            metadata={
                'id': row['id'],
                'url': row['url'],
                'title': row['title'],
                'created_date': row['created_date'],
                'updated_date': row['updated_date']
            }
        )
        for _, row in df.iterrows()
    ]
    return documents

# Fetch data
df = fetch_data_from_postgres()

# Convert to Document objects
documents = convert_df_to_documents(df)

# Example usage
# for doc in documents:
#     print(f"page_content: {doc.page_content}")  # 

  df = pd.read_sql_query(query, connection)


In [132]:
vectorstore = FAISS.from_documents(documents, embeddings)

In [133]:
# Step 5: Create the RAG Pipeline with LangChain's RetrievalQA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # You can also use "map_reduce" for larger datasets
    retriever=vectorstore.as_retriever()
)

In [134]:



# Step 6: Run a Q&A Test
query = "Quyết định 720/QĐ-CTN năm 2020 về cho thôi quốc tịch Việt Nam đối với 95 công dân hiện đang cư trú tại Đức do Chủ tịch nước ban hành"
result = qa_chain.run(query)
print(result)


Quyết định 720/QĐ-CTN năm 2020 về cho thôi quốc tịch Việt Nam đối với 95 công dân hiện đang cư trú tại Đức do Chủ tịch nước ban hành.

Thông tin được cung cấp ở phần đầu của văn bản, nêu rõ rằng quyết định này liên quan đến việc cho thôi quốc tịch Việt Nam đối với 95 công dân hiện đang cư trú tại Đức.


# Q&A 

In [135]:
from fuzzywuzzy import process


In [142]:
def get_faq_data():
    # Database connection parameters
    conn = create_connection()
    
    # Query to fetch data
    query = "SELECT question, answer FROM faq;"
    
    # Fetch data into a DataFrame
    df = pd.read_sql_query(query, conn)
    
    # Close the connection
    conn.close()
    
    return df

df = get_faq_data()
df.head()

  df = pd.read_sql_query(query, conn)


Unnamed: 0,question,answer
0,What is the return policy?,You can return items within 30 days of purchase.
1,How do I reset my password?,Click 'Forgot Password' on the login page.
2,What are the store hours?,"We are open from 9 AM to 9 PM, Monday to Satur..."
3,Where can I find my order history?,Log in to your account and go to 'Order History'.
4,Do you offer international shipping?,"Yes, we ship to many countries worldwide."


In [137]:

# Convert DataFrame to Document objects
def convert_df_to_documents(df):
    documents = [
        Document(
            page_content=f"question: {row.question}",  # or combine other fields if needed
            metadata={
                'question': row['question'],
                'answer': row['answer'],
            }
        )
        for _, row in df.iterrows()
    ]
    return documents


In [138]:
question_documents = convert_df_to_documents(df)

In [139]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

# Initialize HuggingFaceEmbeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# Initialize FAISS vector store
vector_store = FAISS.from_documents(question_documents, embedding_model)




In [140]:
def search_similar_question(user_query):

    # Find the most similar question
    search_results = vector_store.similarity_search(user_query)
    
    
    return [{'question': item.metadata['question'],'answer': item.metadata['answer'] } for item in search_results]
    
    # else:
    #     return "Sorry, I couldn't find a similar question."


## Search 

In [141]:
# Example usage
user_query = "delivery"
search_similar_question(user_query)

[{'question': 'Do you offer international shipping?',
  'answer': 'Yes, we ship to many countries worldwide.'},
 {'question': 'What are the store hours?',
  'answer': 'We are open from 9 AM to 9 PM, Monday to Saturday.'},
 {'question': 'Where can I find my order history?',
  'answer': "Log in to your account and go to 'Order History'."},
 {'question': 'What is the return policy?',
  'answer': 'You can return items within 30 days of purchase.'}]