### https://youtu.be/-ikCYKcPoqU?si=jrirfjYtbY8Ctraq

In [4]:

import ollama
import time
import PyPDF2

import os
import logging
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
import ollama


In [5]:
def extract_text_from_pdf(file_path):
    try:
        with open(file_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text += page.extract_text()
            return text
    except Exception as e:
        raise ValueError("Failed to extract text from PDF.")

In [6]:
def split_into_chunks(text, chunk_size):
    words = text.split()
    chunks = []
    current_chunk = []
    
    for word in words:
        current_chunk.append(word)
        if len(current_chunk) >= chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            
    if current_chunk:
        chunks.append(' '.join(current_chunk))
        
    return chunks

In [7]:

file_path = '/Users/user/Documents/Project/___e-zest___/Rainclouds-Global-Solutions-Assignment/pdf_data/Slice of FedRAMP SSP - Security Controls - Custom.pdf'
long_string = extract_text_from_pdf(file_path)
chunk_size = 2000
chunks = split_into_chunks(long_string, chunk_size)

processed_chunks = 0  # Initialize counter for processed chunks


In [15]:
len(chunks[0].split(" "))

2000

In [8]:
def create_embeddings(chunk):
    response = ollama.embeddings(model='nomic-embed-text', prompt=chunk)
    return response.embedding


In [9]:
#---- For local ollama nomic embedding setup ----
from langchain_ollama import OllamaEmbeddings
embeddings_local = OllamaEmbeddings(
    model="nomic-embed-text"
)

In [10]:
#---- With API key -----
import os
NOMIC_API_KEY = "nk-Aky6vgbrRPHkNsKLMZcTxeUmGbOwu6VPvC30I79_PuI"
os.environ["NOMIC_API_KEY"] = NOMIC_API_KEY
from langchain_nomic import NomicEmbeddings

embeddings_api = NomicEmbeddings(model="nomic-embed-text-v1.5")

In [11]:
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector, _get_embedding_collection_store


In [12]:
# Database connection configuration
DB_CONFIG = {
    "host": "localhost",  # Update for your setup
    "port": 6006,
    "user": "langchain",   # Update for your username
    "password": "langchain",  # Replace with your password
    "dbname": "vector_store",
}

In [13]:
# Step 1: Initialize the Vector Store
def connect_to_pgvector():
    #response = ollama.embeddings(model='nomic-embed-text', prompt=chunk)
    # PGVector requires the table to exist beforehand
    connection_string = f"postgresql+psycopg://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['dbname']}"
    collection_name = "embeddings" # its not table name its actually - embeddings - 1fc1de07-0269-4725-9a17-e32fea717182 [collection id]
                                   # in langchain_pg_collection  table
    vector_store = PGVector(
        embeddings=embeddings_api,
        collection_name=collection_name,
        connection=connection_string,
        use_jsonb=True,
    )
    return vector_store

In [14]:
vector_store = connect_to_pgvector()
vector_store

<langchain_postgres.vectorstores.PGVector at 0x12b5091f0>

In [15]:
# insert embeddings into vector store ie postgres
def insert_embeddings(vector_store, text_chunks):
    docs = []
    for i, chunk in enumerate(text_chunks):
        doc = Document(
            page_content=chunk,
            # metadata={"id": i,"chunk_text":chunk,"embedding":create_embeddings(chunk)}
            metadata={"id": i}
        )
        docs.append(doc)
    print(docs)
        
    vector_store.add_documents(docs, ids=[doc.metadata["id"] for doc in docs])
    print("Embeddings inserted successfully into the 'embeddings' table!")
insert_embeddings(vector_store, chunks)

[Document(metadata={'id': 0}, page_content='Access Control AC-01 Policy and Procedures a. Develop, document, and disseminate to [Assignment: organization-defined personnel or roles]: 1. [Selection (one or more): Organization-level; Mission/business process-level; System-level] access control policy that: (a) Addresses purpose, scope, roles, responsibilities, management commitment, coordination among organizational entities, and compliance; and (b) Is consistent with applicable laws, executive orders, directives, regulations, policies, standards, and guidelines; and 2. Procedures to facilitate the implementation of the access control policy and the associated access controls; b. Designate an [Assignment: organization-defined official] to manage the development, documentation, and dissemination of the access control policy and procedures; and c. Review and update the current access control: 1. Policy [FedRAMP Assignment: at least annually] and following [Assignment: organization-defined 

In [16]:
#-------- Just testing connection -------
import psycopg2
conn = psycopg2.connect(**DB_CONFIG)
cur = conn.cursor()
cur.execute('SELECT 1')
result = cur.fetchone()
#print(f"result - {result}")
if result and result[0] == 1:
    print("Test query executed successfully. Connection is valid.")
else:
    print("Test query did not return the expected result.")

Test query executed successfully. Connection is valid.


In [17]:
# ---- display data thet is stored which is not needed in actually ----
import psycopg2
def read_data():
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()
        query = "SELECT * FROM public.langchain_pg_embedding;"
        cursor.execute(query)
        rows = cursor.fetchall()
        for row in rows:
            print(f"ID: {row[0]}, Text: {row[3][:50]}, Embedding: {row[2][:100]}")
        cursor.close()
        conn.close()
    except Exception as e:
        print(f"Error reading data: {e}")
read_data()

ID: 0, Text: Access Control AC-01 Policy and Procedures a. Deve, Embedding: [0.028945923,0.033416748,-0.14819336,-0.0181427,0.072509766,-0.026672363,0.060760498,0.04385376,0.08
ID: 1, Text: apply permissions to that user's session. Customer, Embedding: [0.01687622,0.055786133,-0.15332031,-0.01965332,0.060913086,-0.031921387,0.04498291,0.047302246,0.07
ID: 2, Text: to manage user accounts in AD, see the following l, Embedding: [0.030319214,0.042388916,-0.17529297,-0.034118652,0.045715332,-0.025115967,0.0647583,0.03250122,0.09
ID: 3, Text: to privileged roles will be retained for 90 days; , Embedding: [0.014953613,0.04949951,-0.16845703,-0.032043457,0.04849243,-0.03366089,0.04940796,0.046081543,0.050
ID: 4, Text: systems to operate, blocking all other ports, prot, Embedding: [0.04837036,0.079956055,-0.15332031,-0.06390381,0.08062744,-0.064819336,0.046173096,0.03451538,0.038
ID: 5, Text: Origination (check all that apply): ☐ Service Prov, Embedding: [0.035736084,0.068237305,-0.1484375,-0.

In [54]:
from langchain.prompts import PromptTemplate
from langchain_ollama import ChatOllama

# Initialize the local Llama model
llm = ChatOllama(model="qwen:1.8b")


In [32]:
from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever

def create_retriever(vector_store, llm):
    QUERY_PROMPT = PromptTemplate(
        input_variables=["question"],
        template="""You are assisting with retrieving relevant data from a System Security Plan (SSP). 
Generate three alternative versions of the user's question to better retrieve relevant documents.
Original question: {question}
"""
    )

    # Create MultiQueryRetriever
    retriever = MultiQueryRetriever.from_llm(
        retriever=vector_store.as_retriever(),  # Convert the vector store to a retriever
        llm=llm,  
        prompt=QUERY_PROMPT  
    )

    print("Retriever created successfully!")
    return retriever
retriever = create_retriever(vector_store, llm)

Retriever created successfully!


In [55]:

def similarity_check(quesiton, vector_store):
    docs = vector_store.similarity_search(quesiton)
    return docs

In [None]:
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
retriever = BM25Retriever.from_documents(
    [
       
    ]
)

In [42]:
# def create_retriever_(vector_store, llm):
#     from langchain.chains import create_retrieval_chain
#     from langchain.chains.combine_documents import create_stuff_documents_chain
#     from langchain_core.prompts import ChatPromptTemplate

#     QUERY_PROMPT = PromptTemplate(
#         input_variables=["question"],
#         template="""You are assisting with retrieving relevant data from a System Security Plan (SSP). 
#         Generate three alternative versions of the user's question to better retrieve relevant documents.
#         Original question: {question}
#         """
#             )

#     prompt = ChatPromptTemplate.from_messages(
#         [
#             ("system", QUERY_PROMPT),
#             ("human", "{input}"),
#         ]
#     )


#     question_answer_chain = create_stuff_documents_chain(llm, prompt)
#     rag_chain = create_retrieval_chain(retriever, question_answer_chain)

#     response = rag_chain.invoke({"input": "What is Task Decomposition?"})
#     print(response["answer"])
    

In [56]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
def create_chain(llm, retriever_similarity):
    RAG_TEMPLATE = """You are a security assistant. Answer the question based on the following SSP context:
                {context}
                Question: {question}

                Provide a clear, concise, and accurate answer.
                """
    rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)
    rag_chain = (
                    RunnablePassthrough.assign(context=lambda input: format_docs(input["context"]))
                    | rag_prompt
                    | llm
                    | StrOutputParser()
                )
    print("Chain created successfully!")
    return rag_chain
question = "What is the Control Origination for AC-06?"
retriever_similarity = similarity_check(question, vector_store)
chain = create_chain(llm, retriever_similarity)



Chain created successfully!


In [57]:
response = chain.invoke({"context": retriever_similarity, "question": question})

In [58]:
response

'The Control Origination (CO) for AC-06 refers to the set of actions taken by authorized users (AAU) during the operation of the product.\n\nAC-06 is a software program that implements a multi-user access control system for a specific product or service. The CO specifies the set of actions taken by authorized users (AAU) during the operation of the product.\n\nThe CO specifies the following set of actions:\n\n1. A User (AU) requests access to an area within the product that has been allocated to them.\n2. A User (AU) executes a login request by sending their username and password as credentials in the HTTP request.\n3. The System (S) validates the login request using the stored credentials or other authentication mechanisms available within the CO.\n4. Once the login request is validated successfully, the System (S) grants access to the requested area within the product.\n\nTherefore, the Control Origination (CO) for AC-06 specifies a set of actions taken by authorized users (AAU) duri

In [59]:
len(response)

1382

In [None]:
from fastapi import FastAPI, UploadFile, HTTPException
def fetch_all_documents():
    try:
        conn = psycopg2.connect(**DB_CONFIG)
        cur = conn.cursor()
        # Assuming the collection name is 'embeddings'
        query = "SELECT * FROM public.langchain_pg_embedding;"
        cur.execute(query)
        rows = cur.fetchall()
        cur.close()
        conn.close()
        # Convert rows into a list of Document objects
        docs = [Document(page_content=row[3], metadata={"id": row[0], "embedding": row[2]}) for row in rows]
        return docs
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching documents: {str(e)}")

fetch_all_documents()

[('0', '1fc1de07-0269-4725-9a17-e32fea717182', '[0.028945923,0.033416748,-0.14819336,-0.0181427,0.072509766,-0.026672363,0.060760498,0.04385376,0.08013916,0.06500244,-0.0057373047,-0.0063095093,0.08215332,0.01826477,0.055633545,0.01360321,0.004348755,-0.022506714,-0.045684814,-0.035614014,0.03894043,-0.058563232,-0.02708435,-0.01928711,0.090270996,-0.0010824203,0.015548706,-0.035858154,-0.08728027,-0.022872925,0.04550171,0.023956299,0.043914795,-0.024917603,0.015106201,-0.09844971,0.036743164,0.030303955,-0.028656006,0.006427765,0.03274536,-0.02784729,-0.005115509,-0.029907227,-0.0056533813,-0.014595032,0.05722046,0.00094366074,0.06390381,-0.037231445,0.045837402,-0.026367188,-0.0054893494,0.023620605,0.072387695,-0.0115356445,-0.011528015,0.046691895,-0.005027771,-0.028640747,0.08520508,0.08605957,-0.0007867813,0.05847168,0.050811768,0.018295288,0.014595032,0.035858154,0.0054893494,-0.04937744,0.019241333,-0.01373291,0.033599854,-0.027267456,0.020385742,-0.062042236,-0.04458618,-0.049

[Document(metadata={'id': '0', 'embedding': '[0.028945923,0.033416748,-0.14819336,-0.0181427,0.072509766,-0.026672363,0.060760498,0.04385376,0.08013916,0.06500244,-0.0057373047,-0.0063095093,0.08215332,0.01826477,0.055633545,0.01360321,0.004348755,-0.022506714,-0.045684814,-0.035614014,0.03894043,-0.058563232,-0.02708435,-0.01928711,0.090270996,-0.0010824203,0.015548706,-0.035858154,-0.08728027,-0.022872925,0.04550171,0.023956299,0.043914795,-0.024917603,0.015106201,-0.09844971,0.036743164,0.030303955,-0.028656006,0.006427765,0.03274536,-0.02784729,-0.005115509,-0.029907227,-0.0056533813,-0.014595032,0.05722046,0.00094366074,0.06390381,-0.037231445,0.045837402,-0.026367188,-0.0054893494,0.023620605,0.072387695,-0.0115356445,-0.011528015,0.046691895,-0.005027771,-0.028640747,0.08520508,0.08605957,-0.0007867813,0.05847168,0.050811768,0.018295288,0.014595032,0.035858154,0.0054893494,-0.04937744,0.019241333,-0.01373291,0.033599854,-0.027267456,0.020385742,-0.062042236,-0.04458618,-0.049041