Step 1: Extract Text from a PDF File

In [59]:
!pip install pymupdf



In [60]:
import fitz

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        The text of all pages that produced any text, each followed by a
        newline, or ``None`` (after printing a warning) when the combined
        text is empty or whitespace-only.
    """
    page_texts = []
    # The context manager closes the document even when a page fails to parse;
    # a trailing close() call is skipped whenever get_text() raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text()
            if page_text:  # Only keep pages that yielded some text
                page_texts.append(page_text + "\n")

    # Join once instead of growing a string page by page.
    text = "".join(page_texts)
    if not text.strip():
        print("Warning: No extractable text found in the PDF (might be scanned or empty).")
        return None
    return text


In [61]:
# NOTE(review): a ".crdownload" suffix usually marks a partially downloaded
# file from a Chromium-based browser — confirm the download actually finished.
pdf_file_path = '/content/2306.12345v2.pdf.crdownload'
# extract_text_from_pdf returns None (after printing a warning) when the PDF
# yields no text, so extracted_text may be None here.
extracted_text = extract_text_from_pdf(pdf_file_path)

Step 2: Split the Text into Chunks

In [62]:
import nltk
# Fetch the 'punkt_tab' NLTK data package (presumably the tokenizer data that
# sent_tokenize, used in the next step, relies on — verify against NLTK docs).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [63]:
from nltk.tokenize import sent_tokenize

def split_text(text, max_tokens=300):
    """Group the sentences of ``text`` into chunks of bounded size.

    Sentences come from ``sent_tokenize`` and are packed greedily, in order,
    into chunks. "Tokens" here are whitespace-separated words, not model
    tokens. A single sentence longer than ``max_tokens`` is kept whole as its
    own (oversized) chunk rather than being cut.

    Args:
        text: The text to split. ``None``, empty or whitespace-only input
            yields an empty list.
        max_tokens: Maximum number of words per chunk.

    Returns:
        A list of non-empty chunk strings, sentences joined by single spaces.
    """
    if not text or not text.strip():
        return []

    chunks = []
    current = []        # sentences of the chunk being built
    current_words = 0   # running word count, so the chunk is never re-split
    for sentence in sent_tokenize(text):
        n_words = len(sentence.split())
        # Flush only a non-empty chunk: an oversized first sentence must not
        # produce a leading empty chunk.
        if current and current_words + n_words > max_tokens:
            chunks.append(" ".join(current).strip())
            current, current_words = [], 0
        current.append(sentence)
        current_words += n_words
    if current:
        chunks.append(" ".join(current).strip())
    return chunks


In [64]:
# Chunk the extracted text using split_text's default max_tokens=300.
chunks = split_text(extracted_text)
# NOTE(review): a bare `chunks` echoes the entire list into the cell output.
chunks

['The Effect of Noise on the Emergence of Continuous Norms and its Evolutionary\nDynamics\nStavros Anagnou, Daniel Polani and Christoph Salge\nAdaptive Systems Research Group, School of Computer Science, University of Hertfordshire\ns.anagnou@herts.ac.uk\nAbstract\nWe examine the effect of noise on societies of agents\nusing an agent-based model of evolutionary norm emergence. Generally, we see that noisy societies are more selfish,\nsmaller and discontent, and are caught in rounds of perpetual\npunishment preventing them from flourishing. Surprisingly,\ndespite the detrimental effect of noise on the population,\nit does not seem to evolve away. We carry out further\nanalysis and provide reasons for why this might be the\ncase. Furthermore, we claim that our framework that evolves\nthe noise/ambiguity of norms is a new way to model the\ntight/loose framework of norms, suggesting that despite\nambiguous norms’ detrimental effect on society, evolution\ndoes not favour clarity. Introducti

Step 3: Convert Chunks into Embeddings using Sentence Transformers

In [65]:
from sentence_transformers import SentenceTransformer

# Loaded once at notebook level; get_embeddings reads this global, and it is
# also passed explicitly into the retrieval step.
model = SentenceTransformer('all-MiniLM-L6-v2')  # or any other HuggingFace model

def get_embeddings(chunks):
    """Encode ``chunks`` with the notebook-level ``model``.

    Args:
        chunks: The items to embed; forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``chunks``.
    """
    vectors = model.encode(chunks)
    return vectors

In [66]:
# Encode every chunk via get_embeddings (which uses the notebook-level model).
embeddings=  get_embeddings(chunks)
# Bare expression so the array repr is shown as the cell output.
embeddings

array([[-0.03070017, -0.06606123, -0.09377114, ...,  0.02346476,
        -0.02628483, -0.07389904],
       [-0.07668386, -0.02485039, -0.10057943, ..., -0.03395756,
        -0.07292785, -0.0096894 ],
       [-0.02396323, -0.11088822, -0.083583  , ...,  0.0373457 ,
        -0.04568674, -0.01966629],
       ...,
       [-0.06986897, -0.0629409 ,  0.05596201, ...,  0.01489595,
        -0.04363016, -0.01387018],
       [-0.0821212 , -0.0379324 , -0.02623501, ..., -0.07212733,
        -0.03868184, -0.02542508],
       [-0.04388836, -0.024596  ,  0.01417989, ..., -0.08192163,
        -0.02370076, -0.0494038 ]], dtype=float32)

Step 4: Store Embeddings in FAISS Vector Store

In [67]:
!pip install faiss-cpu -q
import faiss
import numpy as np

def create_faiss_index(embeddings):
    """Build a flat (exact) L2 FAISS index over ``embeddings``.

    Args:
        embeddings: A 2-D array-like of shape (n_vectors, dimension). It is
            converted to a C-contiguous float32 array, the layout FAISS
            accepts, so float64 arrays and nested lists work as well.

    Returns:
        A ``faiss.IndexFlatL2`` holding the vectors in their input order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (e.g. an empty or 1-D array).
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dimension); got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index


In [68]:
# Index the chunk embeddings; retrieve_similar_chunks later maps the ids
# returned by a search back to chunks by list position.
faiss_index = create_faiss_index(embeddings)
# Bare expression so the index repr is shown as the cell output.
faiss_index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7f517bb4acd0> >

In [69]:
import pickle

def save_mapping(chunks, path="chunk_mapping.pkl"):
    """Pickle ``chunks`` to ``path`` and return the same object.

    Args:
        chunks: The object to serialize (the list of text chunks).
        path: Destination file; opened in binary write mode, so an existing
            file is overwritten.

    Returns:
        ``chunks`` itself, unchanged.
    """
    with open(path, "wb") as out_file:
        pickle.Pickler(out_file).dump(chunks)
    return chunks

# save_mapping returns the object it was given, so chunk_mapping is the same
# list as `chunks` (now also persisted to chunk_mapping.pkl).
chunk_mapping = save_mapping(chunks)
# This branch can only trigger when `chunks` itself was None.
if chunk_mapping is None:
    print("chunk_mapping is None! Fix text chunk creation or loading.")
else:
    print("chunk_mapping saved successfully!")

chunk_mapping saved successfully!


Step 5: RAG pipeline

In [70]:
def retrieve_similar_chunks(query, model, index, chunk_mapping, top_k=3):
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: The question text to embed.
        model: Object with an ``encode(list_of_str)`` method returning one
            vector per input string.
        index: Object with a ``search(vectors, k)`` method returning
            ``(distances, indices)``, as a FAISS index does.
        chunk_mapping: Sequence of chunks, positionally aligned with the
            vectors stored in ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order returned by the search. Fewer are
        returned when the index holds fewer than ``top_k`` vectors.
    """
    # FAISS expects float32 queries.
    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)
    _distances, indices = index.search(query_embedding, top_k)
    # FAISS pads missing results with -1; without this filter Python's
    # negative indexing would silently return the *last* chunk instead.
    return [chunk_mapping[i] for i in indices[0] if 0 <= i < len(chunk_mapping)]
# result = retrieve_similar_chunks('What are the main benefits of using AI according to the document?', model, faiss_index, chunk_mapping)
# print(result)

In [71]:
import requests

def generate_answer_from_llm(context, query, api_key,
                             model="meta-llama/llama-4-scout-17b-16e-instruct",
                             timeout=60):
    """Ask the Groq chat-completions endpoint to answer ``query`` from ``context``.

    Args:
        context: Retrieved text placed in the prompt's "Context" section.
        query: The user's question.
        api_key: Groq API key, sent as a Bearer token.
        model: Model identifier sent in the request body.
        timeout: Seconds to wait for the HTTP request; without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The ``content`` string of the first choice's message.

    Raises:
        RuntimeError: If the API responds with a non-200 status, or the
            response has no (or an empty) ``choices`` list.
    """
    prompt = f"""You are a helpful assistant. Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:"""

    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        json={
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.2,
            "max_tokens": 1024
        },
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"API error {response.status_code}: {response.text}")

    data = response.json()

    # Treat a missing key and an empty list alike, so the failure is a
    # descriptive error rather than an IndexError on choices[0].
    choices = data.get('choices')
    if not choices:
        raise RuntimeError(f"'choices' not found in response: {data}")

    return choices[0]['message']['content']


# An ad-hoc smoke test used to live here. It joined `result` into a
# module-level `context`, but `result` is only assigned in the final cell, so
# the line raised NameError on Restart & Run All; its commented-out call also
# embedded an API key in plain text. Use rag_pipeline() for an end-to-end check.

In [72]:
def rag_pipeline(query, faiss_index, model, chunk_mapping, api_key):
    """Answer ``query`` by retrieving relevant chunks and prompting the LLM.

    Args:
        query: The user's question.
        faiss_index: Vector index searched for the query embedding.
        model: Embedding model used to encode the query.
        chunk_mapping: Chunks positionally aligned with the index vectors.
        api_key: API key forwarded to ``generate_answer_from_llm``.

    Returns:
        The answer produced by ``generate_answer_from_llm``.
    """
    # Retrieved chunks are newline-joined into a single context string.
    context = "\n".join(
        retrieve_similar_chunks(query, model, faiss_index, chunk_mapping)
    )
    return generate_answer_from_llm(context, query, api_key)


Step 6: Getting Response

In [73]:
import os
from getpass import getpass

query = 'What are the main benefits of using AI according to the document?'
# Never store API keys in the notebook: read the key from the environment and
# fall back to an interactive prompt. The key that was previously hard-coded
# here should be treated as leaked and revoked.
api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
result = rag_pipeline(query, faiss_index, model, chunk_mapping, api_key)
print(result)

The document does not mention the main benefits of using AI. The text appears to be a research paper discussing the emergence of social norms in agent-based models, the effects of noise on these norms, and the evolutionary dynamics of noise. It does not mention Artificial Intelligence (AI) or its benefits.
