In [12]:
!pip install pandas openpyxl sqlite-utils
!pip install sentence-transformers faiss-cpu
!pip install transformers torch tqdm




In [13]:
import os
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline


In [14]:
DATA_PATH = "MoSPI_RAG_Dataset.xlsx"

df = pd.read_excel(DATA_PATH)
df.head()


Unnamed: 0,document_id,title,date_published,category,url,summary,pdf_link,extracted_text
0,1,Press Note on Consumer Price Index (CPI),2024-06-12,Economic Indicators,https://mospi.gov.in/press-release/cpi-june-2024,The Consumer Price Index for June 2024 shows a...,https://mospi.gov.in/sites/default/files/repor...,The Consumer Price Index (CPI) for June 2024 i...
1,2,Index of Industrial Production (IIP) April 2024,2024-05-28,Industrial Statistics,https://mospi.gov.in/press-release/iip-april-2024,Industrial production recorded a growth of 5.2...,https://mospi.gov.in/sites/default/files/repor...,The Index of Industrial Production (IIP) grew ...
2,3,Employment Situation in India – Quarterly Report,2024-03-15,Labour Statistics,https://mospi.gov.in/publication/employment-qu...,This report presents key employment indicators...,https://mospi.gov.in/sites/default/files/repor...,The quarterly employment survey indicates an i...


In [15]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   document_id     3 non-null      int64 
 1   title           3 non-null      object
 2   date_published  3 non-null      object
 3   category        3 non-null      object
 4   url             3 non-null      object
 5   summary         3 non-null      object
 6   pdf_link        3 non-null      object
 7   extracted_text  3 non-null      object
dtypes: int64(1), object(7)
memory usage: 324.0+ bytes


In [16]:
# Rename columns (safe for SQL & pipelines)
df.columns = [col.strip().lower() for col in df.columns]

# Drop empty rows
df.dropna(subset=["title", "extracted_text"], inplace=True)

# Convert date to standard format
df["date_published"] = pd.to_datetime(df["date_published"], errors="coerce")

df.reset_index(drop=True, inplace=True)
df.head()


Unnamed: 0,document_id,title,date_published,category,url,summary,pdf_link,extracted_text
0,1,Press Note on Consumer Price Index (CPI),2024-06-12,Economic Indicators,https://mospi.gov.in/press-release/cpi-june-2024,The Consumer Price Index for June 2024 shows a...,https://mospi.gov.in/sites/default/files/repor...,The Consumer Price Index (CPI) for June 2024 i...
1,2,Index of Industrial Production (IIP) April 2024,2024-05-28,Industrial Statistics,https://mospi.gov.in/press-release/iip-april-2024,Industrial production recorded a growth of 5.2...,https://mospi.gov.in/sites/default/files/repor...,The Index of Industrial Production (IIP) grew ...
2,3,Employment Situation in India – Quarterly Report,2024-03-15,Labour Statistics,https://mospi.gov.in/publication/employment-qu...,This report presents key employment indicators...,https://mospi.gov.in/sites/default/files/repor...,The quarterly employment survey indicates an i...


In [17]:
os.makedirs("data/processed", exist_ok=True)

conn = sqlite3.connect("data/processed/mospi_documents.db")

df.to_sql(
    "documents",
    conn,
    if_exists="replace",
    index=False,
    dtype={col: "TEXT" for col in df.columns}
)

conn.close()

print("✅ Data saved to SQLite")


✅ Data saved to SQLite


In [18]:
def chunk_text(text, chunk_size=800, overlap=200):
    words = text.split()
    chunks = []

    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)

    return chunks
chunks = []

for _, row in df.iterrows():
    text_chunks = chunk_text(row["extracted_text"])
    for ch in text_chunks:
        chunks.append({
            "document_id": row["document_id"],
            "title": row["title"],
            "url": row["url"],
            "chunk_text": ch
        })

chunk_df = pd.DataFrame(chunks)
chunk_df.head()


Unnamed: 0,document_id,title,url,chunk_text
0,1,Press Note on Consumer Price Index (CPI),https://mospi.gov.in/press-release/cpi-june-2024,The Consumer Price Index (CPI) for June 2024 i...
1,2,Index of Industrial Production (IIP) April 2024,https://mospi.gov.in/press-release/iip-april-2024,The Index of Industrial Production (IIP) grew ...
2,3,Employment Situation in India – Quarterly Report,https://mospi.gov.in/publication/employment-qu...,The quarterly employment survey indicates an i...


In [19]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embedding_model.encode(
    chunk_df["chunk_text"].tolist(),
    show_progress_bar=True
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

index.add(np.array(embeddings))

print("✅ FAISS index created")
print("Total vectors:", index.ntotal)


✅ FAISS index created
Total vectors: 3


In [21]:
generator = pipeline(
    "text-generation",
    model="gpt2"
)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [22]:
def ask_question(question, top_k=3):
    # Embed question
    q_embedding = embedding_model.encode([question])

    # Retrieve top-k chunks
    distances, indices = index.search(np.array(q_embedding), top_k)

    context = ""
    citations = set()

    for idx in indices[0]:
        context += chunk_df.iloc[idx]["chunk_text"] + "\n"
        citations.add(chunk_df.iloc[idx]["url"])

    prompt = f"""
Answer strictly from the context below.
If the answer is not present, say:
"I don't have that information in my data."

Context:
{context}

Question:
{question}

Answer:
"""

    response = generator(prompt, max_length=300, do_sample=False)[0]["generated_text"]

    return response, list(citations)


In [25]:
answer, sources = ask_question(
    "Summarize the Index of Industrial Production report."
)

print("ANSWER:\n")
print(answer)

print("\nSOURCES:")
for s in sources:
    print(s)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ANSWER:


Answer strictly from the context below.
If the answer is not present, say:
"I don't have that information in my data."

Context:
The Index of Industrial Production (IIP) grew by 5.2% in April 2024, driven by manufacturing sector growth...
The Consumer Price Index (CPI) for June 2024 increased by 4.8 percent compared to last year...
The quarterly employment survey indicates an improvement in labour force participation rate...


Question:
Summarize the Index of Industrial Production report.

Answer:

The Index of Industrial Production (IIP) grew by 5.2% in April 2024, driven by manufacturing sector growth...

The Consumer Price Index (CPI) for June 2024 increased by 4.8 percent compared to last year...

The quarterly employment survey indicates an improvement in labour force participation rate...


Question:

Summarize the Index of Industrial Production report.

Answer:

The Index of Industrial Production (IIP) grew by 5.2% in April 2024, driven by manufacturing sector growth..