# Extract and Preprocess Documents

In [1]:
import os

import pdfplumber
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Load the PDF and extract the text of every page
pdf_path = "anual.pdf"
data = []

with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):
        # extract_text() can yield None for a page without extractable text
        # (e.g. image-only pages on some pdfplumber versions). Coerce to ""
        # so that every record holds a string; a None here would later break
        # both the TF-IDF fit and the string join that builds the context.
        text = page.extract_text() or ""
        # page_number is the zero-based index from enumerate(), i.e. one less
        # than the printed "Page N" label that appears in the extracted text.
        data.append({"page_number": i, "text": text})


In [3]:
# Convert the per-page records into a DataFrame: one row per PDF page, with
# the columns "page_number" and "text" taken from the record keys.
df = pd.DataFrame(data)
# Last expression of the cell, so the first rows are rendered as a preview.
df.head()

Unnamed: 0,page_number,text
0,0,2022 - 23\nवा�ष�क ��तवेदन\nAnnual Report\nआईआई...
1,1,पृ�/ Page 2\nWinning Entry: Eco-Friendly\nGane...
2,2,पृ�/ Page 3\n/ Contents\nअनु�म�णका\n4-5 प�रचय ...
3,3,पृ�/ Page 4\nप�रचय\nभारतीय सूचना �ौ�ोिगक� स�ं ...
4,4,पृ�/ Page 5\nIntroduction\nThe Indian Institut...


In [4]:
# TF-IDF Vectorization
# min_df=3 ignores terms that occur in fewer than 3 pages, and
# stop_words='english' applies scikit-learn's built-in English stop-word list
# only, so the Hindi text present in the extracted pages is not stop-word
# filtered.
vectorizer = TfidfVectorizer(min_df=3, stop_words='english')
# Fit on the page texts and encode them: one matrix row per row of `df`.
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Define Query Processing in the RAG Model

In [5]:
# Retrieve the pages whose TF-IDF vectors best match a query
def retrieve_relevant_docs(query, vectorizer, tfidf_matrix, df, top_n=3):
    """Return the rows of ``df`` that score highest against ``query``.

    Args:
        query: Free-text query string.
        vectorizer: Fitted vectorizer. Only its ``transform`` method is used,
            to project the query into the feature space of ``tfidf_matrix``.
        tfidf_matrix: Sparse document-term matrix with one row per row of
            ``df`` (same order).
        df: DataFrame of documents, positionally aligned with ``tfidf_matrix``.
        top_n: Maximum number of rows to return. Values <= 0 return an empty
            frame.

    Returns:
        Up to ``top_n`` rows of ``df``, ordered from highest to lowest score.
        Rows are selected purely by rank, so rows with a score of 0 are still
        returned when fewer than ``top_n`` documents match the query.
    """
    query_vec = vectorizer.transform([query])
    # Dot product of every document row with the query vector. This equals the
    # cosine similarity as long as both sides are L2-normalised, which is
    # TfidfVectorizer's default. `@` is used instead of `*` because `*` is only
    # a matrix product for scipy.sparse *matrix* classes and is element-wise
    # for sparse *arrays*.
    scores = (tfidf_matrix @ query_vec.T).toarray().flatten()
    # Reverse first, then truncate: the earlier `[-top_n:]` slice returned
    # every row for top_n == 0 because `[-0:]` is the whole array.
    limit = max(top_n, 0)
    top_indices = scores.argsort()[::-1][:limit]
    return df.iloc[top_indices]

In [6]:
# Example query: fetch the pages that best match a question about placements
query = "What is the placement status of students?"
relevant_docs = retrieve_relevant_docs(
    query=query,
    vectorizer=vectorizer,
    tfidf_matrix=tfidf_matrix,
    df=df,
)
print(relevant_docs)

    page_number                                               text
39           39  पृ / Page 40\n/Placements\n ेसम ट\nAt IIITR, o...
95           95  पृ / Page 96\nCOSA\nछा  प रषद ( ) /\nCouncil o...
93           93  पृ / Page 94\nआईआईटीआर छा  /\nIIITR Students\n...


# Integrate with the Smaller Generative Model (Gemini 1.5 Flash)

In [7]:
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Configure the Generative AI API
# SECURITY: the key is read from the GOOGLE_API_KEY environment variable rather
# than being hardcoded, so it is never saved or shared with the notebook. Any
# key that was previously committed in this cell must be treated as leaked and
# revoked. A missing variable raises KeyError('GOOGLE_API_KEY') right here
# instead of failing later on the first API call.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

In [9]:
# Combine relevant documents into a single context
def prepare_context(relevant_docs):
    """Build the context preamble that is prepended to the model prompt.

    Args:
        relevant_docs: DataFrame with a ``text`` column holding the retrieved
            page texts, in the order they should appear in the context.

    Returns:
        The string ``"Context: <texts joined by single spaces>\\n\\n"``.
    """
    # A page without extractable text can be stored as None/NaN, and str.join
    # raises TypeError on any non-string item, so missing entries are dropped
    # before joining. String entries pass through astype(str) unchanged.
    texts = relevant_docs['text'].dropna().astype(str)
    context = " ".join(texts)
    return f"Context: {context}\n\n"

In [10]:
# Generate a response using Google Generative AI
def generate_response_with_genai(prompt, context):
    """Send ``context`` followed by ``prompt`` to the notebook-level ``model``.

    The two strings are concatenated as-is (context first, no separator added
    here) and passed to ``model.generate_content``. The ``text`` attribute of
    the returned response is handed back to the caller.
    """
    full_prompt = context + prompt
    return model.generate_content(full_prompt).text

In [11]:
# Example: answer a prompt using the previously retrieved pages as context
# NOTE: `relevant_docs` was retrieved for the earlier placement query, not for
# this prompt, so the answer depends on whichever pages that query returned.
context = prepare_context(relevant_docs)
prompt = "What is General secretary."
# Arguments are passed by keyword because the signature is (prompt, context).
# The earlier positional call had the two swapped, which sent the question
# *before* the context instead of after it.
response = generate_response_with_genai(prompt=prompt, context=context)
print(response)

In the context of the IIIT Raichur Council of Student Affairs (COSA), the General Secretary is a key member of the student council.  The General Secretary, in this instance held by Abhyuday Choumal and Beerelly Srinitha, has responsibilities that include:

* **Representation:** Representing the entire student body.  They were part of the IIITR senate.
* **Coordination:** Coordinating the functionalities of student clubs.
* **Issue Management:** Addressing mess and hostel-related issues.
* **Event Organization:** Hosting various academic and non-academic events.
* **Liaison:**  Holding meetings with college authorities and secretaries as needed.
* **General Body Matters:** Organizing and coordinating matters concerning the General Body of students.

Essentially, the General Secretary acts as a vital link between the student body and the college administration, handling a wide range of administrative and organizational tasks to support student life.



# Combine into a Unified Pipeline (Updated)


In [12]:
def unified_pipeline_with_genai(query, pdf_path, top_n=3):
    """Answer ``query`` from a PDF by retrieving pages and prompting the model.

    Args:
        query: Question to answer. It is used both to rank the pages and as
            the prompt sent to the model.
        pdf_path: Path of the PDF file to read.
        top_n: Number of best-matching pages to include in the context
            (default 3, the value used before this parameter existed).

    Returns:
        The text of the model's response.

    Note:
        The PDF is re-read and the TF-IDF model re-fitted on every call.
    """
    # Step 1: Extract text from the PDF
    with pdfplumber.open(pdf_path) as pdf:
        data = [
            # extract_text() can yield None for a page without extractable
            # text; "" keeps the TF-IDF fit and the context join from failing.
            {"page_number": i, "text": page.extract_text() or ""}
            for i, page in enumerate(pdf.pages)
        ]
    df = pd.DataFrame(data)

    # Step 2: TF-IDF vectorization
    vectorizer = TfidfVectorizer(min_df=3, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['text'])

    # Step 3: Retrieve relevant documents
    relevant_docs = retrieve_relevant_docs(
        query, vectorizer, tfidf_matrix, df, top_n=top_n
    )
    context = prepare_context(relevant_docs)

    # Step 4: Generate a response using Google Generative AI
    return generate_response_with_genai(query, context)




In [13]:
# Baseline: send the question to the model directly, with no retrieved
# context, so the answer can be compared with the pipeline's answer.
# Note that `response` is bound to the response object returned by
# generate_content here; the cell's last expression displays its text.
response = model.generate_content("Tell me who is the General secretary.")
response.text

'The term "General Secretary" can refer to different positions in different organizations.  To answer your question accurately, I need to know **which organization** you\'re asking about.  For example, are you asking about the General Secretary of:\n\n* The United Nations?\n* A specific political party (e.g., the Communist Party of China)?\n* A trade union?\n* A specific non-profit organization?\n\nPlease specify the organization.\n'

In [14]:
# Example: run the full retrieve-then-generate pipeline for one question
query = "Tell me who is the General secretary."
pdf_path = "anual.pdf"
final_response = unified_pipeline_with_genai(query=query, pdf_path=pdf_path)
print(final_response)

The General Secretaries for the period April 2022 – March 2023 were Abhyuday Choumal and Beerelly Srinitha.

