In [None]:
!pip install -U datasets


In [None]:
from datasets import load_dataset

# Load the "allenai/qasper" dataset at the refs/convert/parquet revision.
ds = load_dataset(
    "allenai/qasper",
    revision="refs/convert/parquet"
)

# Print the loaded object to inspect what was returned.
print(ds)


In [None]:
# Work with the "train" split from here on.
train_ds = ds["train"]
print(len(train_ds))  # number of records in the split
train_ds[0]  # display the first record


In [None]:
# Inspect which top-level fields a single record carries.
sample = train_ds[0]
sample.keys()


In [None]:
# First question text of the first record ("qas" -> "question" -> index 0).
sample_qas = train_ds[0]["qas"]["question"][0]
sample_qas

In [None]:
# Print every question of the first record together with the free-form answer
# of its first answer entry.
qas = train_ds[0]["qas"]

for i, question_text in enumerate(qas["question"]):
    # Values are pulled into locals / single-quoted keys so the f-strings do not
    # nest double quotes, which is a SyntaxError on Python versions before 3.12.
    print(f"Question: {question_text}")
    # Access the first free_form_answer for the current question
    if len(qas["answers"]) > i and len(qas["answers"][i]["answer"]) > 0:
        first_answer = qas["answers"][i]["answer"][0]
        print(f"Answer: {first_answer['free_form_answer']}")
    else:
        print("Answer: Not available or unanswerable.")

# Preprocessing

In [None]:
processed_data = []

# Flatten the nested per-paper structure into one record per question.
for item in train_ds:
    qas_data = item["qas"]
    answer_entries = qas_data["answers"]

    # Paper-level fields shared by every question of this paper.
    paper_fields = {
        "paper_id": item["id"],
        "title": item["title"],
        "abstract": item["abstract"],
    }

    for i, question in enumerate(qas_data["question"]):
        # Use the free_form_answer of the first answer object when one exists
        # for this question index; otherwise fall back to an empty string.
        annotations = answer_entries[i]["answer"] if i < len(answer_entries) else []
        if len(annotations) > 0:
            answer = annotations[0]["free_form_answer"]
        else:
            answer = ""

        # Note: an answer type is not read here; it would have to be derived
        # from the answer object itself if it is needed later.
        processed_data.append({
            **paper_fields,
            "question": question,
            "answer": answer,
            "context": item["full_text"],
        })

In [None]:
import pandas as pd

# Build a DataFrame from the list of record dicts (one row per record).
df = pd.DataFrame(processed_data)
df.head()


In [None]:
# Count missing values per column.
df.isnull().sum()


In [None]:
# Keep only rows whose answer string is non-empty. The explicit .copy() makes
# the result an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
df = df[df["answer"].str.len() > 0].copy()


In [None]:
def _word_count(text):
    """Number of whitespace-separated tokens in a string."""
    return len(text.split())


def _context_word_count(context):
    """Word count over all paragraphs of all sections in a context dict."""
    flattened = ' '.join([' '.join(section) for section in context['paragraphs']])
    return len(flattened.split())


# Word-count features for the question, the answer and the full paper text.
df["question_length"] = df["question"].map(_word_count)
df["answer_length"] = df["answer"].map(_word_count)
df["context_length"] = df["context"].map(_context_word_count)

# Data Visualization

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): this expression is not the last statement of its cell and its
# result is not assigned, so the unique values are computed and then discarded.
df["answer"].unique()

def clean_answer(x):
    """Map a raw answer value onto a coarse label.

    Non-string values yield "unknown". Strings are lower-cased and scanned for
    marker substrings in a fixed priority order; the first match decides the
    label, and "unknown" is returned when nothing matches.
    """
    if not isinstance(x, str):
        return "unknown"

    lowered = x.lower()
    # (substring to look for, label to return) -- order is significant.
    markers = (
        ("extractive", "extractive"),
        ("abstractive", "abstractive"),
        ("yes", "yes/no"),
        ("unanswerable", "unanswerable"),
    )
    return next((label for needle, label in markers if needle in lowered), "unknown")

# Label every answer with clean_answer and store the result in a new column.
# NOTE(review): df["answer"] was filled from the free_form_answer text during
# preprocessing, so clean_answer is matching keywords inside answer prose rather
# than reading a dedicated answer-type field -- confirm this is what the
# "Answer Type" chart is meant to show.
df["answer_clean"] = df["answer"].apply(clean_answer)



# Frequency of each label.
counts = df["answer_clean"].value_counts()

# Bar chart of the label frequencies.
plt.figure(figsize=(6,4))
counts.plot(kind="bar")
plt.title("Answer Type Distribution (Cleaned)")
plt.xlabel("Answer Type")
plt.ylabel("Count")
plt.show()



In [None]:
# Histogram of question lengths in words, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df["question_length"], bins=30)
ax.set(
    title="Question Length Distribution",
    xlabel="Number of Words",
    ylabel="Frequency",
)
plt.show()


In [None]:
# Histogram of answer lengths in words, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df["answer_length"], bins=30)
ax.set(
    title="Answer Length Distribution",
    xlabel="Number of Words",
    ylabel="Frequency",
)
plt.show()


In [None]:
# Histogram of full-paper lengths in words, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df["context_length"], bins=50)
ax.set(
    title="Context Length Distribution",
    xlabel="Words in Full Paper",
    ylabel="Frequency",
)
plt.show()


In [None]:
# Scatter plot of question length against answer length (both in words).
fig, ax = plt.subplots()
ax.scatter(df["question_length"], df["answer_length"])
ax.set(
    xlabel="Question Length",
    ylabel="Answer Length",
    title="Question vs Answer Length",
)
plt.show()


In [None]:
from collections import Counter

# Token frequencies over all questions (lower-cased, whitespace-split).
question_tokens = " ".join(df["question"]).lower().split()
common_words = Counter(question_tokens).most_common(20)

# Unzip the (word, count) pairs into two parallel tuples for plotting.
words, counts = zip(*common_words)

fig, ax = plt.subplots()
ax.bar(words, counts)
plt.xticks(rotation=45)
ax.set_title("Top 20 Question Words")
plt.show()


# Text Chunking

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with sentence-transformers/all-mpnet-base-v2.
# NOTE(review): chunk_text below splits on whitespace and never calls this
# tokenizer -- confirm whether loading it here is still needed.
tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/all-mpnet-base-v2"
)

def chunk_text(text, chunk_size=300, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: Input string; it is split into words with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must be smaller
            than ``chunk_size`` so that the window advances.

    Returns:
        List of chunk strings in document order (empty for empty text). The
        last chunk is the first window that reaches the end of the text, so no
        chunk is ever a pure repeat of the tail of the previous one.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        # A non-positive stride would never advance and loop forever.
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start : start + chunk_size]))
        # This window already covers the remaining words; a further window
        # would only repeat words that are part of this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks


# Apply Chunking on Context

In [None]:
all_chunks = []
metadata = []

# df holds one row per (paper, question), and every row of a paper carries the
# same full text. Chunk each paper exactly once: chunking per row would embed
# and index identical chunks once per question, so a top-k search would mostly
# return copies of the same passage.
for paper_id, paper_rows in df.groupby("paper_id", sort=False):
    paper_context = paper_rows["context"].iloc[0]
    paper_questions = paper_rows["question"].tolist()

    # Extract and join the text from the 'paragraphs' list (one list of
    # paragraphs per section) into a single string.
    full_text_content = ' '.join(' '.join(section) for section in paper_context['paragraphs'])

    for ch in chunk_text(full_text_content):
        all_chunks.append(ch)
        metadata.append({
            "paper_id": paper_id,
            # First question of the paper; kept so code reading this key still works.
            "question": paper_questions[0],
            # Every question asked about this paper.
            "questions": list(paper_questions),
        })

In [None]:
!pip install sentence-transformers faiss-cpu


# Text Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for both chunks and queries.
embed_model = SentenceTransformer("all-mpnet-base-v2")

# Encode every chunk in a single call, showing a progress bar.
# NOTE(review): retrieve_chunks assumes row i of `embeddings` corresponds to
# all_chunks[i] -- confirm encode() preserves input order.
embeddings = embed_model.encode(
    all_chunks,
    show_progress_bar=True
)


In [None]:
import faiss
import numpy as np

# Vector width is read from the second axis of the embedding matrix.
dimension = embeddings.shape[1]

# Create a FAISS IndexFlatL2 of that width and add all chunk embeddings to it.
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

# Report how many vectors the index now holds.
print("Total vectors:", index.ntotal)


In [None]:
# Persist the index to the file "qasper_faiss.index" in the working directory.
faiss.write_index(index, "qasper_faiss.index")


In [None]:
def retrieve_chunks(query, k=5):
    """Return the text of the chunks nearest to a query in the FAISS index.

    Args:
        query: Query string; it is embedded with ``embed_model``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` strings from ``all_chunks``, in the order the index search
        returned them. Fewer (possibly zero) are returned when the index holds
        fewer than ``k`` vectors or ``k`` is not positive.
    """
    if k <= 0 or index.ntotal == 0:
        return []

    q_emb = embed_model.encode([query])
    _, neighbor_ids = index.search(np.array(q_emb), k)

    # FAISS fills missing neighbours with the id -1; drop those so they do not
    # silently resolve to all_chunks[-1] (the last chunk).
    return [all_chunks[i] for i in neighbor_ids[0] if i >= 0]


In [None]:
def build_prompt(question, contexts):
    """Assemble the LLM prompt from a question and its retrieved passages.

    The passages are joined with blank lines under a "Context:" heading,
    followed by the question under "Question:" and a closing instruction.
    """
    template = (
        "\n"
        "You are an AI assistant answering questions from research papers.\n"
        "\n"
        "Context:\n"
        "{context_text}\n"
        "\n"
        "Question:\n"
        "{question}\n"
        "\n"
        "Answer clearly and concisely.\n"
    )
    return template.format(context_text="\n\n".join(contexts), question=question)


In [None]:
from transformers import pipeline

# flan-t5 is an encoder-decoder (sequence-to-sequence) model, so it must be
# served through the "text2text-generation" task; "text-generation" expects a
# decoder-only causal language model and cannot load a T5 checkpoint.
# The result keeps the shape read by rag_answer: [{"generated_text": ...}].
llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_new_tokens=200
)

def rag_answer(question):
    """Answer a question with retrieval-augmented generation.

    Retrieves chunks for the question, wraps them in a prompt and returns the
    "generated_text" of the first result produced by ``llm``.
    """
    generations = llm(build_prompt(question, retrieve_chunks(question)))
    return generations[0]["generated_text"]


In [None]:
# Smoke-test the pipeline end to end with a sample question.
rag_answer("What is the main contribution of the paper?")


In [None]:
def format_qa(row):
    """Turn one QA record into a text-to-text training example.

    Args:
        row: Record supporting ``row[key]`` access with "question", "context"
            and "answer" entries. "context" may be a plain string or a dict
            with a "paragraphs" entry (one list of paragraphs per section).

    Returns:
        Dict with "input_text" ("question: ... context: ...") and
        "target_text" (the answer).
    """
    context = row["context"]
    # Flatten the nested paragraph structure into plain text; interpolating the
    # dict directly would put its Python repr into the model input.
    if isinstance(context, dict) and "paragraphs" in context:
        context = " ".join(" ".join(section) for section in context["paragraphs"])

    return {
        "input_text": f"question: {row['question']} context: {context}",
        "target_text": row["answer"]
    }

# Cap the sample size at 3000 rows, or at the frame size when it is smaller,
# so that df.sample() is never asked for more rows than exist.
num_samples = min(3000, len(df))

# Draw a fixed-seed sample (random_state=42), convert each row with format_qa,
# and collect the resulting dicts in a list.
train_data = df.sample(n=num_samples, random_state=42).apply(format_qa, axis=1).tolist()

In [None]:
from transformers import T5Tokenizer

# Load the flan-t5-base tokenizer into the module-level name `tokenizer`,
# which the tokenize() function below reads.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

def tokenize(batch):
    """Tokenize model inputs and targets for sequence-to-sequence fine-tuning.

    Args:
        batch: Mapping with "input_text" and "target_text" entries (a single
            string each, or lists of strings for a batched call).

    Returns:
        The tokenized inputs (padded/truncated to 512 tokens) with an added
        "labels" entry holding the target token ids (padded/truncated to 128
        tokens), in which every padding position is replaced by -100.
    """
    inputs = tokenizer(
        batch["input_text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    targets = tokenizer(
        batch["target_text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # -100 is the label value the training loss ignores, so padded
        # positions do not contribute to it.
        return [-100 if token_id == pad_id else token_id for token_id in ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: a flat id sequence.
        inputs["labels"] = _mask_padding(label_ids)
    return inputs


In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the pretrained flan-t5-base checkpoint for conditional generation.
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# Training configuration: outputs go to ./qasper_ft, batch size 2 per device,
# 2 epochs, logging every 100 steps, checkpoint every 500 steps.
# NOTE(review): no Trainer is constructed from `model`/`args` in the visible
# cells, and `train_data` is never passed through tokenize() -- confirm whether
# the training step lives elsewhere.
args = TrainingArguments(
    output_dir="./qasper_ft",
    per_device_train_batch_size=2,
    num_train_epochs=2,
    logging_steps=100,
    save_steps=500
)


In [None]:
import gradio as gr

def chatbot(question, history):
    """Gradio callback: answer a question and extend the chat transcript.

    Args:
        question: Text submitted by the user.
        history: Current transcript as a list of (question, answer) pairs;
            ``None`` is treated as an empty transcript.

    Returns:
        Tuple ``(updated_history, "")``. The second element is written back to
        the question textbox and therefore clears it.
    """
    # Work on a copy so the caller's list is never mutated, and tolerate a
    # missing transcript.
    updated = list(history) if history else []

    # Nothing to answer for a blank submission; skip retrieval and generation.
    if not question or not question.strip():
        return updated, ""

    updated.append((question, rag_answer(question)))
    return updated, ""

# Gradio UI: a chat transcript, a question box and Ask/Clear buttons wired to
# the chatbot() callback. The emoji in the user-facing strings were stored as
# mis-decoded bytes (UTF-8 read as a single-byte encoding) and are restored here.
with gr.Blocks(title="Research Paper Q&A Bot") as demo:

    gr.Markdown("""
    # 📄 Research Paper Q&A Bot (QASPER)
    Ask questions about research papers using **LLM + FAISS (RAG pipeline)**.

    🔹 Powered by QASPER Dataset
    🔹 Supports long research papers
    🔹 Evidence-based answers
    """)

    chatbot_ui = gr.Chatbot(
        label="💬 Chat",
        height=400
    )

    with gr.Row():
        question_box = gr.Textbox(
            placeholder="Ask a question about the paper...",
            label="Your Question",
            lines=2
        )

    with gr.Row():
        submit_btn = gr.Button("🚀 Ask")
        clear_btn = gr.Button("🧹 Clear")

    gr.Markdown("""
    ### 🧪 Example Questions
    - What is the main contribution of the paper?
    - What dataset is used in the experiments?
    - What evaluation metrics are used?
    - Does the paper compare with previous methods?
    """)

    # Clicking "Ask" and pressing Enter in the textbox both run chatbot();
    # its two return values update the transcript and reset the textbox.
    submit_btn.click(
        fn=chatbot,
        inputs=[question_box, chatbot_ui],
        outputs=[chatbot_ui, question_box]
    )

    question_box.submit(
        fn=chatbot,
        inputs=[question_box, chatbot_ui],
        outputs=[chatbot_ui, question_box]
    )

    # "Clear" replaces the transcript with an empty list.
    clear_btn.click(
        fn=lambda: [],
        outputs=chatbot_ui
    )

demo.launch()
