# Installing Packages

In [1]:
!pip install streamlit langgraph langchain_huggingface transformers peft datasets torch accelerate

Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting langgraph
  Downloading langgraph-1.0.2-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting langgraph-checkpoint<4.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-3.0.0-py3-none-any.whl.metadata (4.2 kB)
Collecting langgraph-prebuilt<1.1.0,>=1.0.2 (from langgraph)
  Downloading langgraph_prebuilt-1.0.2-py3-none-any.whl.metadata (5.0 kB)
Collecting langgraph-sdk<0.3.0,>=0.2.2 (from langgraph)
  Downloading langgraph_sdk-0.2.9-py3-none-any.whl.metadata (1.5 kB)
Collecting langchain-core>=0.1 (from langgraph)
  Downloading langchain_core-1.0.2-py3-none-any.whl.metadata (3.5 kB)
Collecting ormsgpack>=1.10.0 (from langgraph-checkpoint<4.0.0,>=2.1.0->langgraph)
  Dow

In [2]:
!pip install rouge-score bert-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=3459861dc98423fcd1e74b275d2750940ccc167b5e44bad2298405c4b8ccc1f9
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score, bert-score
Successfully installed bert-score-0.3.13 rouge-score-0.1.2


In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


# Importing Libraries


In [4]:
from langgraph.graph import StateGraph, START, END
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from typing import TypedDict, Dict, List
import requests
import os
import re

In [5]:
from bert_score import score as bert_score
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from rouge_score import rouge_scorer
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

### MODEL INITIALIZATION

In [38]:
def initialize_llm(api_key: str):
    """Build the hosted chat model used by the graph nodes.

    Args:
        api_key: Hugging Face access token. Surrounding whitespace is removed
            before use.

    Returns:
        A ``ChatHuggingFace`` wrapper around a ``HuggingFaceEndpoint`` for the
        ``openai/gpt-oss-120b`` model.

    Raises:
        ValueError: If ``api_key`` is empty or only whitespace.

    Side effects:
        Sets the ``HUGGINGFACEHUB_API_TOKEN`` environment variable.
    """
    token = (api_key or "").strip()
    if not token:
        # Fail early instead of exporting a blank token and getting an opaque
        # authentication error on the first request.
        raise ValueError("A non-empty Hugging Face API token is required.")

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = token
    llm = HuggingFaceEndpoint(
        # "openai/gpt-oss-120b" is a Hub model id, not a URL, so it belongs in
        # `repo_id`; `endpoint_url` is meant for a deployed endpoint address.
        repo_id="openai/gpt-oss-120b",
        task="text-generation",
        huggingfacehub_api_token=token
    )
    return ChatHuggingFace(llm=llm)


# STATE STRUCTURE
class PaperInfo(TypedDict):
    """State dictionary passed between the graph nodes in this notebook.

    The main cell creates it with empty lists and an empty ``result``; each
    node fills in its own fields and returns the same dictionary.
    """
    # Research topic typed in by the user.
    prompt: str
    # Title ideas written by generate_titles; get_papers uses them as search queries.
    topic: List[str]
    # Number of titles to generate / papers to look up.
    top_search: int
    # The four lists below are appended to together by get_papers, one entry per paper found.
    title: List[str]
    abstract: List[str]
    url: List[str]
    citationCount: List[int]
    # Final summary text written by draft_answer.
    result: str
    # Model object; the nodes call its `invoke(prompt)` and read `.content` from the reply.
    model: object

### TITLE GENERATION


In [20]:
def generate_titles(Info: PaperInfo) -> PaperInfo:
    """Ask the model for title ideas on ``Info['prompt']`` and store them in ``Info['topic']``.

    The model is asked for a numbered list. Parsing is tolerant of common
    formatting drift: leading indentation, ``1)`` as well as ``1.`` markers and
    markdown bold (``**Title**``). When at least one numbered line is present,
    only numbered lines are kept, so introductory or closing sentences are not
    mistaken for titles. If the reply contains no numbered lines at all, every
    non-empty line is used instead.

    Args:
        Info: Graph state. Reads ``prompt``, ``top_search`` and ``model``.

    Returns:
        The same ``Info`` dictionary with ``topic`` replaced by at most
        ``top_search`` titles, each longer than three characters.
    """
    base_topic = Info['prompt']
    prompt = f"""
Generate exactly {Info['top_search']} unique research paper title ideas
that are all directly related to the topic: "{base_topic}".
Each title must be academic-sounding and focused on different
aspects (methods, challenges, applications, or improvements)
within this same topic.
Format each title as:
1. Title text
2. Title text
and so on.
    """
    response = Info['model'].invoke(prompt)
    text = response.content or ""

    # List markers such as "1. " or "2) " at the start of an already-stripped line.
    numbering = re.compile(r'^\d+\s*[.)]\s*')

    candidates = []  # (title, came_from_numbered_line)
    for raw_line in text.split('\n'):
        # Strip whitespace and markdown emphasis before looking for the number,
        # otherwise an indented or bold line keeps its "1." prefix.
        line = raw_line.strip().strip('*').strip()
        if not line:
            continue
        marker = numbering.match(line)
        title = line[marker.end():] if marker else line
        title = title.strip().strip('*').strip()
        if len(title) > 3:
            candidates.append((title, marker is not None))

    numbered = [title for title, is_numbered in candidates if is_numbered]
    titles = numbered if numbered else [title for title, _ in candidates]

    Info['topic'] = titles[:Info['top_search']]
    return Info

### PAPER FETCH


In [21]:
def get_papers(Info: PaperInfo) -> PaperInfo:
    """Look up one paper per generated title on the Semantic Scholar search API.

    For each of the first ``top_search`` entries of ``Info['topic']`` a query of
    the form ``"<prompt> <title idea>"`` is sent, and the first hit (if any) is
    appended to ``title``, ``abstract``, ``url`` and ``citationCount``.

    A failed request (network error, timeout, non-2xx status such as a rate
    limit, or an invalid JSON body) is reported with ``print`` and skipped, so
    one bad query does not abort the whole run. Missing fields are stored as
    ``""`` (text fields) or ``0`` (citation count) instead of ``None``.

    Args:
        Info: Graph state. Reads ``prompt``, ``topic`` and ``top_search``.

    Returns:
        The same ``Info`` dictionary with the four paper lists extended.
    """
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    main_topic = Info['prompt']
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    for sub_topic in Info['topic'][:max(Info['top_search'], 0)]:
        search_query = f"{main_topic} {sub_topic}"
        params = {
            "query": search_query,
            "fields": "title,url,abstract,citationCount",
            "limit": 1,
            "offset": 0
        }
        try:
            response = requests.get(url, params=params, timeout=request_timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            print(f"Skipping query {search_query!r}: {exc}")
            continue

        papers = data.get("data") if isinstance(data, dict) else None
        for paper in papers or []:
            Info["abstract"].append(paper.get("abstract") or "")
            Info["title"].append(paper.get("title") or "")
            Info["url"].append(paper.get("url") or "")
            Info["citationCount"].append(paper.get("citationCount") or 0)
    return Info

### SUMMARIZATION


In [22]:
def draft_answer(Info: PaperInfo) -> PaperInfo:
    """Summarize every fetched paper and store the text in ``Info['result']``.

    A single prompt is built from the parallel ``title`` / ``abstract`` /
    ``url`` / ``citationCount`` lists. A missing abstract is written as
    "Not available" rather than the literal text "None". The requested length
    is stated as 150 words throughout the prompt.

    Args:
        Info: Graph state. Reads ``prompt``, the four paper lists and ``model``.

    Returns:
        The same ``Info`` dictionary with ``result`` set to the model output.
    """
    prompt = f"""Using the following research papers, create a detailed summary for each paper on the topic '{Info['prompt']}'.
Each summary should be around 150 words, clearly explaining the paper's purpose, methods, and key findings.

"""
    papers = zip(Info['title'], Info['abstract'], Info['url'], Info['citationCount'])
    for title, abstract, url, citations in papers:
        prompt += f"Title: {title}\n"
        prompt += f"Abstract: {abstract or 'Not available'}\n"
        prompt += f"URL: {url}\n"
        prompt += f"Citations: {citations}\n\n"
    # The format template previously said "~200 words", contradicting the
    # 150-word instruction given twice above.
    prompt += "Now write a detailed 150-word summary for each paper.\nFormat:\nTitle:\nCitations:\nAbstract Summary (~150 words):\nURL:\n"

    if hasattr(Info['model'], 'invoke'):
        response = Info['model'].invoke(prompt)
        Info['result'] = response.content
    else:
        # Fallback for an object exposing raw `.tokenizer` and `.model`
        # attributes instead of `invoke`.
        # NOTE(review): `temperature` is normally honoured only when sampling
        # is enabled (`do_sample=True`); the call is left unchanged here so
        # decoding stays as it was. Confirm the intended decoding mode.
        tokenizer = Info['model'].tokenizer
        model = Info['model'].model
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs, max_new_tokens=400, temperature=0.7)
        Info['result'] = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return Info

### GRAPH SETUP


In [23]:
# Linear pipeline: START -> generate_titles -> get_papers -> draft_answer -> END.
graph = StateGraph(PaperInfo)

# Node name paired with the function that implements it, in execution order.
_PIPELINE = (
    ("generate_titles", generate_titles),
    ("get_papers", get_papers),
    ("draft_answer", draft_answer),
)
for _node_name, _node_fn in _PIPELINE:
    graph.add_node(_node_name, _node_fn)

# Connect each stop on the route to the next one.
_ROUTE = [START] + [_name for _name, _ in _PIPELINE] + [END]
for _source, _target in zip(_ROUTE, _ROUTE[1:]):
    graph.add_edge(_source, _target)

research_paper_graph = graph.compile()

# FINE-TUNING


In [24]:
def fine_tune_model(base_model="google/flan-t5-base", dataset_name="scientific_papers", output_dir="./finetuned_model",
                    train_samples=1000, eval_samples=200):
    """LoRA fine-tune a seq2seq model to write abstracts from article text.

    Args:
        base_model: Checkpoint name passed to ``from_pretrained``.
        dataset_name: Dataset name passed to ``load_dataset`` together with the
            ``"pubmed"`` configuration. It must provide ``train`` and
            ``validation`` splits with ``article`` and ``abstract`` columns.
        output_dir: Directory for checkpoints, the saved model and the tokenizer.
        train_samples: Maximum number of training rows to use.
        eval_samples: Maximum number of validation rows to use.

    Returns:
        ``output_dir``, so the result can be handed to ``load_finetuned_model``.
    """
    print("Loading dataset...")
    dataset = load_dataset(dataset_name, "pubmed")

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForSeq2SeqLM.from_pretrained(base_model)

    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q", "v"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_2_SEQ_LM"
    )
    model = get_peft_model(model, lora_config)

    def preprocess_function(examples):
        """Tokenize articles as inputs and abstracts as labels."""
        inputs = ["summarize: " + doc for doc in examples["article"]]
        model_inputs = tokenizer(inputs, max_length=512, truncation=True)
        # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
        labels = tokenizer(text_target=examples["abstract"], max_length=150, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def prepare_split(split_name, limit):
        """Take the first `limit` rows of a split, then tokenize only those rows."""
        split = dataset[split_name]
        subset = split.select(range(min(limit, len(split))))
        # Selecting first avoids tokenizing the full corpus; dropping the raw
        # text columns leaves only fields the collator can pad.
        return subset.map(preprocess_function, batched=True, remove_columns=subset.column_names)

    train_dataset = prepare_split("train", train_samples)
    eval_dataset = prepare_split("validation", eval_samples)

    training_args = TrainingArguments(
        output_dir=output_dir,
        # Named `evaluation_strategy` in older transformers releases.
        eval_strategy="epoch",
        learning_rate=2e-4,
        per_device_train_batch_size=2,
        num_train_epochs=1,
        weight_decay=0.01,
        save_total_limit=1,
        logging_dir='./logs',
        logging_steps=10
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # Examples are tokenized without padding, so each batch must be padded
        # (inputs and labels) at collation time.
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    )

    trainer.train()
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Fine-tuning complete. Model saved at", output_dir)
    return output_dir

In [25]:
# LOAD FINE-TUNED MODEL
def load_finetuned_model(path: str):
    """Load a saved seq2seq model and tokenizer from ``path`` behind an ``invoke`` interface.

    The returned object has ``invoke(text)``, whose result exposes the
    generated text as ``.content`` so it can be used where the hosted chat
    model is used.
    """
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(path)
    text_tokenizer = AutoTokenizer.from_pretrained(path)

    def _as_response(generated_text):
        # Lightweight stand-in exposing only the `content` attribute.
        return type("Response", (), {"content": generated_text})

    class FineTunedModel:
        def invoke(self, text):
            encoded = text_tokenizer(text, return_tensors="pt", truncation=True)
            generated_ids = seq2seq.generate(**encoded, max_length=200)
            return _as_response(
                text_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            )

    return FineTunedModel()

### Rationale for Choosing Fine-Tuning Target

**Model:** google/flan-t5-base

**Method:** LoRA fine-tuning

**Dataset:** Scientific Papers (PubMed subset)

The base model, Flan-T5, performs well on general summarization but lacks domain knowledge for technical research papers. Fine-tuning it on PubMed article–abstract pairs (each full article is the input and its abstract is the target summary) helps the model learn academic structure, terminology, and factual summarization style.

This setup ensures the model:

* Focuses on **key findings and methods** instead of surface-level details.
* Writes in a **formal, research-oriented tone**.
* Achieves higher **accuracy and coherence** in summaries.

LoRA is used for efficiency, as it updates only small parameter sets—saving compute while maintaining quality.
Overall, this approach creates a lightweight, domain-adapted model optimized for **scientific paper summarization** tasks.


# EVALUATION FUNCTION

In [26]:
def evaluate_summary(generated_summary: str, reference_text: str):
    """
    Score a generated summary against a reference text.

    Returns a dict with two entries: "ROUGE" holds the rouge1 / rouge2 / rougeL
    result of the ROUGE scorer (stemming enabled), and "BERTScore" holds the
    mean Precision, Recall and F1 as plain numbers.
    """
    rouge_variants = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)
    # The scorer takes the reference first and the candidate second.
    report = {"ROUGE": scorer.score(reference_text, generated_summary)}

    # BERTScore takes parallel lists: candidates first, references second.
    precision, recall, f1 = bert_score([generated_summary], [reference_text], lang="en", verbose=False)
    report["BERTScore"] = {
        label: values.mean().item()
        for label, values in (("Precision", precision), ("Recall", recall), ("F1", f1))
    }

    return report


# MAIN EXECUTION


In [39]:
if __name__ == "__main__":
    # Interactive driver: pick a model, ask for a topic and a paper count, run
    # the graph, print the summaries, then optionally score them against a
    # reference text typed in by the user.
    import getpass

    use_finetuned = False  # Toggle True to use your LoRA fine-tuned model

    if use_finetuned:
        model = load_finetuned_model("./finetuned_model")
    else:
        # Never keep the token in the notebook source: take it from the
        # environment, or prompt for it without echoing.
        api_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass.getpass(
            "Enter your Hugging Face API token: "
        )
        model = initialize_llm(api_key)

    topic = input("Enter your research topic: ").strip()

    # Re-prompt until a positive whole number is entered instead of crashing on
    # input such as "three" or "0".
    top_n = 0
    while top_n < 1:
        raw_count = input("Enter how many papers to summarize (e.g., 3): ").strip()
        try:
            top_n = int(raw_count)
        except ValueError:
            top_n = 0
        if top_n < 1:
            print("Please enter a whole number of at least 1.")

    initial_info: PaperInfo = {
        "prompt": topic,
        "topic": [],
        "top_search": top_n,
        "title": [],
        "abstract": [],
        "url": [],
        "citationCount": [],
        "result": "",
        "model": model,
    }

    result = research_paper_graph.invoke(initial_info)

    print("\n\nSUMMARIZED RESULT")
    # Drop anything that cannot be encoded as UTF-8 before printing.
    safe_output = result["result"].encode("utf-8", errors="ignore").decode("utf-8")
    print(safe_output)


    print("\n\nEVALUATION METRICS")
    reference_text = input("\nEnter reference text (true summary for evaluation):\n")
    if reference_text.strip():
        metrics = evaluate_summary(safe_output, reference_text)
        print(metrics)
    else:
        # Scoring against an empty reference produces meaningless numbers.
        print("No reference text provided; skipping evaluation.")

Enter your research topic: Transformers in Machine Learning
Enter how many papers to summarize (e.g., 3): 3


SUMMARIZED RESULT
**Title:** Attention Is All You Need  
**Citations:** Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, Ł., & Polosukhin, I. (2017). *Advances in Neural Information Processing Systems*, 30.  
**Abstract Summary (~150‑200 words):**  
This seminal work introduces the Transformer architecture, which dispenses with recurrence and convolution in favor of a pure attention mechanism. The authors propose multi‑head self‑attention to capture relationships between all token pairs in a sequence, coupled with position‑wise feed‑forward networks, residual connections, and layer normalization. Positional encodings inject order information. By stacking encoder and decoder blocks, the model achieves state‑of‑the‑art results on machine translation benchmarks (WMT 2014 English‑German and English‑French) while dramatically reducing training ti

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'ROUGE': {'rouge1': Score(precision=0.17096774193548386, recall=0.7162162162162162, fmeasure=0.2760416666666667), 'rouge2': Score(precision=0.03336921420882669, recall=0.14027149321266968, fmeasure=0.05391304347826087), 'rougeL': Score(precision=0.06451612903225806, recall=0.2702702702702703, fmeasure=0.10416666666666666)}, 'BERTScore': {'Precision': 0.7817198038101196, 'Recall': 0.8515523672103882, 'F1': 0.8151431679725647}}
