In [None]:



!pip install requests feedparser pymupdf sentence-transformers qdrant-client

import requests
import fitz  # PyMuPDF
import re
import feedparser
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from uuid import uuid4

!pip install requests feedparser pymupdf sentence-transformers qdrant-client

import requests
import fitz  # PyMuPDF
import re
import feedparser
import json
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from uuid import uuid4
from google.colab import drive
import os

# ==============================
# 0. Mount Google Drive
# ==============================
# Check if Google Drive is already mounted
# NOTE(review): the existence of /content/drive is used as a proxy for
# "mounted"; drive.mount() is only called when that path is absent.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Create a folder in Drive if it doesn't exist
# folder_path is a module-level global; exist_ok=True makes re-running this
# cell harmless when the folder is already there.
folder_path = "/content/drive/MyDrive/Arxiv"
os.makedirs(folder_path, exist_ok=True)

# ==============================
# 1. Get Arxiv Metadata & PDF
# ==============================
def get_arxiv_metadata(arxiv_id):
    """Fetch the metadata of a single arXiv paper through the arXiv query API.

    Args:
        arxiv_id: arXiv identifier, e.g. "2507.02554".

    Returns:
        A dict with the keys "title", "authors" (list of names), "published",
        "summary", "categories" (list of category terms) and "arxiv_id".

    Raises:
        ValueError: if the parsed feed contains no entry for ``arxiv_id``
            (unknown id, or the request itself failed).
    """
    url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    feed = feedparser.parse(url)
    # An unknown id or a failed request yields a feed without entries; fail
    # with a message that names the id instead of a bare IndexError.
    if not feed.entries:
        raise ValueError(f"No arXiv entry found for id {arxiv_id!r}")
    entry = feed.entries[0]
    return {
        "title": entry.title,
        # .get() keeps a paper without author/tag elements from crashing the run.
        "authors": [author.name for author in entry.get("authors", [])],
        "published": entry.published,
        "summary": entry.summary,
        "categories": [tag["term"] for tag in entry.get("tags", [])],
        "arxiv_id": arxiv_id,
    }

def download_arxiv_pdf(arxiv_id, filename="paper.pdf"):
    """Download the PDF of an arXiv paper and write it to ``filename``.

    Args:
        arxiv_id: arXiv identifier, e.g. "2507.02554".
        filename: destination path for the PDF bytes.

    Raises:
        requests.HTTPError: if arXiv answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond in time.
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # A timeout keeps one stalled request from hanging the whole notebook.
    response = requests.get(pdf_url, timeout=60)
    # Without this check an HTML error page would be saved as if it were a PDF
    # and only fail later, inside the PDF parser.
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)
    print(f"Downloaded PDF: {filename}")

# ==============================
# 2. Extract Text from PDF
# ==============================
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        One string holding the text of all pages in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# ==============================
# 3. Split into Sections
# ==============================
# A heading line may carry a section number ("1 ", "2.1 ", "3. ") or a dotted
# roman numeral ("II. "), followed by one of the known section keywords as a
# whole word. Matching is done on the lower-cased line.
_SECTION_HEADING_RE = re.compile(
    r"^(?:(?:\d+(?:\.\d+)*[.)]?|[ivx]+\.)\s+)?"
    r"(?:abstract|introduction|methodology|methods|results|conclusions?)\b"
)
# Body lines that merely start with a keyword ("results (Chan et al., ...")
# are full sentences; real headings are short.
_MAX_HEADING_WORDS = 6


def _looks_like_heading(line):
    """Return True when a stripped line is short and starts like a section heading."""
    if len(line.split()) > _MAX_HEADING_WORDS:
        return False
    return _SECTION_HEADING_RE.match(line.lower()) is not None


def split_sections(text):
    """Split extracted paper text into sections keyed by their heading line.

    A line is treated as a heading when it is at most ``_MAX_HEADING_WORDS``
    words long and starts, after an optional section number, with one of:
    abstract, introduction, methodology, methods, results, conclusion(s).
    This is a heuristic: a short wrapped body line that begins with one of
    these words is still taken for a heading.

    Text before the first heading is discarded. If the same heading line
    occurs more than once, the later text is appended to the existing section
    instead of replacing it.

    Args:
        text: full text of the paper, lines separated by "\n".

    Returns:
        dict mapping each heading line (stripped, original case) to the
        section body, with the body lines joined by single spaces.
    """
    sections = {}
    current_section = None
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if _looks_like_heading(line):
            current_section = line
            # setdefault: a repeated heading must not wipe what was collected.
            sections.setdefault(current_section, [])
        elif current_section:
            sections[current_section].append(line)
    return {name: " ".join(lines) for name, lines in sections.items()}

# ==============================
# 4. Chunk Sections
# ==============================
def chunk_text(text, max_words=1000):
    """Cut ``text`` into consecutive pieces of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields; each piece is its words joined
    by single spaces. Empty or whitespace-only text gives an empty list.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_words)
    return [" ".join(tokens[start:start + max_words]) for start in starts]

# ==============================
# 5. Save JSON to Google Drive
# ==============================
def save_chunks_to_json(sections, metadata, filename):
    """Chunk every section and write the chunks as one JSON file on Drive.

    Each record in the file has the keys "section", "text" and "metadata";
    the same ``metadata`` object is attached to every chunk. The file is
    written under /content/drive/MyDrive/Arxiv/ with the given ``filename``.
    """
    records = [
        {"section": heading, "text": piece, "metadata": metadata}
        for heading, body in sections.items()
        for piece in chunk_text(body)
    ]

    json_path = f"/content/drive/MyDrive/Arxiv/{filename}"
    with open(json_path, "w", encoding="utf-8") as out:
        json.dump(records, out, ensure_ascii=False, indent=2)
    print(f"Saved chunks to {json_path}")

# ==============================
# 6. (Optional) Save to Qdrant
# ==============================
def save_to_qdrant(collection_name, sections, metadata):
    """Embed every chunk and store it in a fresh in-memory Qdrant collection.

    Args:
        collection_name: name of the collection to create.
        sections: dict mapping section name to section text.
        metadata: paper metadata attached to every stored chunk.

    Returns:
        The in-memory ``QdrantClient`` holding the collection. The client is
        created inside this function, so without returning it the stored
        points could not be queried afterwards.
    """
    # Imported here so the function does not depend on which import cell ran.
    from qdrant_client.models import Distance, VectorParams

    client = QdrantClient(":memory:")

    # The vector settings must be passed as a VectorParams object; the bare
    # vector_size= / distance= keywords are not accepted by the client.
    # A brand-new in-memory client has no collections, so a plain create is enough.
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),  # all-MiniLM-L6-v2 output size
    )

    model = SentenceTransformer("all-MiniLM-L6-v2")
    points = [
        PointStruct(
            id=str(uuid4()),
            vector=model.encode(chunk).tolist(),
            payload={
                "section": sec_name,
                "text": chunk,
                "metadata": metadata,
            },
        )
        for sec_name, sec_text in sections.items()
        for chunk in chunk_text(sec_text)
    ]

    # Only upsert when there is something to store.
    if points:
        client.upsert(collection_name=collection_name, points=points)
    print(f"Saved {len(points)} chunks to Qdrant.")
    return client

# ==============================
# Run the Pipeline
# ==============================
# Single-paper run: fetch metadata, download the PDF, extract its text and
# split it into sections. All names assigned here are module-level globals.
arxiv_id = "2507.02554"
metadata = get_arxiv_metadata(arxiv_id)
download_arxiv_pdf(arxiv_id)  # no filename passed, so the function's default is used
text = extract_text_from_pdf("paper.pdf")  # must be the file written by the download above
sections = split_sections(text)

# Save JSON to Google Drive
save_chunks_to_json(sections, metadata, f"{arxiv_id}_chunks.json")

# The code below repeats the same pipeline for multiple PDFs (batch processing).


!pip install requests feedparser pymupdf sentence-transformers

import requests
import fitz
import re
import feedparser
import json
import os
from google.colab import drive

# =======================
# MOUNT GOOGLE DRIVE
# =======================
# Check if Google Drive is already mounted
# NOTE(review): the existence of /content/drive is used as a proxy for
# "mounted"; drive.mount() is only called when that path is absent.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Create a folder in Drive
# folder_path is read by save_chunks_to_json below; exist_ok=True makes
# re-running this cell harmless when the folder is already there.
folder_path = "/content/drive/MyDrive/Arxiv"
os.makedirs(folder_path, exist_ok=True)

# =======================
# FUNCTIONS
# =======================
def get_arxiv_metadata(arxiv_id):
    """Fetch the metadata of a single arXiv paper through the arXiv query API.

    Args:
        arxiv_id: arXiv identifier, e.g. "2508.09116".

    Returns:
        A dict with the keys "title", "authors" (list of names), "published",
        "summary", "categories" (list of category terms) and "arxiv_id".

    Raises:
        ValueError: if the parsed feed contains no entry for ``arxiv_id``
            (unknown id, or the request itself failed).
    """
    url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}"
    feed = feedparser.parse(url)
    # An unknown id or a failed request yields a feed without entries; fail
    # with a message that names the id instead of a bare IndexError.
    if not feed.entries:
        raise ValueError(f"No arXiv entry found for id {arxiv_id!r}")
    entry = feed.entries[0]
    return {
        "title": entry.title,
        # .get() keeps a paper without author/tag elements from crashing the run.
        "authors": [author.name for author in entry.get("authors", [])],
        "published": entry.published,
        "summary": entry.summary,
        "categories": [tag["term"] for tag in entry.get("tags", [])],
        "arxiv_id": arxiv_id,
    }

def download_arxiv_pdf(arxiv_id, filename):
    """Download the PDF of an arXiv paper and write it to ``filename``.

    Args:
        arxiv_id: arXiv identifier, e.g. "2508.09116".
        filename: destination path for the PDF bytes.

    Raises:
        requests.HTTPError: if arXiv answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond in time.
    """
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # A timeout keeps one stalled request from hanging the whole batch.
    response = requests.get(pdf_url, timeout=60)
    # Without this check an HTML error page would be saved as if it were a PDF
    # and only fail later, inside the PDF parser.
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        One string holding the text of all pages in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; this matters when many PDFs are processed in a loop.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# A heading line may carry a section number ("1 ", "2.1 ", "3. ") or a dotted
# roman numeral ("II. "), followed by one of the known section keywords as a
# whole word. Matching is done on the lower-cased line.
_SECTION_HEADING_RE = re.compile(
    r"^(?:(?:\d+(?:\.\d+)*[.)]?|[ivx]+\.)\s+)?"
    r"(?:abstract|introduction|methodology|methods|results|conclusions?)\b"
)
# Body lines that merely start with a keyword ("results (Chan et al., ...")
# are full sentences; real headings are short.
_MAX_HEADING_WORDS = 6


def _looks_like_heading(line):
    """Return True when a stripped line is short and starts like a section heading."""
    if len(line.split()) > _MAX_HEADING_WORDS:
        return False
    return _SECTION_HEADING_RE.match(line.lower()) is not None


def split_sections(text):
    """Split extracted paper text into sections keyed by their heading line.

    A line is treated as a heading when it is at most ``_MAX_HEADING_WORDS``
    words long and starts, after an optional section number, with one of:
    abstract, introduction, methodology, methods, results, conclusion(s).
    This is a heuristic: a short wrapped body line that begins with one of
    these words is still taken for a heading.

    Text before the first heading is discarded. If the same heading line
    occurs more than once, the later text is appended to the existing section
    instead of replacing it.

    Args:
        text: full text of the paper, lines separated by "\n".

    Returns:
        dict mapping each heading line (stripped, original case) to the
        section body, with the body lines joined by single spaces.
    """
    sections = {}
    current_section = None
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if _looks_like_heading(line):
            current_section = line
            # setdefault: a repeated heading must not wipe what was collected.
            sections.setdefault(current_section, [])
        elif current_section:
            sections[current_section].append(line)
    return {name: " ".join(lines) for name, lines in sections.items()}

def chunk_text(text, max_words=1000):
    """Cut ``text`` into consecutive pieces of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields; each piece is its words joined
    by single spaces. Empty or whitespace-only text gives an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces

def save_chunks_to_json(sections, metadata, filename):
    """Chunk every section and write the chunks as one JSON file.

    Each record in the file has the keys "section", "text" and "metadata";
    the same ``metadata`` object is attached to every chunk. The file is
    written to ``filename`` inside the module-level ``folder_path``.
    """
    records = []
    for heading, body in sections.items():
        records.extend(
            {"section": heading, "text": piece, "metadata": metadata}
            for piece in chunk_text(body)
        )
    json_path = f"{folder_path}/{filename}"
    with open(json_path, "w", encoding="utf-8") as out:
        json.dump(records, out, ensure_ascii=False, indent=2)
    print(f"✅ Saved: {json_path}")

# =======================
# MAIN LOOP — Multiple IDs
# =======================
arxiv_ids = [
    "2508.09116", "2508.09100", "2508.09099", "2508.09097", "2508.09093",
    "2508.09069", "2508.09059", "2508.09056", "2508.09005", "2508.08985",
    "2508.08966", "2508.08955", "2508.08954", "2508.08947", "2508.08935",
    "2508.08920", "2508.08919", "2508.08883", "2508.08804", "2508.08762",
    "2508.08724", "2508.09129", "2508.09123", "2508.09105", "2508.00081",
    "2508.00106", "2508.00116", "2507.12314", "2507.12318", "2507.12329",
    "2507.12359", "2507.12366", "2507.12367", "2507.12379", "2507.12412",
    "2507.12414", "2507.12416", "2507.12419", "2507.12425", "2507.12427",
    "2507.12428", "2507.12442", "2507.12443", "2507.12451", "2507.12461",
    "2507.12475", "2507.12480", "2507.12482", "2507.12485", "2507.12486",
    "2507.12490", "2507.12492", "2507.12496", "2507.12504", "2507.12507",
    "2507.12508", "2507.12553", "2507.12555", "2507.12568", "2507.12574",
    "2507.12602", "2507.12612", "2507.12619", "2507.12630", "2507.12642",
    "2507.12644", "2507.12659", "2507.12665", "2507.12669", "2507.12674",
    "2507.12675", "2507.12774", "2507.12784", "2507.12795", "2507.12803",
    "2507.12805", "2507.12816", "2507.12828", "2507.12845", "2507.12846",
    "2507.12856", "2507.12871", "2507.12898", "2507.12904", "2507.12916",
    "2507.12930", "2507.12933", "2507.12935", "2507.12951", "2507.12961",
    "2507.12964", "2507.12979", "2507.12981", "2507.12990", "2507.13001",
    "2507.13019", "2507.13090", "2507.13097", "2507.13145", "2507.13152"
]


import time

# arXiv asks API clients that make repeated calls to pause about three
# seconds between requests; firing 100 metadata + PDF requests back-to-back
# risks throttled or refused responses.
ARXIV_REQUEST_DELAY_SECONDS = 3

for index, arxiv_id in enumerate(arxiv_ids):
    if index:
        time.sleep(ARXIV_REQUEST_DELAY_SECONDS)
    try:
        print(f"📄 Processing {arxiv_id}...")
        metadata = get_arxiv_metadata(arxiv_id)
        pdf_path = f"/content/{arxiv_id}.pdf"
        download_arxiv_pdf(arxiv_id, pdf_path)
        text = extract_text_from_pdf(pdf_path)
        sections = split_sections(text)
        save_chunks_to_json(sections, metadata, f"{arxiv_id}_chunks.json")
    except Exception as e:
        # Deliberate best-effort: one bad paper must not abort the batch,
        # so the failure is reported and the loop moves on.
        print(f"❌ Failed for {arxiv_id}: {e}")

"""# *Now we will send the chunks and their metadata to the Qdrant database*"""

!pip install qdrant-client sentence-transformers

import json
import os
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from uuid import uuid4
from google.colab import drive

# ========================
# 1. Mount Google Drive
# ========================
# Check if Google Drive is already mounted
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

folder_path = "/content/drive/MyDrive/Arxiv"  # Folder with your JSON files

# ========================
# 2. Connect to Qdrant Cloud
# ========================
# SECURITY: an API key is committed to this file in plain text. Set the
# QDRANT_URL / QDRANT_API_KEY environment variables (they take precedence
# here), rotate the exposed key, and then delete the literal fallbacks below.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://dfeda460-4577-408a-a202-7ed66f66d5d0.us-west-1-0.aws.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.environ.get(
    "QDRANT_API_KEY",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.ry2lAtmCfc8uLDuRe5Xh3O5qjIIUh0HNZIjcxAuZMZE",
)

client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# ========================
# 3. Create collection
# ========================
COLLECTION_NAME = "ArxivPapers"

# recreate_collection() is deprecated in qdrant-client (it emits a warning when
# called); this is the same drop-and-create, spelled out explicitly.
# WARNING: any existing "ArxivPapers" collection is deleted on every run.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),  # all-MiniLM-L6-v2 output size
)

# ========================
# 4. Load model
# ========================
model = SentenceTransformer("all-MiniLM-L6-v2")

# ========================
# 5. Upload all JSON chunks
# ========================
for filename in os.listdir(folder_path):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(folder_path, filename)

    with open(file_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)

    if not chunks:  # Only upsert if there are points
        print(f"Skipped uploading from {filename}: No chunks found.")
        continue

    # Encode all chunks of the file in one call instead of one call per chunk.
    vectors = model.encode([chunk["text"] for chunk in chunks])
    points = [
        PointStruct(
            id=str(uuid4()),
            vector=vector.tolist(),
            payload=chunk,  # stores section, text, metadata
        )
        for chunk, vector in zip(chunks, vectors)
    ]

    client.upsert(collection_name=COLLECTION_NAME, points=points)
    print(f"✅ Uploaded {len(points)} chunks from {filename}")


print("🚀 All papers uploaded to Qdrant!")

"""## Now is the retrieval part"""

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance

import os

# ========================
# 1. Connect to Qdrant Cloud
# ========================
# SECURITY: an API key is committed to this file in plain text. Set the
# QDRANT_URL / QDRANT_API_KEY environment variables (they take precedence
# here), rotate the exposed key, and then delete the literal fallbacks below.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://dfeda460-4577-408a-a202-7ed66f66d5d0.us-west-1-0.aws.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.environ.get(
    "QDRANT_API_KEY",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.ry2lAtmCfc8uLDuRe5Xh3O5qjIIUh0HNZIjcxAuZMZE",
)

client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# ========================
# 2. Define Collection Name
# ========================
COLLECTION_NAME = "ArxivPapers"

# ========================
# 3. Embed the query and search
# ========================
# Same model name as the one used when uploading the chunks.
model = SentenceTransformer("all-MiniLM-L6-v2")

query = " which are the best ai model?"
query_vector = model.encode(query).tolist()

# client.search() is deprecated in qdrant-client (it emits a warning when
# called); query_points() is its replacement and wraps the hits in a response
# object, so the list of scored points is read from `.points`.
response = client.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=3,
)
results = response.points

for r in results:
    print(r.payload["section"], ":", r.payload["text"][:200], "...")

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting qdrant-client
  Downloading qdrant_client-1.15.1-py3-none-any.whl.metadata (11 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-tran

  client.recreate_collection(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Uploaded 15 chunks from 2507.02554_chunks.json
✅ Uploaded 9 chunks from 2508.09116_chunks.json
✅ Uploaded 9 chunks from 2508.09100_chunks.json
✅ Uploaded 17 chunks from 2508.09099_chunks.json
✅ Uploaded 12 chunks from 2508.09097_chunks.json
✅ Uploaded 12 chunks from 2508.09093_chunks.json
✅ Uploaded 15 chunks from 2508.09069_chunks.json
✅ Uploaded 4 chunks from 2508.09059_chunks.json
✅ Uploaded 6 chunks from 2508.09056_chunks.json
✅ Uploaded 11 chunks from 2508.09005_chunks.json
✅ Uploaded 10 chunks from 2508.08985_chunks.json
✅ Uploaded 18 chunks from 2508.08966_chunks.json
✅ Uploaded 6 chunks from 2508.08955_chunks.json
✅ Uploaded 9 chunks from 2508.08954_chunks.json
✅ Uploaded 16 chunks from 2508.08947_chunks.json
✅ Uploaded 4 chunks from 2508.08935_chunks.json
✅ Uploaded 6 chunks from 2508.08920_chunks.json
✅ Uploaded 13 chunks from 2508.08919_chunks.json
✅ Uploaded 17 chunks from 2508.08883_chunks.json
✅ Uploaded 16 chunks from 2508.08804_chunks.json
Skipped uploading from 2508.

  results = client.search(


methods that typically decrease performance when masking, MUPAX not only : preserves but actually enhances model accuracy by capturing only the most im- portant patterns of the original data. Extensive benchmarking against the state of the XAI art demonstrates MUPAX’s abilit ...
Introduction : neural networks: a generator that creates fake data and a discriminator that tries to tell real from fake. Both compete in a game, trying to improve their performance over time. As the generator learn ...
results (Chan et al., 2025). Second, AIRA-dojo enables users to experiment with custom operators, search : MLE-Bench. Our results fall within their reported standard deviation. We note that their experiments use at most 6 seeds, and as discussed in Appendix H, the limited number of seeds may introduce vari ...
