In [23]:
import time, random
from urllib.parse import urlencode
import requests
import xml.etree.ElementTree as ET
from typing import List, Dict, Any  # <-- quan trọng

HEADERS = {"User-Agent": "AcademicCrawler/1.0 (+mailto:youremail@example.com)"}
API_ENDPOINT = "https://export.arxiv.org/api/query"

def get_html(url: str, timeout: int = 20) -> str:
    """Fetch ``url`` with the shared HEADERS and return the response body.

    This is a best-effort helper: a non-200 status code, or any exception
    raised while performing the request, yields an empty string instead of
    propagating to the caller.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            return ""
        return response.text
    except Exception:
        return ""

def _atom_text(node, path: str, ns: Dict[str, str]) -> str:
    """Return the stripped text of the first ``path`` match under ``node``; "" if the element or its text is missing."""
    found = node.find(path, ns)
    if found is None or found.text is None:
        return ""
    return found.text.strip()


def arxiv_api_search(
    query: str,
    max_results: int = 200,
    page_size: int = 100,
    sortBy: str = "submittedDate",
    sortOrder: str = "descending",
    delay_sec: float = 1.0,
) -> List[Dict[str, Any]]:
    """Query the arXiv Atom API page by page and return one dict per paper.

    Args:
        query: Value sent as the ``search_query`` parameter.
        max_results: Upper bound on the number of entries requested in total.
        page_size: Maximum number of entries requested per page.
        sortBy: Value sent as the ``sortBy`` parameter.
        sortOrder: Value sent as the ``sortOrder`` parameter.
        delay_sec: Seconds to sleep after each processed page.

    Returns:
        A list of dicts with the keys "arXiv ID", "Title", "Authors",
        "Subjects", "Subject_Tags", "Abstract", "Submitted", "Detail Link"
        and "PDF Link". Records are de-duplicated by "arXiv ID" (the last
        occurrence wins, at the position of the first occurrence).

    Paging stops early (returning what was collected so far) when a page
    comes back empty, cannot be parsed as XML, or contains no entries.
    Entries without an id are skipped, because the id is the de-duplication key.
    """
    ns = {"atom": "http://www.w3.org/2005/Atom"}  # loop-invariant, built once
    entries: List[Dict[str, Any]] = []
    retrieved = 0
    while retrieved < max_results:
        size = min(page_size, max_results - retrieved)
        if size <= 0:
            # A non-positive page_size can never make progress; avoid a pointless request.
            break
        params = {
            "search_query": query,
            "start": retrieved,
            "max_results": size,
            "sortBy": sortBy,
            "sortOrder": sortOrder,
        }
        xml_text = get_html(f"{API_ENDPOINT}?{urlencode(params)}")
        if not xml_text:
            break

        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError:
            # Same best-effort policy as an empty response: keep what we have.
            break

        feed_entries = root.findall("atom:entry", ns)
        if not feed_entries:
            break

        for entry in feed_entries:
            # The id element holds a URL; the arXiv id is its last path segment.
            arxiv_id = _atom_text(entry, "atom:id", ns).rsplit("/", 1)[-1]
            if not arxiv_id:
                continue

            # Drop missing/empty names so ", ".join never sees None.
            author_names = [
                name
                for name in (_atom_text(a, "atom:name", ns) for a in entry.findall("atom:author", ns))
                if name
            ]

            pdf_link = ""
            detail = ""
            for link in entry.findall("atom:link", ns):
                if link.attrib.get("title") == "pdf" or link.attrib.get("type") == "application/pdf":
                    pdf_link = link.attrib.get("href", "")
                if link.attrib.get("rel") == "alternate":
                    detail = link.attrib.get("href", "")

            # Categories without a "term" attribute are ignored.
            cats = [
                c.attrib["term"]
                for c in entry.findall("atom:category", ns)
                if c.attrib.get("term")
            ]
            subjects = ", ".join(cats)
            updated = _atom_text(entry, "atom:updated", ns)
            published = _atom_text(entry, "atom:published", ns)

            entries.append({
                "arXiv ID": arxiv_id,
                "Title": _atom_text(entry, "atom:title", ns),
                "Authors": ", ".join(author_names),
                "Subjects": subjects,
                "Subject_Tags": subjects,
                "Abstract": _atom_text(entry, "atom:summary", ns),
                "Submitted": f"Published: {published} | Updated: {updated}",
                "Detail Link": detail,
                "PDF Link": pdf_link,
            })
        # Advance by what the server actually returned, then pause before the next request.
        retrieved += len(feed_entries)
        time.sleep(delay_sec)
    return list({e["arXiv ID"]: e for e in entries}.values())


In [30]:
# Search configuration and the API call; the result list is kept in `api_results`.
# Enter the query here
query = 'ti:"vision transformer" AND cat:cs.CV'
max_results = 200
page_size   = 100
delay_sec   = 1.2   # should be >= 1.0 to respect the arXiv rate limit

api_results = arxiv_api_search(
    query=query,
    max_results=max_results,
    page_size=page_size,
    delay_sec=delay_sec,
)

print(f"API collected: {len(api_results)}")

API collected: 200


In [38]:
import json, pandas as pd

def save_results(items, csv_path="arxiv_api.csv", jsonl_path="arxiv_api.jsonl"):
    """Write ``items`` to a CSV file and to a JSON Lines file.

    Args:
        items: Sequence of dict records; when empty, nothing is written.
        csv_path: Destination of the CSV export.
        jsonl_path: Destination of the JSON Lines export.

    Prints a short status message in both cases and returns None.
    """
    if not items:
        print("No items to save.")
        return

    # CSV: built from a DataFrame, no index column, UTF-8 with BOM ("utf-8-sig").
    pd.DataFrame(items).to_csv(csv_path, index=False, encoding="utf-8-sig")

    # JSONL: one JSON object per line, non-ASCII characters written verbatim.
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in items)
    with open(jsonl_path, "w", encoding="utf-8") as out:
        out.writelines(serialized)

    print(f"Saved {len(items)} records -> {csv_path} & {jsonl_path}")

# Preview the first 5 rows
pd.DataFrame(api_results).head(5)

Unnamed: 0,arXiv ID,Title,Authors,Subjects,Subject_Tags,Abstract,Submitted,Detail Link,PDF Link
0,2509.23859v1,FairViT-GAN: A Hybrid Vision Transformer with ...,Djamel Eddine Boukhari,cs.CV,cs.CV,Facial Beauty Prediction (FBP) has made signif...,Published: 2025-09-28T12:55:31Z | Updated: 202...,http://arxiv.org/abs/2509.23859v1,http://arxiv.org/pdf/2509.23859v1
1,2509.23751v1,PVTAdpNet: Polyp Segmentation using Pyramid vi...,"Arshia Yousefi Nezhad, Helia Aghaei, Hedieh Sa...","cs.CV, cs.AI","cs.CV, cs.AI",Colorectal cancer ranks among the most common ...,Published: 2025-09-28T08:55:50Z | Updated: 202...,http://arxiv.org/abs/2509.23751v1,http://arxiv.org/pdf/2509.23751v1
2,2509.23235v1,Patch Rebirth: Toward Fast and Transferable Mo...,"Seongsoo Heo, Dong-Wan Choi","cs.CV, cs.AI","cs.CV, cs.AI",Model inversion is a widely adopted technique ...,Published: 2025-09-27T10:35:44Z | Updated: 202...,http://arxiv.org/abs/2509.23235v1,http://arxiv.org/pdf/2509.23235v1
3,2509.21084v1,Vision Transformers: the threat of realistic a...,"Kasper Cools, Clara Maathuis, Alexander M. van...","cs.CV, cs.AI","cs.CV, cs.AI",The increasing reliance on machine learning sy...,Published: 2025-09-25T12:36:25Z | Updated: 202...,http://arxiv.org/abs/2509.21084v1,http://arxiv.org/pdf/2509.21084v1
4,2509.20986v2,SiNGER: A Clearer Voice Distills Vision Transf...,"Geunhyeok Yu, Sunjae Jeong, Yoonyoung Choi, Ja...","cs.CV, cs.AI","cs.CV, cs.AI",Vision Transformers are widely adopted as the ...,Published: 2025-09-25T10:29:47Z | Updated: 202...,http://arxiv.org/abs/2509.20986v2,http://arxiv.org/pdf/2509.20986v2


In [32]:
# Save the results to CSV and JSONL files
save_results(api_results, "arxiv_api.csv", "arxiv_api.jsonl")

Saved 200 records -> arxiv_api.csv & arxiv_api.jsonl


In [33]:
import json
from collections import OrderedDict
from datetime import datetime

def to_display_date(iso_str: str) -> str:
    """Format an ISO-like timestamp as a human-readable date.

    '2023-12-01T12:34:56Z' -> '1 December, 2023'

    Only the first 19 characters (``YYYY-MM-DDTHH:MM:SS``) are parsed, so a
    trailing ``Z`` or offset is ignored. If the input cannot be parsed
    (wrong format, or not a string), it is returned unchanged.
    """
    try:
        dt = datetime.strptime(iso_str[:19], "%Y-%m-%dT%H:%M:%S")
    except (TypeError, ValueError):
        return iso_str
    # Build the day and year from attributes instead of "%-d": that directive is
    # a platform extension and is rejected by strftime on Windows, which made
    # the old code silently fall back to the raw ISO string there.
    return f"{dt.day} {dt.strftime('%B')}, {dt.year}"

def remap_item(item: dict) -> OrderedDict:
    """
    Re-key a record into the display layout, with a fixed key order.

    When the 'Submitted' value contains a 'Published:' segment, the text
    between that label and the next '|' is passed to to_display_date and the
    result becomes 'Submitted Date'; otherwise the 'Submitted' value is used
    unchanged. Missing keys default to an empty string.
    """
    shown_date = item.get("Submitted", "")
    if "Published:" in shown_date:
        # Expected layout: 'Published: ISO | Updated: ISO' -> keep the first ISO part.
        try:
            after_label = shown_date.split("Published:")[1]
            shown_date = to_display_date(after_label.split("|")[0].strip())
        except Exception:
            pass

    passthrough_keys = (
        "arXiv ID",
        "PDF Link",
        "Subject_Tags",
        "Subjects",
        "Title",
        "Authors",
        "Abstract",
    )
    remapped = OrderedDict((key, item.get(key, "")) for key in passthrough_keys)
    remapped["Submitted Date"] = shown_date
    return remapped

def save_pretty_json(items, path="arxiv_pretty.json"):
    """Re-key every record with remap_item and dump the list as indented JSON.

    The file is written as UTF-8 with non-ASCII characters kept verbatim and
    an indent of 2; a confirmation line is printed afterwards.
    """
    # Rearrange / rename the keys of each record before writing.
    reordered = []
    for record in items:
        reordered.append(remap_item(record))

    with open(path, "w", encoding="utf-8") as out:
        json.dump(reordered, out, ensure_ascii=False, indent=2)

    print(f"Saved pretty JSON -> {path}")

# CALL THE FUNCTION (api_results is the list you already have)
save_pretty_json(api_results, "arxiv_pretty.json")


Saved pretty JSON -> arxiv_pretty.json


# **TEXT PREPROCESSING**

In [34]:
import nltk, pkgutil

print("NLTK version:", nltk.__version__)

# Download the required resources. nltk.download() reports failure through its
# boolean return value rather than by raising, so the result is checked explicitly.
failed_pkgs = []
for pkg in ["punkt", "stopwords", "wordnet", "omw-1.4"]:
    if not nltk.download(pkg, quiet=True):
        failed_pkgs.append(pkg)

# 'punkt_tab' is optional: it only exists for newer NLTK releases, so a failed
# download (False return or an exception) is reported but not treated as fatal.
try:
    punkt_tab_ok = bool(nltk.download("punkt_tab", quiet=True))
except Exception:
    punkt_tab_ok = False
if not punkt_tab_ok:
    print("punkt_tab not available on this NLTK; continuing...")

# Only claim readiness when every required resource was fetched.
if failed_pkgs:
    print("NLTK data incomplete; failed to download:", ", ".join(failed_pkgs))
else:
    print("NLTK data ready.")


NLTK version: 3.9.1
NLTK data ready.


In [35]:
import re, string
from typing import List, Dict, Any
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# 1) Tokenize
def tokenize_text(text: str) -> List[str]:
    """Lower-case ``text``, blank out links, and split it with word_tokenize.

    A falsy input (None, "") is treated as the empty string. URLs starting
    with http:// or https:// and ``doi:`` references are replaced by a space
    before tokenizing.
    """
    lowered = ("" if not text else text).lower()
    # Optional cleanup: strip URL / DOI / link fragments for cleaner tokens.
    without_links = re.sub(r'https?://\S+|doi:\S+', ' ', lowered)
    return word_tokenize(without_links)

# 2) Clean: drop punctuation, stopwords and (optionally) numbers
def clean_tokens(tokens: List[str], stop_words=None) -> List[str]:
    """Filter out tokens that carry no content.

    Args:
        tokens: Tokens to filter.
        stop_words: Optional iterable of stopwords to remove. When omitted,
            the NLTK English stopword list is used (the previous behaviour).

    Returns:
        The tokens, in their original order, excluding:
        - tokens made up only of punctuation characters (",", "``", "''",
          "...", "--"); empty tokens fall into this group as well,
        - stopwords,
        - purely numeric tokens (``str.isnumeric``), so "42" is dropped while
          "3.14" is kept.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    sw = set(stop_words)
    punct = set(string.punctuation)

    cleaned = []
    for t in tokens:
        # Check every character, not just membership of the whole token:
        # multi-character punctuation tokens used to slip through.
        if all(ch in punct for ch in t):
            continue
        if t in sw:
            continue
        if t.isnumeric():  # drop tokens that are pure numbers
            continue
        cleaned.append(t)
    return cleaned

# 3) Normalize: lemmatize or stem
def normalize_tokens(tokens: List[str], option: str):
    """Map every token through a lemmatizer or a stemmer.

    ``option`` selects the transform: 'l' applies WordNetLemmatizer.lemmatize,
    's' applies PorterStemmer.stem. Any other value raises ValueError.
    """
    if option not in ('l', 's'):
        raise ValueError("Invalid option. Use 'l' for lemmatization or 's' for stemming.")

    if option == 'l':
        transform = WordNetLemmatizer().lemmatize
    else:
        transform = PorterStemmer().stem
    return [transform(token) for token in tokens]

# 4) Full pipeline for a single string
def preprocess_text(text: str, normalize: str = 'l') -> str:
    """Run tokenize_text -> clean_tokens -> normalize_tokens and re-join with spaces.

    ``normalize`` is forwarded to normalize_tokens as its ``option`` argument.
    """
    kept = clean_tokens(tokenize_text(text))
    return ' '.join(normalize_tokens(kept, option=normalize))

# 5) Merge a paper's metadata into one text, then preprocess it
def preprocess_paper(paper: Dict[str, Any], normalize: str = 'l') -> str:
    """Concatenate a paper's text fields and run them through preprocess_text.

    The fields are joined with single spaces in the order Title, Authors,
    Abstract, Subject_Tags, Subjects, then the submission date.
    """
    # Accept either key style: 'Submitted Date' (pretty JSON) or 'Submitted' (raw API record).
    submitted = paper.get('Submitted Date') or paper.get('Submitted') or ""

    text_fields = ('Title', 'Authors', 'Abstract', 'Subject_Tags', 'Subjects')
    parts = [paper.get(field, '') for field in text_fields]
    parts.append(submitted)

    return preprocess_text(" ".join(parts), normalize=normalize)

In [15]:
# Pick the first paper in api_results whose Title contains the keyword (case-insensitive)
# NOTE(review): this cell's recorded execution count is lower than its neighbours and the
# saved output shows an ID that does not appear in the results preview — the output looks
# stale; re-run after a kernel restart to confirm.
keyword = "transformer"  # change as you like
paper = next((p for p in api_results if keyword.lower() in p.get("Title","").lower()), None)
if paper:
    print("arXiv ID:", paper.get("arXiv ID"))
    print("Title   :", paper.get("Title"))
else:
    print("Không tìm thấy bài nào chứa từ khóa:", keyword)


arXiv ID: 2509.24080v1
Title   : Ensembling Multilingual Transformers for Robust Sentiment Analysis of   Tweets


In [36]:
import json, random, os

# ---- 1) Load the JSON file ----
json_path = "arxiv_pretty.json"  # change if you saved it under a different name
assert os.path.exists(json_path), f"Không tìm thấy file: {json_path}"

with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# The file must hold a non-empty list of paper records.
assert isinstance(data, list) and len(data) > 0, "File JSON phải là một list các paper (ít nhất 1 phần tử)."

# ---- 2) Pick one paper as an example ----
# Option A: select by index
idx = 0  # change this number to pick another paper, e.g. 5, 10, ...
paper = data[idx]

# (Optional) Option B: select by a keyword in the Title
# keyword = "transformer"
# paper = next((p for p in data if keyword.lower() in p.get("Title","").lower()), data[0])

print("arXiv ID:", paper.get("arXiv ID"))
print("Title   :", paper.get("Title"))

arXiv ID: 2509.23859v1
Title   : FairViT-GAN: A Hybrid Vision Transformer with Adversarial Debiasing for   Fair and Explainable Facial Beauty Prediction


In [37]:

# ---- 3) Show each processing step, for the TITLE only ----
title = paper.get("Title", "")

# Step 1: tokenize
tokens = tokenize_text(title)
print("\n--- Tokens ---")
print(tokens)
print("Total tokens:", len(tokens))

# Step 2: clean
cleaned = clean_tokens(tokens)
print("\n--- Cleaned Tokens ---")
print(cleaned)
print("Total cleaned:", len(cleaned))

# Step 3: normalize (both variants, for comparison)
lemmatized = normalize_tokens(cleaned, option='l')  # lemmatization
stemmed    = normalize_tokens(cleaned, option='s')  # stemming
print("\n--- Lemmatized ---")
print(lemmatized)
print("\n--- Stemmed ---")
print(stemmed)

# Step 4: full preprocessing of the TITLE
final_title = preprocess_text(title, normalize='l')  # switch to 's' for stemming
print("\n=== Final Preprocessed Title (normalize='l') ===")
print(final_title)


--- Tokens ---
['fairvit-gan', ':', 'a', 'hybrid', 'vision', 'transformer', 'with', 'adversarial', 'debiasing', 'for', 'fair', 'and', 'explainable', 'facial', 'beauty', 'prediction']
Total tokens: 16

--- Cleaned Tokens ---
['fairvit-gan', 'hybrid', 'vision', 'transformer', 'adversarial', 'debiasing', 'fair', 'explainable', 'facial', 'beauty', 'prediction']
Total cleaned: 11

--- Lemmatized ---
['fairvit-gan', 'hybrid', 'vision', 'transformer', 'adversarial', 'debiasing', 'fair', 'explainable', 'facial', 'beauty', 'prediction']

--- Stemmed ---
['fairvit-gan', 'hybrid', 'vision', 'transform', 'adversari', 'debias', 'fair', 'explain', 'facial', 'beauti', 'predict']

=== Final Preprocessed Title (normalize='l') ===
fairvit-gan hybrid vision transformer adversarial debiasing fair explainable facial beauty prediction
