In [1]:
import pandas as pd
import json

In [2]:
# Subject prefixes of interest, one entry per line for easier diffing.
# NOTE(review): later cells filter on the prefix 'stat', while this list
# contains "stats" — confirm which spelling is intended before using it as a filter.
LIST_SUBJECT = [
    "cs",
    "math",
    "stats",
    "physics",
    "eess",
]

In [7]:
# Read the merged export and discard exact duplicate rows in one expression;
# ignore_index=True renumbers the surviving rows from 0.
df = pd.read_csv("../dataset/arxiv_merged.csv").drop_duplicates(ignore_index=True)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,title,url,authors,tags,abstract,submitted_date,announced_date
0,Knowledge Graph Construction for Stock Markets...,https://arxiv.org/abs/2601.11528,"Cheonsol Lee,Youngsang Jeong,Jeongyeol Shin,Hu...","[{""tag"": ""cs.DB"", ""name"": ""Databases""}, {""tag""...","The stock market is inherently complex, with i...","16 November, 2025",January 2026
1,SetupKit: Efficient Multi-Corner Setup/Hold Ti...,https://arxiv.org/abs/2512.00044,"Junzhuo Zhou,Ziwen Wang,Haoxuan Xia,Yuxin Yan,...","[{""tag"": ""cs.AR"", ""name"": ""Hardware Architectu...",Accurate setup/hold time characterization is c...,"16 November, 2025",December 2025
2,A General Highly Accurate Online Planning Meth...,https://arxiv.org/abs/2511.21706,"Hui Wang,Fafa Zhang,Xiaoyu Zhang,Chaoxu Mu","[{""tag"": ""cs.CL"", ""name"": ""Computation and Lan...","In goal-oriented dialogue tasks, the main chal...","16 November, 2025",November 2025
3,Insight-A: Attribution-aware for Multimodal Mi...,https://arxiv.org/abs/2511.21705,"Junjie Wu,Yumeng Fu,Chen Gong,Guohong Fu","[{""tag"": ""cs.CL"", ""name"": ""Computation and Lan...",AI-generated content (AIGC) technology has eme...,"16 November, 2025",November 2025
4,On the Cross-lingual Transferability of Pre-tr...,https://arxiv.org/abs/2511.21704,"Jonatas Grosman,Cassio Almeida,Guilherme Schar...","[{""tag"": ""cs.CL"", ""name"": ""Computation and Lan...",Using representations provided by a large pre-...,"16 November, 2025",November 2025
...,...,...,...,...,...,...,...
2926682,Classification of Transuranium Elements in Ter...,https://arxiv.org/abs/2510.14289,Sergei K. Suslov,"[{""tag"": ""quant-ph"", ""name"": ""Quantum Physics""}]",We revisit the Bohr-Sommerfeld atomic model to...,"15 November, 2025",October 2025
2926683,Dominating Hadwiger's Conjecture for graphs GG...,https://arxiv.org/abs/2510.12564,"Michael Scully,Zi-Xia Song","[{""tag"": ""math.CO"", ""name"": ""Combinatorics""}]",Hadwiger's Conjecture from 1943 states that ev...,"15 November, 2025",October 2025
2926684,Fast-forwardable Lindbladians imply quantum ph...,https://arxiv.org/abs/2510.06759,"Zhong-Xia Shang,Naixu Guo,Patrick Rebentrost,A...","[{""tag"": ""quant-ph"", ""name"": ""Quantum Physics""}]",Quantum phase estimation (QPE) and Lindbladian...,"15 November, 2025",October 2025
2926685,Implicit-Knowledge Visual Question Answering w...,https://arxiv.org/abs/2510.06638,"Zhihao Wen,Wenkang Wei,Yuan Fang,Xingtong Yu,H...","[{""tag"": ""cs.CV"", ""name"": ""Computer Vision and...",Knowledge-based Visual Question Answering (KVQ...,"15 November, 2025",October 2025


# Cleaned Text

In [None]:
import os
import re

# Location of the cached, cleaned dataset. Defined once so the existence
# check, the write and the final read always refer to the same file.
CLEAN_DATASET_PATH = "../dataset/arxiv_dataset_clean.csv"

# The cleaning pipeline only runs when the cached CSV is missing; otherwise the
# cell goes straight to loading the cache at the bottom.
if not os.path.exists(CLEAN_DATASET_PATH):

    def clean_text(t):
        """Lowercase ``t`` and reduce it to space-separated alphanumeric words."""
        t = t.lower()
        t = re.sub(r"https?://\S+|www\.\S+", "", t)  # remove URLs
        t = re.sub(r"\S+@\S+", "", t)                # remove e-mail-like tokens
        t = re.sub(r"[^a-z0-9]", " ", t)             # everything except a-z / 0-9 becomes a space
        t = re.sub(r"\b\d+\b", "", t)                # drop words made only of digits
        t = re.sub(r"\s+", " ", t)                   # collapse whitespace runs
        return t.strip()

    def parse_tag_codes(tag_str):
        """Return the ``"tag"`` value of every entry in a JSON-encoded tag list.

        Anything that cannot be decoded, or that does not decode to a list,
        yields an empty list (best-effort parsing, never raises).
        """
        try:
            tags = json.loads(tag_str)
            if isinstance(tags, list):
                return [t["tag"] for t in tags if "tag" in t]
        except Exception:
            return []
        return []

    def parse_tag_names(x):
        """Return the ``"name"`` values of a serialized tag list joined by ``", "``.

        Strict JSON is tried first so that valid JSON containing an apostrophe
        is not corrupted; only when that fails are single quotes swapped for
        double quotes as a fallback. Any remaining failure yields ``""``.
        """
        try:
            try:
                tags = json.loads(x)
            except ValueError:
                tags = json.loads(x.replace("'", '"'))
            return ", ".join([t["name"] for t in tags])
        except Exception:
            return ""

    # Title and abstract are merged into one cleaned text field.
    df['text'] = df['title'].fillna("") + ". " + df['abstract'].fillna("")
    df = df.drop(columns=['abstract'])
    df['text'] = df['text'].apply(clean_text)

    # One row per (paper, tag); `prefix` is the part of the tag before the first dot.
    df["tag_list"] = df["tags"].apply(parse_tag_codes)
    df = df.explode("tag_list").reset_index(drop=True)
    df["prefix"] = df["tag_list"].apply(lambda x: x.split(".")[0] if isinstance(x, str) else None)

    # Human-readable tag names, taken from the original (un-exploded) tags string.
    df = df.assign(tag_text=df["tags"].apply(parse_tag_names))

    df = df.drop(columns=['url', 'announced_date', 'tags', 'authors', 'tag_list'])

    # Dates look like "16 November, 2025"; keep submissions from 2000 through 2025.
    df['submitted_date'] = pd.to_datetime(df['submitted_date'], format="%d %B, %Y")
    in_range = (df['submitted_date'] >= "2000-01-01") & (df['submitted_date'] <= "2025-12-31")
    data_compute = df[in_range].reset_index(drop=True)

    data_compute.to_csv(CLEAN_DATASET_PATH, index=False)

# Always continue from the cached file so fresh and cached runs see the same data.
df = pd.read_csv(CLEAN_DATASET_PATH)

## Data Sampling

In [5]:
def sampling_per_year(data, frac=0.2, min_per_year=1000):
    data["submitted_date"] = pd.to_datetime(data["submitted_date"], errors="coerce")
    data["year"] = data["submitted_date"].dt.year

    def sample_group(g):
        n = max(int(len(g) * frac), min_per_year)
        n = min(n, len(g))
        return g.sample(n=n, random_state=1337)

    return (
        data.groupby("year", group_keys=False)
            .apply(sample_group, include_groups=False)
            .reset_index(drop=True)
    )

## Dataset Embedding

In [None]:
# Build the per-subject dataset used for embeddings and write it to disk.
dataset_emb = df.copy()
for i in ['stat']:
    # Build the mask from the same frame that is being filtered.
    d = dataset_emb[dataset_emb['prefix'] == i]
    d = d.drop(columns=['prefix'])
    # The cleaned data has one row per (paper, tag); once `prefix` is gone,
    # papers with several tags under this prefix are exact duplicates.
    d = d.drop_duplicates(ignore_index=False)
    print(f"Total data Before Sampling = {len(d)}")
    d = sampling_per_year(d)
    print(f"Total data After Sampling = {len(d)}")
    # to_csv does not create missing directories, so make sure the target exists.
    out_dir = f"../dataset/{i}/emb"
    os.makedirs(out_dir, exist_ok=True)
    d.to_csv(f"{out_dir}/v1.csv", index=False)

Total data Before Sampling = 122389
Total data After Sampling = 28965


## Dataset Bag of Words

In [10]:
import spacy
from gensim.models import Phrases, Word2Vec
from gensim.models.phrases import Phraser



# Load the small English pipeline with the 'parser' and 'ner' components
# disabled; the bag-of-words preprocessing below reads only lemma_, pos_ and
# is_stop from each token.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['parser', 'ner'],
)

# Lemmas removed from the bag-of-words vocabulary, kept as three named groups
# that are merged into the single set the preprocessing looks words up in.

# 1. ACADEMIC NOUNS
_BLOCKED_NOUNS = set("""
    study paper research result conclusion method methodology
    approach analysis implementation evaluation experiment data
    problem solution technique application system model user
    work case example comparison performance process task
    simulation platform framework component parameter value
    number time year state art feature challenge issue
""".split())

# 2. ACADEMIC ADJ/VERBS
_BLOCKED_ADJ_VERBS = set("""
    propose use base new exist different differ
    various significant good better best high low large
    small simple complex general novel main important
    improve demonstrate present show discuss describe provide
    consider include achieve obtain perform validate compare
    recent current future overall multiple several many
""".split())

# 3. CITATION/ARTIFACTS
_BLOCKED_ARTIFACTS = set("""
    et al fig figure table eq equation vol pp doi
    http https url www section ref reference cite
""".split())

custom_blocklist = _BLOCKED_NOUNS | _BLOCKED_ADJ_VERBS | _BLOCKED_ARTIFACTS

def preprocesing_bow(text):
    """Turn one document into a list of lowercase content-word lemmas.

    A token is kept when its part of speech is NOUN, ADJ or PROPN, it is not a
    stop word, its lowercased lemma is longer than two characters and that
    lemma is not in ``custom_blocklist``.

    Args:
        text: The document text. Non-string values (e.g. NaN read back from an
            empty CSV cell) and blank strings yield an empty list.

    Returns:
        list[str]: The kept lemmas, lowercased, in document order.
    """
    # Guard first: the spaCy pipeline cannot process non-string input.
    if not isinstance(text, str) or not text.strip():
        return []

    allowed_tags = {'NOUN', 'ADJ', 'PROPN'}
    cleaned_tokens = []
    for token in nlp(text):
        lemma = token.lemma_.lower()
        if (
            token.pos_ in allowed_tags
            and not token.is_stop
            and len(lemma) > 2
            and lemma not in custom_blocklist
        ):
            # Store the same lowercased form that was checked against the filters,
            # so case variants do not end up as separate vocabulary entries.
            cleaned_tokens.append(lemma)
    return cleaned_tokens

def pharse_modeling(tokens, min_count=5, threshold=0.5, scoring='npmi'):
    """Merge frequent word pairs and triples in tokenised documents.

    A bigram phrase model is trained on ``tokens``, then a second model is
    trained on the bigram-transformed corpus so that phrases of up to three
    original words can form. Both models are applied to every document.

    Args:
        tokens: Iterable of documents, each a list of string tokens.
        min_count: Minimum count passed to both ``Phrases`` models.
        threshold: Score threshold passed to both ``Phrases`` models.
        scoring: Scoring function name passed to both ``Phrases`` models.

    Returns:
        list: One transformed token list per input document, in input order.
        An empty input returns an empty list without training anything.
    """
    # The corpus is traversed several times, so materialise it once up front.
    tokens = list(tokens)
    if not tokens:
        return []

    bigram = Phrases(tokens, min_count=min_count, threshold=threshold, scoring=scoring)
    trigram = Phrases(bigram[tokens], min_count=min_count, threshold=threshold, scoring=scoring)
    # Frozen versions of the trained models are used for the final transformation.
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    return [trigram_mod[bigram_mod[doc]] for doc in tokens]


In [None]:
# Build the per-subject bag-of-words dataset and write it to disk.
for i in ['stat']:
    d = df[df['prefix'] == i]
    d = d.drop(columns=['prefix'])
    # The cleaned data has one row per (paper, tag); once `prefix` is gone,
    # papers with several tags under this prefix are exact duplicates.
    d = d.drop_duplicates(ignore_index=False)
    print(f"Total Data Before Sampling {len(d)}")
    d = sampling_per_year(d)
    print(f"Total Data After Sampling {len(d)}")
    # Tokenise/lemmatise each document, then merge frequent phrases corpus-wide.
    d['text'] = d['text'].apply(preprocesing_bow)
    d['text'] = pharse_modeling(d['text'].tolist())
    print("Done preprocess pharse modeling")
    # to_csv does not create missing directories, so make sure the target exists.
    out_dir = f"../dataset/{i}/bow"
    os.makedirs(out_dir, exist_ok=True)
    d.to_csv(f"{out_dir}/v1.csv", index=False)

Total Data Before Sampling 122389
Total Data After Sampling 28965
Start preprocess lemm, stopwords, pos
Done preprocess lemm, stopwords, pos
Start preprocess pharse modeling
Done preprocess pharse modeling
