In [3]:
from collections import Counter, defaultdict
from typing import List, Dict, Literal, Union

import re
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Load the arXiv abstracts dataset by its Hugging Face repository id.
# The recorded output shows a single "train" split with ~2.29M rows.
ds = load_dataset("UniverseTBD/arxiv-abstracts-large")
# Bare last expression so the notebook renders the DatasetDict summary
# (available splits, feature names and row counts).
ds

Generating train split: 100%|██████████| 2292057/2292057 [00:03<00:00, 723035.26 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'],
        num_rows: 2292057
    })
})

In [None]:
# Peek at the first three records: abstract text, then its raw category
# string, then a horizontal rule to separate the examples.
divider = "---" * 20
for idx in range(3):
    record = ds["train"][idx]  # fetch the row once and reuse it
    print(
        f"Example {idx + 1}:",
        record["abstract"],
        record["categories"],
        divider,
        sep="\n",
    )

Example 1:
  A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
that enhanced sensitivity to the signal can be obtained with judicious
selection of events.

hep-p

In [None]:
# Each record's "categories" field is a whitespace-separated list of arXiv
# labels such as "hep-ph" or "cs.AI math.ST".
all_categories = ds["train"]["categories"]

# Collect the unique top-level archives, i.e. the part before the first dot
# ("cs.AI" -> "cs"; labels without a dot are kept as-is).
# str.split() with no argument ignores leading/trailing/repeated whitespace,
# so a padded or double-spaced field can never add an empty label to the set.
categories_set = {
    label.split(".")[0]
    for category_field in all_categories
    for label in category_field.split()
}

# Sort the labels case-insensitively and print them
sorted_categories = sorted(categories_set, key=str.lower)
print(f"There are {len(sorted_categories)} unique primary categories in the dataset:")
for category in sorted_categories:
    print(category)

There are 38 unique primary categories in the dataset:
acc-phys
adap-org
alg-geom
ao-sci
astro-ph
atom-ph
bayes-an
chao-dyn
chem-ph
cmp-lg
comp-gas
cond-mat
cs
dg-ga
econ
eess
funct-an
gr-qc
hep-ex
hep-lat
hep-ph
hep-th
math
math-ph
mtrl-th
nlin
nucl-ex
nucl-th
patt-sol
physics
plasm-ph
q-alg
q-bio
q-fin
quant-ph
solv-int
stat
supr-con


In [None]:
# Load samples with a single label belonging to specific categories.
# NOTE: the loop stops at the first MAX_SAMPLES matches in dataset order, so
# the subset is not a random sample of the dataset.
samples = []
CATEGORIES_TO_SELECT = ["astro-ph", "cond-mat", "cs", "math", "physics"]
MAX_SAMPLES = 1000  # stop scanning once this many samples are collected

for s in ds["train"]:
    # Tokenise once with str.split(): it ignores leading/trailing/repeated
    # whitespace, so a padded single label (e.g. "cs.AI ") is still counted as
    # exactly one label instead of being rejected as multi-label.
    labels = s["categories"].split()
    if len(labels) != 1:
        continue

    # Keep only the top-level archive: "cs.AI" -> "cs".
    cur_category = labels[0].split(".")[0]
    if cur_category not in CATEGORIES_TO_SELECT:
        continue
    samples.append(s)

    if len(samples) >= MAX_SAMPLES:
        break

print(f"Number of samples: {len(samples)}")

Number of samples: 1000
