In [3]:
from collections import Counter, defaultdict
from typing import List, Dict, Literal, Union

import re
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
ds = load_dataset("UniverseTBD/arxiv-abstracts-large")
ds

Generating train split: 100%|██████████| 2292057/2292057 [00:03<00:00, 723035.26 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'],
        num_rows: 2292057
    })
})

In [None]:
for i in range(3):
    print(f"Example {i + 1}:")
    print(ds["train"][i]["abstract"])
    print(ds["train"][i]["categories"])
    print("---" * 20)

Example 1:
  A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
that enhanced sensitivity to the signal can be obtained with judicious
selection of events.

hep-p

In [None]:
all_categories = ds["train"]["categories"]
categories_set = set()

# Collect unique labels
for category in all_categories:
    parts = category.split(" ")
    for part in parts:
        topic = part.split(".")[0]
        categories_set.add(topic)

# Sort the labels and print them
sorted_categories = sorted(list(categories_set), key=lambda x: x.lower())
print(f"There are {len(sorted_categories)} unique primary categories in the dataset:")
for category in sorted_categories:
    print(category)

There are 38 unique primary categories in the dataset:
acc-phys
adap-org
alg-geom
ao-sci
astro-ph
atom-ph
bayes-an
chao-dyn
chem-ph
cmp-lg
comp-gas
cond-mat
cs
dg-ga
econ
eess
funct-an
gr-qc
hep-ex
hep-lat
hep-ph
hep-th
math
math-ph
mtrl-th
nlin
nucl-ex
nucl-th
patt-sol
physics
plasm-ph
q-alg
q-bio
q-fin
quant-ph
solv-int
stat
supr-con


In [None]:
# load samples with single label beloging to specific categories
samples = []
CATEGORIES_TO_SELECT = ["astro-ph", "cond-mat", "cs", "math", "physics"]
for s in ds["train"]:
    if len(s["categories"].split(" ")) != 1:
        continue

    cur_category = s["categories"].strip().split(".")[0]
    if cur_category not in CATEGORIES_TO_SELECT:
        continue
    samples.append(s)

    if len(samples) >= 1000:
        break

print(f"Number of samples: {len(samples)}")

Number of samples: 1000


Tiền xử lý dữ liệu

In [None]:
preprocessed_samples = []
for s in samples:
    abstract = s["abstract"]

    # Remove \n
    abstract = abstract.strip().replace("\n", " ")

    # Remove special character
    abstract = re.sub(r"[^\w\s]", "", abstract)

    # Remove digits
    abstract = re.sub(r"\d+", "", abstract)

    # Remove extra spaces
    abstract = re.sub(r"\s+", " ", abstract).strip()

    # Convert to lower case
    abstract = abstract.lower()

    # for the label, only keep the first part
    parts = s["categories"].split(" ")
    category = parts[0].split(".")[0]

    preprocessed_samples.append({"text": abstract, "label": category})

In [17]:
print(preprocessed_samples)

[{'text': 'the evolution of earthmoon system is described by the dark matter field fluid model proposed in the meeting of division of particle and field american physical society the current behavior of the earthmoon system agrees with this model very well and the general pattern of the evolution of the moonearth system described by this model agrees with geological and fossil evidence the closest distance of the moon to earth was about km at billion years ago which is far beyond the roches limit the result suggests that the tidal friction may not be the primary cause for the evolution of the earthmoon system the average dark matter field fluid constant derived from earthmoon system data is x sm this model predicts that the marss rotation is also slowing with the angular acceleration rate about x rad s', 'label': 'physics'}, {'text': 'we show that a determinant of stirling cycle numbers counts unlabeled acyclic singlesource automata the proof involves a bijection from these automata to

In [18]:
label_to_id = {label: i for i, label in enumerate(sorted_categories)}
id_to_label = {i: label for i, label in enumerate(sorted_categories)}

# Print label to ID mapping
print("Label to ID mapping:")
for label, id_ in label_to_id.items():
    print(f"{label} ---> {id_}")

Label to ID mapping:
acc-phys ---> 0
adap-org ---> 1
alg-geom ---> 2
ao-sci ---> 3
astro-ph ---> 4
atom-ph ---> 5
bayes-an ---> 6
chao-dyn ---> 7
chem-ph ---> 8
cmp-lg ---> 9
comp-gas ---> 10
cond-mat ---> 11
cs ---> 12
dg-ga ---> 13
econ ---> 14
eess ---> 15
funct-an ---> 16
gr-qc ---> 17
hep-ex ---> 18
hep-lat ---> 19
hep-ph ---> 20
hep-th ---> 21
math ---> 22
math-ph ---> 23
mtrl-th ---> 24
nlin ---> 25
nucl-ex ---> 26
nucl-th ---> 27
patt-sol ---> 28
physics ---> 29
plasm-ph ---> 30
q-alg ---> 31
q-bio ---> 32
q-fin ---> 33
quant-ph ---> 34
solv-int ---> 35
stat ---> 36
supr-con ---> 37


In [None]:
X_full = [sample["text"] for sample in preprocessed_samples]
y_full = [label_to_id[sample["label"]] for sample in preprocessed_samples]

X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42, stratify=y_full
)
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Training samples: 800
Test samples: 200


Mã hóa văn bản

In [None]:
docs = [
    "I am going to school to study for the final exam.",
    "The weather is nice today and I feel happy.",
    "I love programming in Python and exploring new libraries.",
    "Data science is an exciting field with many opportunities.",
]

bow = CountVectorizer()
vectors = bow.fit_transform(docs)

for i, vec in enumerate(vectors):
    print(f"Document {i + 1}: {vec.toarray()}")

Document 1: [[1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 2 0 0 0]]
Document 2: [[0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0]]
Document 3: [[0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0]]
Document 4: [[0 1 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1]]
