In [1]:
import os
import findspark
import pyspark

spark_home = os.path.abspath(os.getcwd() + "/../spark-3.5.5-bin-hadoop3")
hadoop_home = os.path.abspath(os.getcwd() + "/../winutils")

if os.name == 'nt':
    os.environ["HADOOP_HOME"] = hadoop_home
    os.environ["PATH"] = os.path.join(hadoop_home, "bin") + ";" + os.environ["PATH"]

findspark.init(spark_home)
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession.builder.getOrCreate()

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType

schema = StructType() \
    .add("aid", StringType()) \
    .add("title", StringType()) \
    .add("summary", StringType()) \
    .add("main_category", StringType()) \
    .add("categories", StringType()) \
    .add("published", StringType())

data_path = "file:///C:/spark_project/spark/notebooks/arxiv_streamed_data/saved_data-*/part-*"
raw_df = spark.read.text(data_path)
json_df = raw_df.select(from_json(col("value"), schema).alias("data")).select("data.*")


json_df = json_df.dropDuplicates(["aid"])


In [2]:
from pyspark.sql.functions import concat_ws, udf, col
from pyspark.sql.types import ArrayType, StringType

json_df = json_df.withColumn("text", concat_ws(" ", col("title"), col("summary")))


def extract_domains(category_str):
    if category_str is None:
        return []
    return list(set([x.strip().split('.')[0].split('-')[0] for x in category_str.split(',')]))

extract_domains_udf = udf(extract_domains, ArrayType(StringType()))
json_df = json_df.withColumn("labels", extract_domains_udf(col("categories")))


In [3]:
pdf = json_df.select("aid", "text", "labels").dropna(subset=["text", "labels"]).toPandas()


In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").eval().to("cuda" if torch.cuda.is_available() else "cpu")

device = model.device

def get_embedding(text):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        cls = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return cls.squeeze().cpu().numpy()

embeddings = np.stack([get_embedding(t) for t in tqdm(pdf["text"])])


100%|████████████████████████████████████████████████████████████████████████████| 6590/6590 [1:01:59<00:00,  1.77it/s]


In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(pdf["labels"])


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(embeddings, Y)

pred = clf.predict(embeddings)
print(classification_report(Y, pred, target_names=mlb.classes_))


              precision    recall  f1-score   support

          62       1.00      1.00      1.00         1
          81       1.00      1.00      1.00         1
          92       1.00      1.00      1.00         1
          94       1.00      1.00      1.00         1
           A       1.00      1.00      1.00         3
           D       1.00      1.00      1.00         1
           F       1.00      1.00      1.00         4
           G       1.00      0.86      0.92        14
           H       1.00      0.82      0.90        11
           I       1.00      0.40      0.57        40
           J       1.00      1.00      1.00         7
           M       1.00      1.00      1.00         1
       astro       1.00      1.00      1.00       457
        cond       0.97      0.95      0.96       567
          cs       0.95      0.96      0.95      3344
        econ       1.00      1.00      1.00        57
        eess       0.91      0.73      0.81       446
          gr       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
from collections import Counter

all_labels = [label for sublist in pdf["labels"] for label in sublist]
label_counts = Counter(all_labels)


In [8]:
import pandas as pd

label_df = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
label_df = label_df.sort_values(by='count', ascending=False)
print(label_df)


         count
cs        3344
math      1427
physics    605
cond       567
astro      457
eess       446
quant      420
hep        406
stat       271
gr         208
q          167
nucl        79
econ        57
nlin        55
I           40
G           14
H           11
J            7
F            4
A            3
62           1
94           1
M            1
81           1
D            1
92           1


In [10]:
valid_labels = [
    'cs', 'math', 'physics', 'cond', 'astro', 'eess',
    'quant', 'hep', 'stat', 'gr', 'q', 'nucl', 'econ', 'nlin'
]

pdf["labels"] = pdf["labels"].apply(lambda lst: [x for x in lst if x in valid_labels])


pdf = pdf[pdf["labels"].map(len) > 0].reset_index(drop=True)


from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(pdf["labels"])


In [11]:

X_filtered = embeddings[pdf.index]  # Same shape as new Y

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_filtered, Y)

Y_pred = clf.predict(X_filtered)

print(classification_report(Y, Y_pred, target_names=mlb.classes_, zero_division=0))

import joblib

joblib.dump(clf, "cleaned_classifier.pkl")
joblib.dump(mlb, "label_binarizer.pkl")
np.save("filtered_embeddings.npy", X_filtered)


              precision    recall  f1-score   support

       astro       1.00      1.00      1.00       457
        cond       0.97      0.95      0.96       567
          cs       0.95      0.96      0.95      3344
        econ       1.00      1.00      1.00        57
        eess       0.91      0.73      0.81       446
          gr       1.00      0.98      0.99       208
         hep       0.96      0.92      0.94       406
        math       0.93      0.88      0.91      1427
        nlin       1.00      0.93      0.96        55
        nucl       1.00      0.97      0.99        79
     physics       0.88      0.73      0.80       605
           q       1.00      0.95      0.98       167
       quant       0.98      0.95      0.96       420
        stat       0.99      0.89      0.93       271

   micro avg       0.95      0.91      0.93      8509
   macro avg       0.97      0.92      0.94      8509
weighted avg       0.95      0.91      0.93      8509
 samples avg       0.95   

In [12]:
import joblib
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

clf = joblib.load("cleaned_classifier.pkl")
mlb = joblib.load("label_binarizer.pkl")

tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased").eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scibert.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31090, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [13]:
def predict_article_labels(title: str, summary: str):
    text = f"{title.strip()} {summary.strip()}"
    
    with torch.no_grad():
        tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        tokens = {k: v.to(device) for k, v in tokens.items()}
        outputs = scibert(**tokens)
        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    
    pred_binary = clf.predict(cls_embedding)
    pred_labels = mlb.inverse_transform(pred_binary)[0]
    
    return pred_labels


In [15]:
def predict_article_labels_with_scores(title: str, summary: str, threshold=0.5):
    text = f"{title.strip()} {summary.strip()}"
    
    with torch.no_grad():
        tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        tokens = {k: v.to(device) for k, v in tokens.items()}
        outputs = scibert(**tokens)
        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    proba = clf.predict_proba(cls_embedding)[0]
    labels = [label for label, p in zip(mlb.classes_, proba) if p >= threshold]
    
    return labels, proba


In [14]:
title = "A new approach to gradient descent methods in high-dimensional optimization"
summary = "We explore novel convergence properties of adaptive learning rates under nonconvex assumptions..."

predicted = predict_article_labels(title, summary)
print("Predicted labels:", predicted)


Predicted labels: ('math', 'stat')


In [16]:
labels, probs = predict_article_labels_with_scores(title, summary, threshold=0.4)

print("Predicted labels:", labels)
for label, p in zip(mlb.classes_, probs):
    print(f"{label:10} → {p:.3f}")


Predicted labels: ['math', 'stat']
astro      → 0.003
cond       → 0.001
cs         → 0.366
econ       → 0.114
eess       → 0.009
gr         → 0.000
hep        → 0.154
math       → 0.721
nlin       → 0.001
nucl       → 0.001
physics    → 0.002
q          → 0.017
quant      → 0.000
stat       → 0.560
