# Preparing the data

In [1]:
import json
from collections import Counter
from dataclasses import dataclass

import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
#data from https://www.kaggle.com/neelshah18/arxivdataset/

with open('arxivData.json') as f:
    arxiv_data = json.load(f)

In [3]:
@dataclass
class Paper:
    title: str
    summary: str
    tags: list[str]

In [4]:
papers = []
for paper_json in arxiv_data:
    tags = [i['term'] for i in eval(paper_json['tag'])]
    paper = Paper(paper_json['title'], paper_json['summary'], tags)
    papers.append(paper)

In [5]:
tags_frequency = Counter(sum([paper.tags for paper in papers], []))

In [6]:
tags_frequency.most_common(66)

[('cs.CV', 13902),
 ('cs.LG', 13735),
 ('cs.AI', 10481),
 ('stat.ML', 10326),
 ('cs.CL', 6417),
 ('cs.NE', 3819),
 ('cs.IR', 1443),
 ('math.OC', 1020),
 ('cs.RO', 973),
 ('cs.LO', 643),
 ('cs.SI', 639),
 ('cs.DS', 570),
 ('cs.IT', 543),
 ('math.IT', 543),
 ('q-bio.NC', 513),
 ('stat.ME', 458),
 ('cs.HC', 434),
 ('cs.CR', 411),
 ('cs.DC', 404),
 ('cs.SD', 389),
 ('cs.CY', 376),
 ('stat.AP', 360),
 ('cs.MM', 345),
 ('math.ST', 336),
 ('stat.TH', 336),
 ('cs.DB', 327),
 ('cs.GT', 318),
 ('I.2.7', 313),
 ('physics.soc-ph', 293),
 ('cs.CE', 285),
 ('cs.SY', 270),
 ('cs.MA', 268),
 ('stat.CO', 260),
 ('cs.NA', 253),
 ('q-bio.QM', 232),
 ('cs.GR', 225),
 ('cs.CC', 196),
 ('physics.data-an', 187),
 ('cs.SE', 180),
 ('math.NA', 172),
 ('math.PR', 144),
 ('quant-ph', 142),
 ('cs.DL', 139),
 ('cs.NI', 137),
 ('I.2.6', 131),
 ('cs.PL', 128),
 ('cond-mat.dis-nn', 126),
 ('nlin.AO', 119),
 ('cmp-lg', 110),
 ('cs.DM', 101),
 ('I.2.4', 97),
 ('cs.CG', 94),
 ('eess.AS', 89),
 ('cs.ET', 85),
 ('eess.IV'

In [7]:
tag_to_label = {i[0]: num for num, i in enumerate(tags_frequency.most_common(50))}
label_to_tag= {value: key for key, value in tag_to_label.items()}

most_common_tags_set = set(label_to_tag.values())

In [8]:
most_common_human_readable = {
    'cs.CV': 'Computer Vision and Pattern Recognition',
    'cs.LG': 'Machine Learning',
    'cs.AI': 'Artificial Intelligence',
    'stat.ML': 'Machine Learning',
    'cs.CL': 'Computation and Language',
    'cs.NE': 'Neural and Evolutionary Computing',
    'cs.IR': 'Information Retrieval',
    'math.OC': 'Optimization and Control',
    'cs.RO': 'Robotics',
    'cs.LO': 'Logic in Computer Science',
    'cs.SI': 'Social and Information Networks',
    'cs.DS': 'Data Structures and Algorithms',
    'cs.IT': 'Information Theory',
    'math.IT': 'Information Theory',
    'q-bio.NC': 'Quantitative Biology - Neurons and Cognition',
    'stat.ME': 'Methodology',
    'cs.HC': 'Human-Computer Interaction',
    'cs.CR': 'Cryptography and Security',
    'cs.DC': 'Distributed, Parallel, and Cluster Computing',
    'cs.SD': 'Sound',
    'cs.CY': 'Computers and Society',
    'stat.AP': 'Applications',
    'cs.MM': 'Multimedia',
    'math.ST': 'Statistics Theory',
    'stat.TH': 'Statistics Theory',
    'cs.DB': 'Databases',
    'cs.GT': 'Computer Science and Game Theory',
    'I.2.7': 'Natural Language Processing',
    'physics.soc-ph': 'Physics and Society',
    'cs.CE': 'Computational Engineering, Finance, and Science',
    'cs.SY': 'Systems and Control',
    'cs.MA': 'Multiagent Systems',
    'stat.CO': 'Combinatorics',
    'cs.NA': 'Numerical Analysis',
    'q-bio.QM': 'Quantitative Biology - Quantitative Methods',
    'cs.GR': 'Graphics',
    'cs.CC': 'Computational Complexity',
    'physics.data-an': 'Data Analysis, Statistics and Probability',
    'cs.SE': 'Software Engineering',
    'math.NA': 'Numerical Analysis',
    'math.PR': 'Probability',
    'quant-ph': 'Quantum Physics',
    'cs.DL': 'Digital Libraries',
    'cs.NI': 'Networking and Internet Architecture',
    'I.2.6': 'Learning',
    'cs.PL': 'Programming Languages',
    'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
    'nlin.AO': 'Adaptation and Self-Organizing Systems',
    'cmp-lg': 'Computation and Language',
    'cs.DM': 'Discrete Mathematics',
    'I.2.4': 'Knowledge Representation and Reasoning',
    'cs.CG': 'Computational Geometry',
    'eess.AS': 'Audio and Speech Processing',
    'cs.ET': 'Emerging Technologies',
    'eess.IV': 'Image and Video Processing',
    'cond-mat.stat-mech': 'Statistical Mechanics',
    'q-bio.PE': 'Quantitative Biology - Populations and Evolution',
    'physics.med-ph': 'Medical Physics',
    'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
    '68T50': 'Algorithms',
    'cs.MS': 'Mathematical Software',
    '68T05': 'Design and Analysis of Algorithms',
    'I.2.3': 'Deduction and Theorem Proving',
    'physics.optics': 'Optics',
    'cs.AR': 'Hardware Architecture',
    'eess.SP': 'Signal Processing'}

In [9]:
id2label = {key: most_common_human_readable[value] for key, value in label_to_tag.items()}

In [10]:
papers_train, papers_val = train_test_split(papers, test_size=0.15, random_state=42)

# Creating Pytorch dataset

In [11]:
import torch
from torch.utils.data import Dataset

In [12]:
def tags_to_vector(tags):
    labels = []
    for tag in tags:
        if tag in most_common_tags_set:
            labels.append(tag_to_label[tag])
    labels = torch.tensor(labels)
    labels_vector = torch.zeros(50, dtype=torch.float)
    labels_vector[labels] = 1
    return labels_vector

In [13]:
class PaperDataset:
    def __init__(self, papers):
        self.papers = papers
    
    def __len__(self):
        return len(self.papers)
        
    def get_text(self, idx):
        paper = self.papers[idx]
        
        text = f"TITLE\n{paper.title}\n\nABSTRACT\n{paper.summary}"
        
        return text
        
    def tokenize_all(self, tokenizer):
        texts = [self.get_text(num) for num in range(len(self.papers))]
        self.encodings = tokenizer(texts, truncation=True, padding=True)
        return self
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = tags_to_vector(self.papers[idx].tags)
        item['text'] = self.get_text(idx)
        return item

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Wi/arxiv-topics-distilbert-base-cased")

dataset_train = PaperDataset(papers_train).tokenize_all(tokenizer)
dataset_val = PaperDataset(papers_val).tokenize_all(tokenizer)

## Train model 

In [15]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("Wi/arxiv-topics-distilbert-base-cased",
                                                           num_labels=50, problem_type='multi_label_classification',
                                                           ignore_mismatched_sizes=True,
                                                           id2label=id2label)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Wi/arxiv-topics-distilbert-base-cased and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([21, 768]) in the checkpoint and torch.Size([50, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([21]) in the checkpoint and torch.Size([50]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="arxiv-topics-distilbert-base-cased", evaluation_strategy="epoch", 
                                  num_train_epochs=4, bf16=True, per_device_train_batch_size=8)

In [23]:
from sklearn.metrics import precision_score, recall_score

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = sigmoid(logits) > 0.5
    labels = labels > 0.5
    return {'accuracy': (labels == predictions).mean(), 'precision':  (labels * predictions).sum()/predictions.sum(),
            'recall': (labels * predictions).sum()/labels.sum()}

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.0811,0.077994,0.973363,0.769674,0.382085
2,0.0779,0.074298,0.974641,0.747806,0.457
3,0.0742,0.073029,0.974872,0.747886,0.466476
4,0.0751,0.072446,0.97508,0.758171,0.462453


TrainOutput(global_step=17428, training_loss=0.07906670381039525, metrics={'train_runtime': 404.3941, 'train_samples_per_second': 344.713, 'train_steps_per_second': 43.097, 'total_flos': 1.8481762449408e+16, 'train_loss': 0.07906670381039525, 'epoch': 4.0})

## Check output

In [35]:
item = dataset_val[857]

probs = torch.sigmoid(model(item['input_ids'].cuda()[None]).logits)[0].cpu().detach().numpy()

print(item['text'], '\n')
print(compute_metrics((model(item['input_ids'].cuda()[None]).logits.cpu().detach().numpy()[0], item['labels'].cpu().numpy())))

for tag, prob, true_prob in zip(label_to_tag.values(), probs, item['labels']):
    print(f'{prob:.2f}\t{true_prob}\t{"!!!!" if( prob > 0.1 )== 1 or true_prob == 1 else ""}\t{most_common_human_readable[tag]}')

TITLE
Robust features for facial action recognition

ABSTRACT
Automatic recognition of facial gestures is becoming increasingly important
as real world AI agents become a reality. In this paper, we present an
automated system that recognizes facial gestures by capturing local changes and
encoding the motion into a histogram of frequencies. We evaluate the proposed
method by demonstrating its effectiveness on spontaneous face action
benchmarks: the FEEDTUM dataset, the Pain dataset and the HMDB51 dataset. The
results show that, compared to known methods, the new encoding methods
significantly improve the recognition accuracy and the robustness of analysis
for a variety of applications. 

{'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0}
0.82	1.0	!!!!	Computer Vision and Pattern Recognition
0.16	0.0	!!!!	Machine Learning
0.13	0.0	!!!!	Artificial Intelligence
0.07	0.0		Machine Learning
0.13	0.0	!!!!	Computation and Language
0.03	0.0		Neural and Evolutionary Computing
0.03	0.0		Informatio

In [None]:
trainer.save_model("arxiv-topics-distilbert-base-cased_2_epochs")