In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import RobertaTokenizerFast, RobertaModel, RobertaForSequenceClassification
from transformers import PretrainedConfig

In [42]:
# Load pretrained RoBERTa with a sequence-classification head.
# num_labels=11 matches the 11 arXiv categories listed in the dataset's
# ClassLabel feature (math.AC, cs.CV, ..., math.ST).
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=11)
# `model_max_length` is the tokenizer-level setting for the longest input;
# a bare `max_length` keyword is not a tokenizer constructor option and is
# only meaningful when passed to the tokenizer call itself.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=512)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from datasets import load_dataset

# Load the arXiv classification dataset; it provides train, validation and test splits.
# NOTE(review): the loader warns that `trust_remote_code=True` will become mandatory
# for this dataset — confirm against the installed `datasets` version.
dataset = load_dataset("ccdv/arxiv-classification")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/150M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Let's look at the names of the classes.

In [6]:
# Show the train split's schema; the ClassLabel names give the label-id -> category mapping.
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['math.AC', 'cs.CV', 'cs.AI', 'cs.SY', 'math.GR', 'cs.CE', 'cs.PL', 'cs.IT', 'cs.DS', 'cs.NE', 'math.ST'], id=None)}

In [7]:
# Display the module tree of the loaded model to inspect its layers.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

As we can see, the classifier head has been replaced with a newly initialized one for our labels, so we are ready to fine-tune the model.

### Tokenizing data
The 'text' field contains a lot of unneeded information such as authors, publication details, etc. All we really need is the introduction, so we could preprocess the text and narrow it down to the context size of 512 tokens. However, here I'll test whether the model is able to learn with all of this information left in, so the only thing applied is truncation.


In [32]:
# Work on the first `train_len` training rows and a test slice one tenth that size.
train_len = 3000
test_len = train_len // 10

train_data = dataset['train'].select(list(range(train_len)))
test_data = dataset['test'].select(list(range(test_len)))

In [33]:
def rename_column(example):
    """Move the class id from the ``label`` key to the ``labels`` key.

    The mapping is modified in place and the same object is returned.
    Raises ``KeyError`` if ``label`` is missing.
    """
    example['labels'] = example.pop('label')
    return example

def tokenize_fn(text, max_length=512):
    """Tokenize the ``'text'`` field of an example or batch of examples.

    Args:
        text: mapping with a ``'text'`` entry (a string, or a list of strings
            when used with ``Dataset.map(..., batched=True)``).
        max_length: number of tokens every sequence is truncated/padded to.

    Returns:
        The tokenizer output for the given text(s), as PyTorch tensors.
    """
    # Without padding, `return_tensors='pt'` only succeeds when every text in
    # the batch happens to be truncated to exactly `max_length` tokens; any
    # shorter document would give ragged rows that cannot form a tensor (and
    # could not be stacked into a training batch later). Padding to
    # `max_length` makes every row the same size.
    return tokenizer(
        text['text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )

In [34]:
# Tokenize each split in batches, then move the class id from `label` to `labels`.
train_data = train_data.map(tokenize_fn, batched=True).map(rename_column)
test_data = test_data.map(tokenize_fn, batched=True).map(rename_column)

# Return only these columns, as torch tensors, when the splits are indexed.
for split in (train_data, test_data):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [35]:
# Check the columns and row count after tokenization and the label rename.
train_data

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})

### Finetune

In [51]:
# define accuracy metrics
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from datasets import load_metric

# Accuracy metric object loaded through `datasets.load_metric`.
# NOTE(review): the loader warns that `trust_remote_code=True` will become mandatory
# for this metric — confirm it still loads on the installed `datasets` version.
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair, where ``predictions``
            holds per-class scores with classes along axis 1 and ``labels``
            holds the reference class ids.

    Returns:
        ``{'accuracy': float}`` — the fraction of rows whose highest-scoring
        class equals the reference label.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    # Computed locally with numpy so the function does not depend on a metric
    # script fetched at runtime (whose loader warns about `trust_remote_code`).
    accuracy = float(np.mean(predicted_ids == np.asarray(labels)))
    return {'accuracy': accuracy}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [52]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: 3 epochs, evaluating and logging every 25 steps,
# with no external experiment-tracking integration.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='/kaggle/working/results',
    logging_dir='/kaggle/working/logs',
    # schedule
    num_train_epochs=3,
    # evaluation cadence
    evaluation_strategy="steps",
    eval_steps=25,
    # logging cadence
    logging_strategy="steps",
    logging_steps=25,
    report_to='none',
)

In [53]:
# Build the Trainer from the model, its arguments, both data splits and the
# accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

# Report whether a CUDA device is available; `device` is only displayed here
# and is not referenced again in the visible cells.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


'cuda'

In [54]:
# Run fine-tuning; per training_args, evaluation and logging happen every 25 steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
25,1.1626,1.277125,0.543333
50,1.0941,1.210027,0.576667
75,0.9742,1.144572,0.7
100,0.9082,1.231181,0.506667
125,1.1323,0.917786,0.716667
150,1.0286,1.072621,0.643333
175,1.0908,0.842183,0.746667
200,1.0894,0.902441,0.733333
225,1.0034,0.945321,0.723333
250,0.8829,0.886337,0.74


TrainOutput(global_step=1125, training_loss=0.6971992331610786, metrics={'train_runtime': 721.5364, 'train_samples_per_second': 12.473, 'train_steps_per_second': 1.559, 'total_flos': 2368190850048000.0, 'train_loss': 0.6971992331610786, 'epoch': 3.0})

As we can see, the model still trains, but we can probably achieve better results with proper preprocessing.