In [1]:
import torch
import torch.nn as nn
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, random, re
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from optuna.pruners import SuccessiveHalvingPruner
from optuna.samplers import TPESampler
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
from tqdm.notebook import tqdm

from transformers import pipeline

import json
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.model_selection import train_test_split
import pickle

SEED = 15
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.cuda.amp.autocast(enabled=True)
device = torch.device("cuda")

2023-05-10 01:06:08.611215: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Tokens prep

In [14]:
fetched_data[0].primary_category

'cs.AI'

In [2]:
with open('papers_info.pickle', 'rb') as f:
    fetched_data = pickle.load(f)

tags_change = [
    (set(['quant-ph', 'physics', 'nucl-th', 'nucl-ex', 'nlin', 'math-ph', 'hep-th', 'hep-ph', 'hep-lat', 'hep-ex', 'gr-qc', 'cond-mat', 'astro-ph', 'adap-org']),
        'physics'),
    (['cmp-lg'], 'cs'),
]

tags_names = {
    'cs': 'Computer Science',
    'econ': 'Economics',
    'eess': 'Electrical Engineering and Systems Science',
    'math': 'Mathematics',
    'q-bio': 'Quantitative Biology',
    'q-fin': 'Quantitative Finance',
    'stat': 'Statistics',
    'physics': 'Physics'
}

tag_to_y = dict(zip(tags_names, range(len(tags_names))))

def tag_grouping(tag):
    # returns tag category as number / -1 if unkown
    tag = tag.split('.')[0]
    for tags, new_name in tags_change:
        if tag in tags:
            tag = new_name
            break
    return tag

MIN_PAPERS_NUM = 100

data = pd.DataFrame([{'title': paper.title,
                      'summary': paper.summary,
                      'category': tag_grouping(paper.primary_category)} for paper in fetched_data])



counts = data.category.value_counts()
too_few = counts[counts < MIN_PAPERS_NUM].index
data_trunc = data.query(f'category != -1 and category not in {list(too_few)}')
cat = pd.Categorical(data_trunc.category)
data_trunc = data_trunc.drop(columns=['category'])\
    .assign(category=cat.codes)

In [3]:
cat.categories

Index(['cs', 'math', 'physics', 'q-bio', 'stat'], dtype='object')

In [4]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

from imblearn.over_sampling import RandomOverSampler

train_data, val_data = train_test_split(data_trunc, test_size=.1, stratify=data_trunc.category)

def oversample_and_shuffle(df):
    counts = df.category.value_counts()
    num_samples = counts.loc[0] // 2
    sampling_strategy = dict(zip(range(1, len(counts)), [num_samples] * len(counts)))
    balanced, _ = RandomOverSampler(sampling_strategy=sampling_strategy).fit_resample(df, df.category)
    shuffled_indices = np.arange(len(balanced))
    np.random.shuffle(shuffled_indices)
    return balanced.iloc[shuffled_indices]

balanced_train, balanced_test = map(oversample_and_shuffle, [train_data, val_data])

from operator import itemgetter
from tqdm.notebook import tqdm

MAX_LENGTH_TITLE = 30
MAX_LENGTH_SUMM = 290

def batch_encode(tokenizer, df, return_tensors='pt', batch_size=256):
    # return list of batches (input_ids, attention_mask) and list of batches of y
    title, summ = df.title.to_list(), df.summary.to_list()
    y = df.category.values
    
    encoded = []
    ys = []
    getter = itemgetter('input_ids', 'attention_mask')
    
    for i in tqdm(range(0, len(title), batch_size)):
        batch_t = title[i:i+batch_size]
        batch_s = summ[i:i+batch_size]
        
        inputs = []
        for batch, max_length  in [(batch_t, MAX_LENGTH_TITLE), (batch_s, MAX_LENGTH_SUMM)]:
            inputs.extend(getter(tokenizer.batch_encode_plus(
                batch,
                max_length=max_length,
                padding='longest',
                truncation=True,
                return_attention_mask=True,
                return_token_type_ids=False,
                return_tensors=return_tensors
            )))
        
        encoded.append(inputs)
        ys.append(torch.nn.functional.one_hot(torch.tensor(y[i:i+batch_size]).to(int)).to(float))
    
    return encoded, ys

In [5]:
train_seq, val_seq = (batch_encode(tokenizer, df, batch_size=32) for df in [balanced_train, balanced_test])

  0%|          | 0/2929 [00:00<?, ?it/s]

  0%|          | 0/326 [00:00<?, ?it/s]

In [6]:
# import pickle

# with open('encoded.pickle', 'wb+') as f:
#     pickle.dump({'train': train_seq, 'val': val_seq}, f)

In [7]:
with open('encoded.pickle', 'rb') as f:
    pickle_obj = pickle.load(f)
train_seq, val_seq = pickle_obj['train'], pickle_obj['val']

In [8]:
from torch.utils.data import Dataset, DataLoader
import numpy as np

class MyDataset(Dataset):
    def __init__(self, seq):
        super().__init__()
        self.X, self.y = seq

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def get_prediction(logits_on_gpu):
    return logits_on_gpu.argmax(axis=1).flatten().detach().cpu().numpy()

def train(model, dataloader, optimizer, scaler, scheduler, use_title=True):
    train_total_loss = 0
    total_train_f1 = 0

    model.train()
    
    exp_mean_alpha = .8
    pbar = tqdm(dataloader)
    cur_acc = None
    for X, y_true in pbar:
        X = X[:2] if use_title else X[2:]
        X = [x.to(device) for x in X]
        y = y_true.to(device)
        
        optimizer.zero_grad()
        with autocast():
            ret = model(
                input_ids=X[0],
                attention_mask=X[1],
                labels=y
            )
            loss = ret.loss
            logits = ret.logits
            train_total_loss += loss.item()
            

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        y_pred = get_prediction(logits)
        y_true = get_prediction(y_true)
        total_train_f1 += f1_score(y_pred, y_true,
                                   average='weighted',
                                   labels=np.arange(len(cat.categories)))
        if cur_acc is None:
            cur_acc = accuracy_score(y_pred, y_true)
        else:
            cur_acc = cur_acc * exp_mean_alpha + accuracy_score(y_pred, y_true) * (1 - exp_mean_alpha)
        pbar.set_description(f"Accuracy: {cur_acc:.4f}")
        

    avg_train_loss = train_total_loss / len(dataloader)
    avg_train_f1 = total_train_f1 / len(dataloader)
    
    torch.cuda.empty_cache()
    
    return {
        'loss': avg_train_loss,
        'f1': avg_train_f1,
        'accuracy': cur_acc
    }

def validate(model, dataloader, use_title=True):
    model.eval()

    metrics = {"f1": 0, "precision": 0, "recall": 0, "loss": 0, "accuracy": 0}
    
    for X, y_true in tqdm(dataloader):
        X = X[:2] if use_title else X[2:]
        X = [x.to(device) for x in X]
        y = y_true.to(device)
                                        
                                        
        with torch.no_grad():
            ret = model(
                input_ids=X[0],
                attention_mask=X[1],
                labels=y
            )
            loss = ret.loss
            logits = ret.logits

        metrics["loss"] += loss.item()
        y_pred = get_prediction(logits)
        y_true = get_prediction(y_true)
        
        for func, metric in ((f1_score, "f1"), (precision_score, "precision"), (recall_score, "recall")):
            metrics[metric] += func(y_pred, y_true, average="weighted", labels=np.arange(len(cat.categories)))
        metrics["accuracy"] += accuracy_score(y_pred, y_true)

    for metric in metrics:
        metrics[metric] /= len(dataloader)

    return metrics

In [9]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(cat.categories)
)
model.cuda();

train_set = MyDataset(train_seq)
val_set = MyDataset(val_seq)

epochs = 15
total_steps = len(train_set) * epochs

optimizer = AdamW(model.parameters(),
                  lr=3.306430502342561e-06,
                  weight_decay=0.65
                )

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
scaler = GradScaler()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

In [10]:
for _ in range(15):
    print(train(model, train_set, optimizer, scaler, scheduler, True))
    print(validate(model, val_set, True))

  0%|          | 0/367 [00:00<?, ?it/s]

{'loss': 0.48100709549858073, 'f1': 0.4920431593445377, 'accuracy': 0.556507879791251}


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'f1': 0.6074861778435929, 'precision': 0.6834523725509428, 'recall': 0.5647119259021537, 'loss': 0.3921090804902194, 'accuracy': 0.5647119259021537}


  0%|          | 0/367 [00:00<?, ?it/s]

{'loss': 0.3218745007060749, 'f1': 0.6990466593784981, 'accuracy': 0.7470224746133077}


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


  0%|          | 0/41 [00:00<?, ?it/s]

{'f1': 0.5942172991036576, 'precision': 0.6271263714882545, 'recall': 0.587195567679361, 'loss': 0.358150273103308, 'accuracy': 0.587195567679361}


  0%|          | 0/367 [00:00<?, ?it/s]

{'loss': 0.23489786594814352, 'f1': 0.7856764077523813, 'accuracy': 0.8023062110546897}


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


  0%|          | 0/41 [00:00<?, ?it/s]

{'f1': 0.5693373504707115, 'precision': 0.6122736314566605, 'recall': 0.5615684281842819, 'loss': 0.4018565448321144, 'accuracy': 0.5615684281842819}


  0%|          | 0/367 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [28]:
train(model, train_set, optimizer, scaler, scheduler)

validate(model, val_set)

  0%|          | 0/92 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

{'f1': 0.6501755142749056,
 'precision': 0.6787277013744901,
 'recall': 0.644060880183413,
 'loss': 0.3369613107804131,
 'accuracy': 0.644060880183413}