In [35]:
import torch
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pathlib import Path
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch import nn, optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
print("CUDA available:", torch.cuda.is_available())
print("device count:", torch.cuda.device_count())
print("device name:", torch.cuda.get_device_name(0))

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE

CUDA available: True
device count: 1
device name: NVIDIA GeForce GTX 980M


device(type='cuda', index=0)

https://www.kaggle.com/code/colinlagator/pytorch-bert-multi-label/notebook

https://discuss.huggingface.co/t/download-models-for-local-loading/1963

In [3]:
path_data_folder = Path().cwd().parent.parent / "data"
path_interim_folder = path_data_folder / "interim"
path_label_folder = path_data_folder / "processed" / "labels" / "labels_complete"

# load the labels.csv from the path_label_folder
df = pd.read_csv(path_label_folder / "labels.csv", dtype={"id": str})
df.head()

Unnamed: 0,id,pattern,token_count,update_date,label,para
0,1710.02907,"data, dataset",280,2022-04-21,0,"Experiment 2: In this set of experiments, we e..."
1,1811.11012,data,195,2022-04-21,0,This section of the technical report is focuse...
2,1811.11012,"data, dataset",70,2022-04-21,0,volunteers’ vehicles were mounted with BSM-bro...
3,1912.09582,dataset,13,2022-04-21,0,for small datasets–a case with Dutch book revi...
4,1912.09582,dataset,15,2022-04-21,1,Table 4: Sentiment Analysis accuracy scores on...


In [27]:
# create data loader -- inspired by https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
class ArxivDataset(Dataset):

  def __init__(self, texts, labels, tokenizer, max_len):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.texts)
  
  def __getitem__(self, item):
    text = str(self.texts[item]).lower()
    label = self.labels[item]

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True, # Add '[CLS]' and '[SEP]'
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt', # Return PyTorch tensors
    )

    return {
      'texts': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_masks': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(label, dtype=torch.long)
    }

In [28]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=12)
df_train.head()

Unnamed: 0,id,pattern,token_count,update_date,label,para
99,2011.05411,dataset,89,2022-04-21,0,2. Local Computation: Once receiving the globa...
85,2011.00242,python,2,2022-04-21,0,PHP Python
31,2108.02756,dataset,140,2022-04-21,0,Inspired by the defense mechanism presented in...
14,2101.00522,dataset,104,2022-04-21,0,Table 3: The percentage of shift in pixel labe...
21,2104.09994,"publicly available, dataset",7,2022-04-21,0,Table 1 Public IoT network datasets.


In [29]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = ArxivDataset(
    texts=df.para.to_numpy(),
    labels=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

In [30]:
# load the tokenizer
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')  # local

In [31]:
MAX_LEN = 512
BATCH_SIZE = 2

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)

In [32]:
data = next(iter(train_data_loader))
data.keys()

dict_keys(['texts', 'input_ids', 'attention_masks', 'labels'])

In [33]:
print(data['input_ids'].shape)
print(data['attention_masks'].shape)
print(data['labels'].shape)

torch.Size([2, 512])
torch.Size([2, 512])
torch.Size([2])


In [34]:
data

{'texts': ['2. local computation: once receiving the global ml model from the server, the participants updates its current local ml model and then trains the updated model using the local dataset resided in the device. this step is operated at local nodes, and it requires end-users’ devices to install an fl client program to perform training algorithms such as federatedsgd and federated averaging, as well as to receive the global model updates and send the local ml model parameters from/to the server.',
  'php python'],
 'input_ids': tensor([[  102,   170,   205,  ...,     0,     0,     0],
         [  102,   375, 30121,  ...,     0,     0,     0]]),
 'attention_masks': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 0])}

## Build Model

In [47]:
PRE_TRAINED_MODEL_NAME = 'allenai/scibert_scivocab_uncased'

In [52]:
class ArxivClassifier(nn.Module):
    def __init__(self, n_classes):
        super(ArxivClassifier, self).__init__()
        self.encoder = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

        self.dense_1 = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.dense_2 = torch.nn.Linear(768, n_classes)

    def forward(self, input_ids, attention_masks):
        output_1 = self.encoder(input_ids=input_ids, attention_mask=attention_masks)
        hidden_state = output_1[0]
        pooled_output = hidden_state[:, 0]
        pooled_output = self.dense_1(pooled_output)
        pooled_output = torch.nn.ReLU()(pooled_output)
        pooled_output = self.dropout(pooled_output)
        output = self.dense_2(pooled_output)
        return output

In [53]:
model = ArxivClassifier(4)
model = model.to(DEVICE)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [57]:
input_ids = data['input_ids'].to(DEVICE)
attention_masks = data['attention_masks'].to(DEVICE)

In [58]:
F.softmax(model(input_ids, attention_masks), dim=1)

tensor([[0.2160, 0.1752, 0.2717, 0.3371],
        [0.1596, 0.1964, 0.2892, 0.3548]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)

# Scratch

In [None]:
text = "So we can solve the dual comparison problem (18) using any eﬃcient SVM solver, such as libsvm (Chang & Lin 2011). We used the R interface in the kernlab package (Karatzoglou et al. 2004), and our code is available in the rankSVMcompare package on Github."

print(text.lower())

In [None]:
# tokenizer = BertTokenizer.from_pretrained('/home/tvhahn/scibert_scivocab_uncased') # hpc
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')  # local

In [None]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print(token_ids)

In [None]:
encoding = tokenizer.encode_plus(
  text,
  max_length=512,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',
  return_attention_mask=True,
  truncation=True,
  return_tensors='pt',  # Return PyTorch tensors
)

In [None]:
print(len(encoding['input_ids'][0]))
# encoding['input_ids'][0]

In [None]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
from transformers import AutoModel

# for hpc (need to manually download model)
# model = AutoModel.from_pretrained('/home/tvhahn/scibert_scivocab_uncased')

# for local computer
# model = AutoModel.from_pretrained('/home/tvhahn/scibert_scivocab_uncased')

In [None]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

In [None]:
# remove stop words, etc
stop = stopwords.words('english')
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if not word in stop]

# text = text.lower()
# text = text.apply(lambda x: x.split(' '))
# text = text.apply(lambda x: [item for item in x if item not in stop])
# text = text.apply(lambda x: ' '.join(x))
# text = text.apply(lambda x: re.sub('[^A-Za-z\s]+', ' ', x))
# text = text.apply(lambda x: re.sub('\n', ' ', x))
# text = text.apply(lambda x: re.sub(r'\s+', ' ', x))
# text = text.apply(lambda x: re.sub(r'^\s', '', x))
# text = text.apply(lambda x: re.sub(r'\s$', '', x))

In [None]:
filtered_sentence = (" ").join(tokens_without_sw)
filtered_sentence

In [None]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

In [None]:
text_tokens = tokenizer.batch_encode_plus(text, pad_to_max_length=True, max_length=512, return_tensors='pt')

In [None]:
text_tokens['input_ids'].shape

In [None]:
text_tokens