In [8]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pathlib import Path
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
print("CUDA available:", torch.cuda.is_available())
print("device count:", torch.cuda.device_count())
print("device name:", torch.cuda.get_device_name(0))

CUDA available: True
device count: 1
device name: NVIDIA GeForce GTX 980M


https://www.kaggle.com/code/colinlagator/pytorch-bert-multi-label/notebook

https://discuss.huggingface.co/t/download-models-for-local-loading/1963

In [14]:
path_data_folder = Path().cwd().parent.parent / "data"
path_interim_folder = path_data_folder / "interim"
path_label_folder = path_data_folder / "processed" / "labels" / "labels_complete"

# load the labels.csv from the path_label_folder
df = pd.read_csv(path_label_folder / "labels.csv", dtype={"id": str})
df.head()

Unnamed: 0,id,pattern,token_count,update_date,label,para
0,1710.02907,"data, dataset",280,2022-04-21,0,"Experiment 2: In this set of experiments, we e..."
1,1811.11012,data,195,2022-04-21,0,This section of the technical report is focuse...
2,1811.11012,"data, dataset",70,2022-04-21,0,volunteers’ vehicles were mounted with BSM-bro...
3,1912.09582,dataset,13,2022-04-21,0,for small datasets–a case with Dutch book revi...
4,1912.09582,dataset,15,2022-04-21,1,Table 4: Sentiment Analysis accuracy scores on...


# Scratch

In [6]:
text = "So we can solve the dual comparison problem (18) using any eﬃcient SVM solver, such as libsvm (Chang & Lin 2011). We used the R interface in the kernlab package (Karatzoglou et al. 2004), and our code is available in the rankSVMcompare package on Github."

print(text.lower())

so we can solve the dual comparison problem (18) using any eﬃcient svm solver, such as libsvm (chang & lin 2011). we used the r interface in the kernlab package (karatzoglou et al. 2004), and our code is available in the ranksvmcompare package on github.


In [7]:
# tokenizer = BertTokenizer.from_pretrained('/home/tvhahn/scibert_scivocab_uncased') # hpc
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')  # local

In [5]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print(token_ids)

66
['so', 'we', 'can', 'solve', 'the', 'dual', 'comparison', 'problem', '(', '18', ')', 'using', 'any', '[UNK]', 'svm', 'solver', ',', 'such', 'as', 'lib', '##svm', '(', 'chang', '&', 'lin', '2011', ')', '.', 'we', 'used', 'the', 'r', 'interface', 'in', 'the', 'kern', '##lab', 'package', '(', 'kar', '##atz', '##og', '##lo', '##u', 'et', 'al', '.', '2004', ')', ',', 'and', 'our', 'code', 'is', 'available', 'in', 'the', 'ranks', '##vm', '##compare', 'package', 'on', 'gi', '##th', '##ub', '.']
[564, 185, 300, 5191, 111, 4793, 2029, 1167, 145, 1178, 546, 487, 843, 101, 11422, 14699, 422, 555, 188, 8147, 22228, 145, 1044, 894, 3158, 5228, 546, 205, 185, 501, 111, 182, 3396, 121, 111, 5092, 4253, 7526, 145, 7402, 8665, 247, 609, 30120, 365, 186, 205, 6706, 546, 422, 137, 580, 2737, 165, 1427, 121, 111, 18949, 12986, 26513, 7526, 191, 4706, 266, 284, 205]


In [6]:
encoding = tokenizer.encode_plus(
  text,
  max_length=512,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',
  return_attention_mask=True,
  truncation=True,
  return_tensors='pt',  # Return PyTorch tensors
)

In [7]:
print(len(encoding['input_ids'][0]))
# encoding['input_ids'][0]

512


In [8]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]',
 'so',
 'we',
 'can',
 'solve',
 'the',
 'dual',
 'comparison',
 'problem',
 '(',
 '18',
 ')',
 'using',
 'any',
 '[UNK]',
 'svm',
 'solver',
 ',',
 'such',
 'as',
 'lib',
 '##svm',
 '(',
 'chang',
 '&',
 'lin',
 '2011',
 ')',
 '.',
 'we',
 'used',
 'the',
 'r',
 'interface',
 'in',
 'the',
 'kern',
 '##lab',
 'package',
 '(',
 'kar',
 '##atz',
 '##og',
 '##lo',
 '##u',
 'et',
 'al',
 '.',
 '2004',
 ')',
 ',',
 'and',
 'our',
 'code',
 'is',
 'available',
 'in',
 'the',
 'ranks',
 '##vm',
 '##compare',
 'package',
 'on',
 'gi',
 '##th',
 '##ub',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '

In [7]:
from transformers import AutoModel

# for hpc (need to manually download model)
# model = AutoModel.from_pretrained('/home/tvhahn/scibert_scivocab_uncased')

# for local computer
# model = AutoModel.from_pretrained('/home/tvhahn/scibert_scivocab_uncased')

In [15]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

66
['so', 'we', 'can', 'solve', 'the', 'dual', 'comparison', 'problem', '(', '18', ')', 'using', 'any', '[UNK]', 'svm', 'solver', ',', 'such', 'as', 'lib', '##svm', '(', 'chang', '&', 'lin', '2011', ')', '.', 'we', 'used', 'the', 'r', 'interface', 'in', 'the', 'kern', '##lab', 'package', '(', 'kar', '##atz', '##og', '##lo', '##u', 'et', 'al', '.', '2004', ')', ',', 'and', 'our', 'code', 'is', 'available', 'in', 'the', 'ranks', '##vm', '##compare', 'package', 'on', 'gi', '##th', '##ub', '.']


In [30]:
# remove stop words, etc
stop = stopwords.words('english')
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if not word in stop]

# text = text.lower()
# text = text.apply(lambda x: x.split(' '))
# text = text.apply(lambda x: [item for item in x if item not in stop])
# text = text.apply(lambda x: ' '.join(x))
# text = text.apply(lambda x: re.sub('[^A-Za-z\s]+', ' ', x))
# text = text.apply(lambda x: re.sub('\n', ' ', x))
# text = text.apply(lambda x: re.sub(r'\s+', ' ', x))
# text = text.apply(lambda x: re.sub(r'^\s', '', x))
# text = text.apply(lambda x: re.sub(r'\s$', '', x))

In [32]:
filtered_sentence = (" ").join(tokens_without_sw)
filtered_sentence

'original study , authors reported results grid-independence study justify spatial temporal grid resolutions used parametric study . compared force coefficients , profiles velocity components , profiles fluctuating kinetic energy , distances vortical structures near wake , obtained different grid resolutions . , also report results grid-independence study moving results parametric study . use domain size original study : 30c × 25c × 25c ( c chord length wing ) . root wing ( around plate undergoes rolling/pitching motion ) located center computational domain . keep spatial grid uniform ( highest resolution ) sub-area domain covers motion wing . outside area , also add extra uniform layer grid-spacing size ∆x = 0.05c , sub-domain [ −2c , 6c ] × [ −3c , 3c ] × [ −1c , 2c ] , covers near-wake region . ( opted smooth transition two uniform regions , grid-cell widths stretched constant ratio 1.1 directions , except streamwise direction behind wing used ratio 1.03 . ) finally , grid-cell widt

In [20]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

545
['in', 'the', 'original', 'study', ',', 'the', 'authors', 'reported', 'the', 'results', 'of', 'a', 'grid', '-', 'independence', 'study', 'to', 'justify', 'the', 'spatial', 'and', 'temporal', 'grid', 'resolutions', 'used', 'for', 'the', 'parametric', 'study', '.', 'they', 'compared', 'force', 'coefficients', ',', 'profiles', 'of', 'the', 'velocity', 'components', ',', 'profiles', 'of', 'the', 'fluctuating', 'kinetic', 'energy', ',', 'and', 'distances', 'between', 'vor', '##tical', 'structures', 'in', 'the', 'near', 'wake', ',', 'obtained', 'with', 'different', 'grid', 'resolutions', '.', 'here', ',', 'we', 'also', 'report', 'the', 'results', 'of', 'our', 'grid', '-', 'independence', 'study', 'before', 'moving', 'on', 'to', 'the', 'results', 'of', 'the', 'parametric', 'study', '.', 'we', 'use', 'the', 'same', 'domain', 'size', 'as', 'in', 'the', 'original', 'study', ':', '30', '##c', '×', '25', '##c', '×', '25', '##c', '(', 'where', 'c', 'is', 'the', 'chord', 'length', 'of', 'the', '

In [12]:
text_tokens = tokenizer.batch_encode_plus(text, pad_to_max_length=True, max_length=512, return_tensors='pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
text_tokens['input_ids'].shape

torch.Size([2441, 512])

In [17]:
text_tokens

{'input_ids': tensor([[102, 259, 103,  ...,   0,   0,   0],
        [102, 146, 103,  ...,   0,   0,   0],
        [102, 103,   0,  ...,   0,   0,   0],
        ...,
        [102, 412, 103,  ...,   0,   0,   0],
        [102, 103,   0,  ...,   0,   0,   0],
        [102, 275, 103,  ...,   0,   0,   0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}