In [31]:
import torch
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pathlib import Path
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch import nn, optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from torchmetrics import PrecisionRecallCurve
from transformers import AutoModel
import pickle

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# Report the CUDA environment and pick the device used by the rest of the notebook.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("device count:", torch.cuda.device_count())
# get_device_name(0) raises when no CUDA device is usable, which would abort
# the cell before the CPU fallback below is reached, so only query it on GPU.
print("device name:", torch.cuda.get_device_name(0) if cuda_available else "n/a (CPU only)")

DEVICE = torch.device("cuda:0" if cuda_available else "cpu")
DEVICE

CUDA available: True
device count: 1
device name: NVIDIA GeForce GTX 980M


device(type='cuda', index=0)

https://www.kaggle.com/code/colinlagator/pytorch-bert-multi-label/notebook

https://discuss.huggingface.co/t/download-models-for-local-loading/1963

Much of this notebook is adapted from: https://colab.research.google.com/drive/1PHv-IRLPCtv7oTcIGbsgZHqrB5LPvB7S#scrollTo=-FWG7kBm372V

In [3]:
# data loaders, etc
class ArxivDataset(Dataset):
  """Map-style dataset that tokenizes one stored text per item.

  Every item is a dict with the lower-cased text that was tokenized
  (``texts``), the un-lowered string form of the stored text
  (``texts_orig``), the flattened ``input_ids`` and ``attention_masks``
  returned by the tokenizer, and the label as a ``torch.long`` tensor.
  """

  def __init__(self, texts, labels, tokenizer, max_len):
    # texts and labels are both indexed with the same position in __getitem__.
    self.texts, self.labels = texts, labels
    # tokenizer must provide encode_plus; max_len is forwarded as max_length.
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    # The dataset has one item per stored text.
    return len(self.texts)

  def __getitem__(self, item):
    raw_text = str(self.texts[item])
    lowered_text = raw_text.lower()
    target = self.labels[item]

    # Pad/truncate to max_len and ask for PyTorch tensors plus an attention mask.
    encode_options = dict(
      add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',  # Return PyTorch tensors
    )
    encoded = self.tokenizer.encode_plus(lowered_text, **encode_options)

    sample = {
      'texts': lowered_text,
      'texts_orig': raw_text,
      'input_ids': encoded['input_ids'].flatten(),
      'attention_masks': encoded['attention_mask'].flatten(),
      'labels': torch.tensor(target, dtype=torch.long),
    }
    return sample

def create_data_loader(df, tokenizer, max_len, batch_size, num_workers=4, shuffle=False):
  """Build a DataLoader over the ``para`` / ``label`` columns of ``df``.

  Args:
    df: DataFrame with a ``para`` column (texts) and a ``label`` column.
    tokenizer: tokenizer handed to ArxivDataset.
    max_len: forwarded to ArxivDataset as ``max_len``.
    batch_size: number of items per batch.
    num_workers: DataLoader worker processes. Defaults to 4, the value that
      used to be hard-coded; pass 0 to load in the main process.
    shuffle: whether the DataLoader reshuffles each epoch. Defaults to False,
      which keeps batches in DataFrame row order as before.

  Returns:
    A torch DataLoader yielding the dict batches produced by ArxivDataset.
  """
  ds = ArxivDataset(
    texts=df.para.to_numpy(),
    labels=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )


In [None]:
# load tokenizer
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

# for hpc (need to manually download model)
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

# Send the model to the selected device and put it in inference mode explicitly,
# since it is only used for feature extraction here. Assigning the result
# (instead of ending the cell on the bare call) also keeps the cell from
# printing the entire module tree as its output.
model = model.to(DEVICE).eval()

In [None]:
# Resolve the project data directories relative to the current working directory.
path_data_folder = Path.cwd().parent.parent / "data"
path_interim_folder = path_data_folder / "interim"
path_label_folder = path_data_folder.joinpath("processed", "labels", "labels_complete")

# Read labels.csv, keeping the "id" column as strings.
labels_csv = path_label_folder / "labels.csv"
df = pd.read_csv(labels_csv, dtype={"id": str})

# Lower-case the paragraph text.
df["para"] = df["para"].str.lower()

# Binary labels: 1 where the original label is positive, otherwise 0.
df["label"] = (df["label"] > 0).astype("int64")

print(df.shape)
print(df["label"].unique())


# loop
# Embed every paragraph with the encoder and cache the result as dfh.pkl.
EMBED_MAX_LEN = 512    # token length each paragraph is padded/truncated to
EMBED_BATCH_SIZE = 20  # paragraphs per forward pass; also scales the progress print

train_data_loader = create_data_loader(df, tokenizer, EMBED_MAX_LEN, EMBED_BATCH_SIZE)

dfh_list = []
for i, data in enumerate(train_data_loader):

  labels = data['labels']
  with torch.no_grad():  # inference only, so no autograd graph is needed
    last_hidden_states = model(
      data["input_ids"].to(DEVICE),
      attention_mask=data["attention_masks"].to(DEVICE),
    )
  # Take the hidden state at sequence position 0 of the first model output,
  # one row per paragraph.
  # from https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
  features = last_hidden_states[0][:,0,:].cpu().numpy()

  df_h = pd.DataFrame(labels.numpy(), columns=["label"])
  df_h["para"] = data["texts_orig"]
  # Store each embedding as its own float64 array inside a single column.
  df_h["h"] = features.tolist()
  df_h['h'] = df_h['h'].apply(lambda x: np.array(x))
  dfh_list.append(df_h)

  if i % 5 == 0:
    print(i * EMBED_BATCH_SIZE)  # number of paragraphs processed before this batch

# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating every batch's own 0..batch_size-1 row labels.
dfh = pd.concat(dfh_list, ignore_index=True)
# save dfh as a pickle file (create the target folder first if it is missing)
path_interim_folder.mkdir(parents=True, exist_ok=True)
with open(path_interim_folder / "dfh.pkl", "wb") as f:
  pickle.dump(dfh, f)




In [None]:
# load dfh.pickle
with open(path_interim_folder / "dfh.pkl", "rb") as f:
  dfh = pickle.load(f)
dfh.head()

## Load dfh
Load the pickled `dfh` DataFrame with pandas.

In [35]:
path_data_dir = Path().cwd().parent.parent / "data"
embeddings_dir = path_data_dir / "processed/embeddings"

# load dfh.pickle
with open(embeddings_dir / "df_embeddings.pkl", "rb") as f:
    df = pickle.load(f)

df.head()

Unnamed: 0,label,id,para,h
0,0,1710.02907,"experiment 2: in this set of experiments, we e...","[-0.7371358871459961, -1.4070982933044434, -0...."
1,0,1811.11012,this section of the technical report is focuse...,"[-0.3564741313457489, 0.018136806786060333, -0..."
2,0,1811.11012,volunteers’ vehicles were mounted with bsm-bro...,"[-0.7548128366470337, -0.35174882411956787, -0..."
3,0,1912.09582,for small datasets–a case with dutch book revi...,"[-1.4487942457199097, -0.013197386637330055, 0..."
4,1,1912.09582,table 4: sentiment analysis accuracy scores on...,"[-0.8141533136367798, 0.016403447836637497, -0..."


In [39]:
df[df["label"] == 1].iloc[9]["para"]

'the case study used for testing the methodology was chosen from the commercial reference buildings database [26] of the us department of energy (doe). a secondary school located in san francisco (california) and constructed after the year of 1980 was selected. data about the energy load demands (whose hourly values are shown in figure 4) were calculated by means of energyplus simulation software [27] and then imported and processed in matlab. hourly temperatures of the typical meteorological year of san francisco, which are shown in figure 5, were considered.'

In [30]:
x = np.array([i for i in dfh["h"].values])
x.dtype

dtype('float64')

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [23]:
x = np.array([i for i in dfh["h"].values])
y = dfh['label'].values

# split into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [25]:
gnb_clf = GaussianNB()
gnb_clf.fit(x_train, y_train)

In [26]:
gnb_clf.score(x_test, y_test)

0.8316326530612245

In [24]:
# With the default iteration budget the solver stopped before converging on
# these embeddings (the cell emitted "TOTAL NO. of ITERATIONS REACHED LIMIT"),
# so give it more iterations. If the warning persists, raise max_iter further
# or standardize the features first.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# train a naive bayes classifier on the training split
# y_train is deliberately left as the label half of the train/test split:
# rebinding it to every label in dfh makes its length disagree with x_train
# (fit then fails) and corrupts y_train for any cell that is re-run afterwards.
gnb = GaussianNB()

gnb.fit(x_train, y_train)


In [None]:
# predict
# Stack every stored embedding into one array. It is bound to its own name so
# the held-out x_test produced by train_test_split is not silently replaced by
# the full data set (which would no longer line up with y_test).
x_all = np.array([i for i in dfh["h"].values])
x_all.shape



In [7]:
df_embeddings = pd.read_pickle(embeddings_dir / "dfh.pkl")
df_embeddings.head()

Unnamed: 0,label,para,h
0,0,"experiment 2: in this set of experiments, we e...","[-0.7371358871459961, -1.4070982933044434, -0...."
1,0,this section of the technical report is focuse...,"[-0.3564741313457489, 0.018136806786060333, -0..."
2,0,volunteers’ vehicles were mounted with bsm-bro...,"[-0.7548128366470337, -0.35174882411956787, -0..."
3,0,for small datasets–a case with dutch book revi...,"[-1.4487942457199097, -0.013197386637330055, 0..."
4,1,table 4: sentiment analysis accuracy scores on...,"[-0.8141533136367798, 0.016403447836637497, -0..."


In [13]:
type(df_embeddings["h"].values[0][0])

numpy.float64

In [None]:
l = np.array([i for i in a])
l.shape

In [None]:
l = []
for i in a:
    l.append(int(i.shape[0]))

# only keep unique values in l
l = list(set(l))
l

In [None]:
a = np.reshape(a, (-1, a.shape[0]))
a.shape

In [None]:
a = a.reshape(-1, a.shape[-1])
a.shape

In [None]:
a[0].shape

In [None]:
dfh["h"].to_numpy()[0].shape

In [None]:
a = df_h['h'].to_numpy()

In [None]:
np.min(a[0])

# Test creation of embeddings

In [None]:
# Project data directories, resolved from the current working directory.
path_data_folder = Path.cwd().parent.parent / "data"
path_interim_folder = path_data_folder / "interim"
path_label_folder = path_data_folder.joinpath("processed", "labels", "labels_complete")

# Load labels.csv from path_label_folder; "id" is read as a string column.
df = pd.read_csv(path_label_folder.joinpath("labels.csv"), dtype={"id": str})

# Lower-case the "para" column.
df["para"] = df["para"].str.lower()

# Binary labels: positive values become 1, everything else 0.
is_positive = df["label"] > 0
df["label"] = is_positive.astype("int64")

print(df.shape)
print(df["label"].unique())
df.head()

In [None]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

In [None]:
class ArxivDataset(Dataset):
  """Dataset that tokenizes a stored text on access.

  Items are dicts with keys ``texts`` (lower-cased text fed to the
  tokenizer), ``texts_orig`` (string form of the stored text),
  ``input_ids``, ``attention_masks`` (both flattened tokenizer outputs)
  and ``labels`` (a ``torch.long`` tensor).
  """

  def __init__(self, texts, labels, tokenizer, max_len):
    self.texts = texts          # indexable collection of texts
    self.labels = labels        # indexable collection of labels, read at the same index
    self.tokenizer = tokenizer  # object providing encode_plus
    self.max_len = max_len      # forwarded to the tokenizer as max_length

  def __len__(self):
    # One item per stored text.
    return len(self.texts)

  def __getitem__(self, item):
    stored_text = self.texts[item]
    text = str(stored_text).lower()
    text_orig = str(stored_text)
    label = self.labels[item]

    # Tokenize the lower-cased text, padded/truncated to max_len, as PyTorch tensors.
    enc = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',  # Return PyTorch tensors
    )
    ids = enc['input_ids'].flatten()
    mask = enc['attention_mask'].flatten()

    return dict([
      ('texts', text),
      ('texts_orig', text_orig),
      ('input_ids', ids),
      ('attention_masks', mask),
      ('labels', torch.tensor(label, dtype=torch.long)),
    ])

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers=4, shuffle=False):
  """Build a DataLoader over the ``para`` / ``label`` columns of ``df``.

  Args:
    df: DataFrame with a ``para`` column (texts) and a ``label`` column.
    tokenizer: tokenizer handed to ArxivDataset.
    max_len: forwarded to ArxivDataset as ``max_len``.
    batch_size: number of items per batch.
    num_workers: DataLoader worker processes. Defaults to 4, the value that
      used to be hard-coded; pass 0 to load in the main process.
    shuffle: whether the DataLoader reshuffles each epoch. Defaults to False,
      which keeps batches in DataFrame row order as before.

  Returns:
    A torch DataLoader yielding the dict batches produced by ArxivDataset.
  """
  ds = ArxivDataset(
    texts=df.para.to_numpy(),
    labels=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
train_data_loader = create_data_loader(df, tokenizer, 512, 2)

data = next(iter(train_data_loader))
data.keys()

In [None]:
data['input_ids'].shape

In [None]:
# for hpc (need to manually download model)
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

# Send the model to the selected device and put it in inference mode explicitly.
# Assigning the result (instead of ending the cell on the bare call) also keeps
# the cell from printing the entire module tree as its output.
model = model.to(DEVICE).eval()

In [None]:
# put h into a dataframe with the labels
# NOTE(review): `h` is not assigned by any cell in the visible notebook, and
# `labels` is only bound inside the earlier embedding-extraction loop (so it
# would hold that loop's last batch, not the labels of the current `data`
# batch). This cell presumably relied on leftover kernel state — confirm
# before running it on a fresh kernel.
df_h = pd.DataFrame(labels, columns=["label"])
df_h["para"] = data["texts_orig"]
# Convert each row of h to a Python list, then back to one numpy array per row.
df_h["h"] = h.tolist()
df_h['h'] = df_h['h'].apply(lambda x: np.array(x))

In [None]:
df_h

### Put it all together


In [None]:
# Locate the data folders two levels above the current working directory.
path_data_folder = Path.cwd().parent.parent / "data"
path_interim_folder = path_data_folder / "interim"
path_label_folder = path_data_folder.joinpath("processed", "labels", "labels_complete")

# Load labels.csv from path_label_folder with "id" parsed as a string.
df = pd.read_csv(path_label_folder / "labels.csv", dtype={"id": str})

# Lower-case the "para" column and reduce the labels to binary
# (1 for a positive original label, 0 otherwise).
df["para"] = df["para"].str.lower()
df["label"] = df["label"].gt(0).astype("int64")

print(df.shape)
print(df["label"].unique())

In [None]:
df["para"].values

In [None]:
tokens = df["para"].apply((lambda x: tokenizer.encode(
    x,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    max_length=512,
    # return_token_type_ids=False,
    padding='max_length',
    return_attention_mask=True,
    truncation=True,
    )))


In [None]:
from transformers import AutoModel

# for hpc (need to manually download model)
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')


In [None]:
# Sequence length and batch size passed to create_data_loader below.
MAX_LEN = 512
BATCH_SIZE = 2

# NOTE(review): df_train is first assigned by the train_test_split cell further
# down in this notebook, so on a fresh top-to-bottom run this line would hit
# an undefined name. An identical cell appears after that split — confirm the
# intended cell order.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# create data loader -- inspired by https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
class ArxivDataset(Dataset):
  """Dataset yielding tokenized, lower-cased texts with their labels.

  Each item is a dict with ``texts`` (the lower-cased text), ``input_ids``
  and ``attention_mask`` (flattened tokenizer outputs) and ``labels``
  (a ``torch.long`` tensor).
  """

  def __init__(self, texts, labels, tokenizer, max_len):
    # texts and labels are read at the same index in __getitem__.
    self.texts, self.labels = texts, labels
    # tokenizer must provide encode_plus; max_len is forwarded as max_length.
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    # One item per stored text.
    return len(self.texts)

  def __getitem__(self, item):
    lowered = str(self.texts[item]).lower()
    target = self.labels[item]

    # Pad/truncate to max_len; request an attention mask and PyTorch tensors.
    tokenizer_options = {
      'add_special_tokens': True,  # Add '[CLS]' and '[SEP]'
      'max_length': self.max_len,
      'return_token_type_ids': False,
      'padding': 'max_length',
      'return_attention_mask': True,
      'truncation': True,
      'return_tensors': 'pt',  # Return PyTorch tensors
    }
    encoded = self.tokenizer.encode_plus(lowered, **tokenizer_options)

    return {
      'texts': lowered,
      'input_ids': encoded['input_ids'].flatten(),
      'attention_mask': encoded['attention_mask'].flatten(),
      'labels': torch.tensor(target, dtype=torch.long),
    }

In [None]:
df_train, df_val = train_test_split(df, test_size=0.1, random_state=12)
df_train.head()

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers=4, shuffle=False):
  """Build a DataLoader over the ``para`` / ``label`` columns of ``df``.

  Args:
    df: DataFrame with a ``para`` column (texts) and a ``label`` column.
    tokenizer: tokenizer handed to ArxivDataset.
    max_len: forwarded to ArxivDataset as ``max_len``.
    batch_size: number of items per batch.
    num_workers: DataLoader worker processes. Defaults to 4, the value that
      used to be hard-coded; pass 0 to load in the main process.
    shuffle: whether the DataLoader reshuffles each epoch. Defaults to False,
      which keeps batches in DataFrame row order as before.

  Returns:
    A torch DataLoader yielding the dict batches produced by ArxivDataset.
  """
  ds = ArxivDataset(
    texts=df.para.to_numpy(),
    labels=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# load the tokenizer
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')  # local

In [None]:
MAX_LEN = 512
BATCH_SIZE = 2

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
data = next(iter(train_data_loader))
data.keys()

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['labels'].shape)

In [None]:
data

## Build Model

In [None]:
PRE_TRAINED_MODEL_NAME = 'allenai/scibert_scivocab_uncased'

MAX_LEN = 512
BATCH_SIZE = 4

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
class ArxivClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a two-layer classification head.

    The head (linear -> ReLU -> dropout -> linear) is applied to the encoder's
    hidden state at sequence position 0.

    Args:
        n_classes: number of output logits.
        pre_trained_model_name: name or path given to
            ``BertModel.from_pretrained``. Defaults to the SciBERT checkpoint
            used throughout this notebook, so ``ArxivClassifier(n)`` works.
    """

    def __init__(self, n_classes, pre_trained_model_name='allenai/scibert_scivocab_uncased'):
        super().__init__()
        self.encoder = BertModel.from_pretrained(pre_trained_model_name)

        # Size the head from the loaded encoder rather than assuming 768, so a
        # checkpoint with a different hidden width still works.
        hidden_size = self.encoder.config.hidden_size

        # Attribute names are kept as-is so previously saved state dicts still load.
        self.dense_1 = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.3)
        self.dense_2 = torch.nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class logits for a batch.

        Args:
            input_ids: token ids, forwarded to the encoder.
            attention_mask: attention mask, forwarded to the encoder.

        Returns:
            The output of ``dense_2``: ``n_classes`` logits per input row.
        """
        encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_output[0]
        # Keep only the hidden state at sequence position 0.
        pooled_output = hidden_state[:, 0]
        # Functional ReLU avoids constructing a new ReLU module on every call.
        pooled_output = torch.relu(self.dense_1(pooled_output))
        pooled_output = self.dropout(pooled_output)
        return self.dense_2(pooled_output)

In [None]:
# ArxivClassifier's second constructor argument is the checkpoint name; pass the
# one configured in PRE_TRAINED_MODEL_NAME explicitly.
# NOTE(review): the labels are binarised when they are loaded, yet 4 logits are
# requested here — confirm that n_classes=4 is intended.
model = ArxivClassifier(n_classes=4, pre_trained_model_name=PRE_TRAINED_MODEL_NAME)
model = model.to(DEVICE)

In [None]:
# load best_model_state.bin pytorch model
# map_location remaps the saved tensors onto DEVICE, so a checkpoint written on
# a GPU machine can still be restored when only the CPU is available.
state_dict = torch.load("best_model_state.bin", map_location=DEVICE)
model.load_state_dict(state_dict)
model = model.to(DEVICE)

In [None]:
data = next(iter(train_data_loader))
input_ids = data['input_ids'].to(DEVICE)
attention_mask = data['attention_mask'].to(DEVICE)

In [None]:
# Inference only: eval() switches dropout off so the probabilities are
# deterministic, and no_grad() avoids building an autograd graph for them.
model.eval()
with torch.no_grad():
  pred = F.softmax(model(input_ids, attention_mask), dim=1)
labels = data['labels']

In [None]:
# try pr-auc curve with torchmetrics
# https://torchmetrics.readthedocs.io/en/v0.8.2/classification/precision_recall_curve.html
# https://torchmetrics.readthedocs.io/en/v0.8.2/classification/binned_precision_recall_curve.html

pr_curve = PrecisionRecallCurve(num_classes=4)
precision, recall, thresholds = pr_curve(pred.cpu(), labels)

In [None]:
recall

In [None]:
# Number of full passes over the training data.
EPOCHS = 10

# NOTE(review): AdamW here is the class imported from `transformers` at the top
# of the notebook; newer transformers releases are understood to deprecate it
# in favour of torch.optim.AdamW — confirm against the installed version.
# `correct_bias=False` is an argument of the transformers implementation.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# train_epoch steps the scheduler once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# Multi-class cross-entropy on the raw logits returned by the model.
loss_fn = nn.CrossEntropyLoss().to(DEVICE)

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one training pass over ``data_loader``, updating ``model`` in place.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores.
    data_loader: iterable of dict batches with ``input_ids``,
      ``attention_mask`` and ``labels`` tensors.
    loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a double tensor equal to the number
    of correct predictions divided by ``n_examples``; mean_loss is the mean of
    the per-batch losses, or NaN when the loader yields no batches.
  """
  model = model.train()

  losses = []
  # Keep the running count as a tensor from the start so the final .double()
  # also works when the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    labels = d["labels"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    # Predicted class = index of the largest score in each row.
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)

    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())

    loss.backward()
    # Clip the global gradient norm to 1.0 before the optimizer step.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  # np.mean([]) would warn; return NaN directly for an empty loader.
  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores.
    data_loader: iterable of dict batches with ``input_ids``,
      ``attention_mask`` and ``labels`` tensors.
    loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a double tensor equal to the number
    of correct predictions divided by ``n_examples``; mean_loss is the mean of
    the per-batch losses, or NaN when the loader yields no batches.
  """
  model = model.eval()

  losses = []
  # Keep the running count as a tensor from the start so the final .double()
  # also works when the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      labels = d["labels"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      # Predicted class = index of the largest score in each row.
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, labels)

      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())

  # np.mean([]) would warn; return NaN directly for an empty loader.
  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
%%time

# Train for EPOCHS epochs, logging metrics per epoch and checkpointing the
# weights whenever the validation accuracy improves.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training data.
  train_acc, train_loss = train_epoch(
    model=model,
    data_loader=train_data_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=DEVICE,
    scheduler=scheduler,
    n_examples=len(df_train),
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the updated model on the validation data.
  val_acc, val_loss = eval_model(
    model=model,
    data_loader=val_data_loader,
    loss_fn=loss_fn,
    device=DEVICE,
    n_examples=len(df_val),
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  # Record this epoch's metrics under their history keys.
  epoch_metrics = (
    ('train_acc', train_acc),
    ('train_loss', train_loss),
    ('val_acc', val_acc),
    ('val_loss', val_loss),
  )
  for metric_name, metric_value in epoch_metrics:
    history[metric_name].append(metric_value)

  # Keep only the weights of the best-scoring epoch so far.
  improved = val_acc > best_accuracy
  if improved:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
torch.tensor(history['train_acc'])

In [None]:
plt.plot(torch.tensor(history['train_acc']), label='train accuracy')
plt.plot(torch.tensor(history['val_acc']), label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

# Scratch

In [None]:
text = "So we can solve the dual comparison problem (18) using any eﬃcient SVM solver, such as libsvm (Chang & Lin 2011). We used the R interface in the kernlab package (Karatzoglou et al. 2004), and our code is available in the rankSVMcompare package on Github."

print(text.lower())

In [None]:
# tokenizer = BertTokenizer.from_pretrained('/home/tvhahn/scibert_scivocab_uncased') # hpc
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')  # local

In [None]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
print(token_ids)

In [None]:
encoding = tokenizer.encode_plus(
  text,
  max_length=512,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',
  return_attention_mask=True,
  truncation=True,
  return_tensors='pt',  # Return PyTorch tensors
)

In [None]:
print(len(encoding['input_ids'][0]))
# encoding['input_ids'][0]

In [None]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
from transformers import AutoModel

# for hpc (need to manually download model, then point at the local copy)
# model = AutoModel.from_pretrained('/home/tvhahn/scibert_scivocab_uncased')

# for local computer: load by hub name, as the other cells in this notebook do,
# instead of depending on one user's home directory
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

In [None]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

In [None]:
# remove stop words, etc
# NLTK's English stop-word list is lower-case, so compare case-insensitively;
# otherwise capitalised stop words such as "So" or "We" are kept.
# A set makes each membership test O(1) instead of scanning the list.
stop = set(stopwords.words('english'))
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if word.lower() not in stop]

# text = text.lower()
# text = text.apply(lambda x: x.split(' '))
# text = text.apply(lambda x: [item for item in x if item not in stop])
# text = text.apply(lambda x: ' '.join(x))
# text = text.apply(lambda x: re.sub('[^A-Za-z\s]+', ' ', x))
# text = text.apply(lambda x: re.sub('\n', ' ', x))
# text = text.apply(lambda x: re.sub(r'\s+', ' ', x))
# text = text.apply(lambda x: re.sub(r'^\s', '', x))
# text = text.apply(lambda x: re.sub(r'\s$', '', x))

In [None]:
filtered_sentence = (" ").join(tokens_without_sw)
filtered_sentence

In [None]:
tokenized_text = tokenizer.tokenize(text)
print(len(tokenized_text))
print(tokenized_text)

In [None]:
# batch_encode_plus expects a list of texts, so wrap the single string; a bare
# str would presumably be iterated as a batch of one-character inputs.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True keeps inputs longer than max_length from exceeding it.
text_tokens = tokenizer.batch_encode_plus(
  [text],
  padding='max_length',
  truncation=True,
  max_length=512,
  return_tensors='pt',
)

In [None]:
text_tokens['input_ids'].shape

In [None]:
text_tokens