In [29]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import torch
import os
import numpy as np
from local_data_handler import *

### Load KB-Bert

In [30]:
# Path of the local checkpoint directory handed to from_pretrained() below.
LOCAL_MODEL_CKPT = "./kb-bert-base-swedish-cased"

In [31]:
# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Both the tokenizer and the encoder are read from the local checkpoint only
# (local_files_only=True); the encoder is then moved to the selected device.
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_CKPT, local_files_only=True)
model = AutoModel.from_pretrained(LOCAL_MODEL_CKPT, local_files_only=True)
model = model.to(device)

In [32]:
def tokenize(batch):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Args:
        batch: Mapping that holds the raw strings under the key ``"text"``.

    Returns:
        Whatever the tokenizer returns for those strings when called with
        padding and truncation both enabled.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

### Training Corpus Pre-Processing

In [33]:
# JSON file holding the training entries; read by load_dataset() below.
TRAINING_DATA = "data/json/training/dataset_1_training.json"

In [34]:
# Read the JSON training file and take its "train" split.
entries = load_dataset("json", data_files=TRAINING_DATA, split="train")
# batch_size=None passes the whole dataset to tokenize() as a single batch,
# so padding=True pads every entry to one common length.
entries_encoded = entries.map(tokenize, batched=True, batch_size=None)

In [35]:
def extract_CLS_embeds(batch):
    """Run the encoder on a batch and return the [CLS] vector of each sequence.

    Args:
        batch: Mapping of column name to tensor. Only the entries whose name
            appears in ``tokenizer.model_input_names`` are fed to the model;
            everything else (e.g. labels) is ignored.

    Returns:
        A dict with the single key ``"CLS_embedding"`` holding a NumPy array
        with the position-0 slice of the model's last hidden state.
    """
    # Keep only the fields the model accepts and move them to the active
    # device (GPU or CPU, whichever `device` points at).
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Position 0 of every sequence is taken as the [CLS] representation and
    # copied back to host memory as NumPy.
    cls_vectors = outputs.last_hidden_state[:, 0]
    return {"CLS_embedding": cls_vectors.cpu().numpy()}

In [36]:
# Expose only the listed columns as torch tensors. The return value is not
# used, so this call is relied on to change entries_encoded in place.
entries_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# Embed every entry batch-wise. No batch_size is passed, so the library
# default applies — NOTE(review): pass a smaller batch_size if memory is tight.
entries_embedded = entries_encoded.map(extract_CLS_embeds, batched=True)

In [37]:
# Inspect which columns the embedded dataset carries after the map() call.
entries_embedded.column_names

['text',
 'label',
 'input_ids',
 'token_type_ids',
 'attention_mask',
 'CLS_embedding']

In [38]:
# Toggle for persisting the embedded dataset, and the target path on disk.
SAVE_LOCALLY = True
SAVE_LOCATION = "data/json/training/dataset_1_training_embeds.hf"

# Write the embedded dataset with the library's own save_to_disk() format.
if SAVE_LOCALLY:
    entries_embedded.save_to_disk(SAVE_LOCATION)

Saving the dataset (1/1 shards): 100%|██████████| 400/400 [00:00<00:00, 61419.01 examples/s]


### Training the Classifier

In [40]:
# Build the classifier inputs from the embedded dataset: one [CLS] vector per
# entry as features, the "label" column as targets. numpy is already imported
# as `np` in the notebook's first cell, so it is not re-imported here.
X_train = np.array(entries_embedded["CLS_embedding"])
y_train = np.array(entries_embedded["label"])

# Quick sanity check of the feature matrix dimensions.
X_train.shape

(400, 768)

In [49]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the frozen [CLS] embeddings.
# max_iter is raised above scikit-learn's default of 100 because the solver
# may stop before converging on unscaled 768-dimensional features (it would
# emit a ConvergenceWarning) — NOTE(review): adjust if training is too slow.
lr_clf = LogisticRegression(max_iter=3000)
lr_clf.fit(X_train, y_train)

In [50]:
# Sample input text used to try out the trained classifier.
TEST_STRING = "<b>Liedbeck</b> [lid-], Per Jakob, läkare och homöopat, född i Trosa d. 16 Juni 1802, död i Stockholm d. 5 Okt. 1876, blef student i Upsala 1821, med. licentiat 1828 och med"
# Label recorded for the sample; not referenced by the cells visible here.
TEST_LABEL = 0

In [51]:
# Encode the single test string. truncation=True mirrors tokenize(), so an
# over-long input is cut to the tokenizer's maximum length instead of being
# passed to the model at full length.
test_string_encoded = tokenizer(TEST_STRING, truncation=True, return_tensors="pt")

# Move the tensors to the model's device and run a gradient-free forward pass.
inputs = {k: v.to(device) for k, v in test_string_encoded.items()}
with torch.no_grad():
    outputs = model(**inputs)

# Take the position-0 ([CLS]) vector and convert it to a CPU NumPy array, the
# same form extract_CLS_embeds() produces for the training features, so the
# scikit-learn classifier can consume it even when the model runs on CUDA.
test_string_embedded = outputs.last_hidden_state[:, 0].cpu().numpy()

In [52]:
# Predict the class label for the embedded test string.
lr_clf.predict(test_string_embedded)

array([1], dtype=int64)