In [None]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch
import numpy as np
from util.local_data_handler import *

### Load KB-Bert

In [None]:
# Local directory holding the KB-BERT checkpoint; handed to `from_pretrained`
# for both the tokenizer and the model.
LOCAL_MODEL_CKPT = "./kb-bert-base-swedish-cased"

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and model are both read from the local checkpoint directory
# (local_files_only=True), then the model is moved onto the chosen device.
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_CKPT, local_files_only=True)
model = AutoModel.from_pretrained(LOCAL_MODEL_CKPT, local_files_only=True)
model = model.to(device)

### Corpus Pre-Processing

In [None]:
# Cache switch: when LOAD_LOCAL is truthy the embedded dataset is read from
# that path with `load_from_disk`; set it to None to recompute the embeddings.
# LOAD_LOCAL = None
LOAD_LOCAL = "data/json/training/ver1/dataset_1_embeds.hf"

# When SAVE_LOCATION is truthy the embedded dataset is written to that path.
SAVE_LOCATION = None
# NOTE(review): the path below says "datasets_1" while LOAD_LOCAL says
# "dataset_1" — confirm which spelling is intended before enabling it.
# SAVE_LOCATION = "data/json/training/ver1/datasets_1_embeds.hf"

In [None]:
def tokenize(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with padding and truncation enabled.

    Args:
        batch: Mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [None]:
def extract_CLS_embeds(batch):
    """Run the model on a batch and return the hidden state at sequence position 0.

    Only the entries of ``batch`` whose key appears in
    ``tokenizer.model_input_names`` are forwarded to the model; all other
    entries are ignored.

    Args:
        batch: Mapping from column name to a tensor-like object with ``.to()``.

    Returns:
        A dict with the single key ``"CLS_embed"`` holding
        ``last_hidden_state[:, 0]`` as a NumPy array.
    """
    accepted_keys = tokenizer.model_input_names
    model_inputs = {}
    for name, tensor in batch.items():
        if name in accepted_keys:
            # Inputs must sit on the same device as the model.
            model_inputs[name] = tensor.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(**model_inputs)

    # Position 0 of every sequence is the [CLS] token.
    cls_vectors = output.last_hidden_state[:, 0]
    return {"CLS_embed": cls_vectors.cpu().numpy()}

In [None]:
# JSON files loaded as the "train" and "validation" splits when the embeddings
# are recomputed instead of being read from the local cache.
TRAINING_DATA_JSON = "data/json/training/ver1/dataset_1_training.json"
VALIDATION_DATA_JSON = "data/json/training/ver1/dataset_1_validation.json"

In [None]:
if not LOAD_LOCAL:
    # No cached embeddings configured: build them from the raw JSON splits.
    data_files = {"train": TRAINING_DATA_JSON, "validation": VALIDATION_DATA_JSON}
    entries = load_dataset("json", data_files=data_files)

    # batch_size=None passes each split to `tokenize` as a single batch.
    entries_encoded = entries.map(tokenize, batched=True, batch_size=None)

    # Expose the listed columns as torch tensors before running the model.
    entries_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    entries_embedded = entries_encoded.map(extract_CLS_embeds, batched=True)
else:
    # Reuse the embedded dataset stored at LOAD_LOCAL.
    entries_embedded = load_from_disk(LOAD_LOCAL)

In [None]:
# Persist the embedded dataset only when a save path has been configured.
if SAVE_LOCATION:
    entries_embedded.save_to_disk(SAVE_LOCATION)

In [None]:
# Inspect which columns the embedded dataset exposes.
entries_embedded.column_names

### Train Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Pull the [CLS] feature vectors and the labels out of each split as NumPy arrays.
train_split = entries_embedded["train"]
val_split = entries_embedded["validation"]

X_train = np.array(train_split["CLS_embed"])
y_train = np.array(train_split["label"])

X_val = np.array(val_split["CLS_embed"])
y_val = np.array(val_split["label"])

# Cell output: shapes of the two feature matrices.
(X_train.shape, X_val.shape)

In [None]:
# Shuffled 5-fold splitter with a fixed seed; passed to LogisticRegressionCV
# as its `cv` argument.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# Alternative splitter, currently disabled:
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Plain logistic regression alternative, currently disabled:
# lr_clf = LogisticRegression()
clf = LogisticRegressionCV(cv=cv, max_iter=3000)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the held-out validation split.
clf.score(X_val, y_val)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: a DummyClassifier with the "most_frequent" strategy,
# fitted and scored on the same train/validation data as `clf`.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_val, y_val)

### Investigate Results

In [None]:
# Predict on the validation set and keep the raw texts for inspection.
y_pred = clf.predict(X_val)
X_val_text = entries_embedded["validation"]["text"]

# Collect the positions where prediction and label disagree, then print each
# misclassified entry together with its true and predicted label.
mismatched = [pos for pos in range(len(y_pred)) if y_pred[pos] != y_val[pos]]
for i in mismatched:
    print(f"(label={y_val[i]}, pred={y_pred[i]}) {X_val_text[i]}")

In [None]:
# Per-class metrics on the validation set.
# NOTE(review): presumably label 0 is "Non-Person" and label 1 is "Person" — confirm against the dataset.
print(classification_report(y_val, y_pred, target_names=["Non-Person", "Person"]))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a confusion matrix normalized over the true labels.

    Args:
        y_preds: Predicted labels.
        y_true: Ground-truth labels.
        labels: Display names passed to ``ConfusionMatrixDisplay``.
    """
    # normalize="true" is forwarded unchanged to sklearn's confusion_matrix.
    normalized = confusion_matrix(y_true, y_preds, normalize="true")
    display = ConfusionMatrixDisplay(confusion_matrix=normalized, display_labels=labels)

    # Draw on an explicit 6x6 axes, without a colorbar, two decimals per cell.
    _, axes = plt.subplots(figsize=(6, 6))
    display.plot(ax=axes, cmap="Blues", values_format=".2f", colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()
    
# Predict on the validation features and plot the normalized confusion matrix.
y_preds = clf.predict(X_val)
plot_confusion_matrix(y_preds, y_val, ["Non-Person", "Person"])

In [None]:
# Sanity check: classify one hand-picked encyclopedia entry end to end.
TEST_STRING = "<b>Liedbeck</b> [lid-], Per Jakob, läkare och homöopat, född i Trosa d. 16 Juni 1802, död i Stockholm d. 5 Okt. 1876, blef student i Upsala 1821, med. licentiat 1828 och med"
# TEST_LABEL = 1

# Truncate exactly like `tokenize` does, so a longer test entry cannot exceed
# the model's maximum input length.
test_string_encoded = tokenizer(TEST_STRING, return_tensors="pt", truncation=True)

# Move every tokenizer output onto the model's device.
inputs = {k: v.to(device) for k, v in test_string_encoded.items()}
with torch.no_grad():
    outputs = model(**inputs)

# Hidden state at position 0 ([CLS]) of the single input sequence.
test_string_embedded = outputs.last_hidden_state[:, 0]

# scikit-learn needs a host-side NumPy array: a tensor living on the GPU cannot
# be converted implicitly, so copy it to the CPU first (same as extract_CLS_embeds).
clf.predict(test_string_embedded.cpu().numpy())

### Save Classifier Model

In [None]:
from joblib import dump

# Target file name for the serialized classifier; referenced by the
# (currently commented-out) `dump` call in this notebook.
CLF_MODEL_FILENAME = "logistic_regression.joblib"

In [None]:
# dump(clf, CLF_MODEL_FILENAME)

### Tokenize+Embed the Whole Encyclopedia

In [None]:
# JSON files for the two encyclopedia editions. In the visible notebook they are
# referenced only by the commented-out `load_dataset` call.
FIRST_ED = "data/json/first_ed/first_ed.json"
FOURTH_ED = "data/json/fourth_ed/fourth_ed.json"

In [None]:
# The commented-out lines below are the disabled recipe for building the
# encyclopedia embeddings (load -> tokenize -> torch format -> embed -> save).
# Only the final `load_from_disk` call is active, so the cached result is reused.
# # Load Entire Encyclopedias
# editions_data = load_dataset("json", data_files={"first_ed": FIRST_ED, "fourth_ed": FOURTH_ED})

# # Tokenize
# editions_encoded = editions_data.map(tokenize, batched=True, batch_size=None)
# # Model expects tensors as inputs: convert input_ids and attention_mask to "torch" format
# editions_encoded.set_format("torch", columns=["input_ids", "attention_mask"])
# # Extract first columns of last hidden states (CLS vectors)
# editions_embedded = editions_encoded.map(extract_CLS_embeds, batched=True)

# editions_embedded.save_to_disk("data/json/classification/encyclopedia_embeds.hf")
# Load the previously saved embeddings from disk.
editions_embedded = load_from_disk("data/json/classification/encyclopedia_embeds.hf")

In [None]:
# Display the loaded object as the cell output.
editions_embedded