In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling for the whole notebook: seaborn's "whitegrid" theme and a
# 20x20 default figure size.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (20, 20)

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
from IPython.core.display import display, HTML

import torch
from torch import nn, optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForTokenClassification

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Ask PyTorch to release its cached CUDA memory.
torch.cuda.empty_cache()

So, we know that BERT works and will produce some beautiful embeddings which we can fine-tune. Now we need to put together the training data for the embeddings, using the BERT tokeniser.

# data

### cleaning pipeline

In [None]:
# Load the pretrained "bert-base-cased" tokenizer; do_lower_case=False keeps
# the original casing of the text.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)

# save and load

In [None]:
import pickle

In [None]:
# Load the pre-computed token-id sequences and their per-token target labels.
# Context managers make sure both file handles are closed once the data is read.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# come from a trusted source.
with open("/mnt/efs/wikipedia/token_sequences.pkl", "rb") as f:
    token_sequences = pickle.load(f)
with open("/mnt/efs/wikipedia/target_sequences.pkl", "rb") as f:
    target_sequences = pickle.load(f)

# cont: inspect a random example

In [None]:
# Pick one sequence at random and render it, with every token whose target
# is 1 wrapped in <b> tags.
i = np.random.randint(len(token_sequences))

tokens = tokenizer.convert_ids_to_tokens(token_sequences[i])
targets = target_sequences[i]

output_html = "".join(
    f"<b>{token}</b> " if target == 1 else token + " "
    for token, target in zip(tokens, targets)
)

display(HTML(output_html))

# dataset and dataloader

In [None]:
class SequenceDataset(Dataset):
    """Paired token-id / target-label sequences for per-token classification.

    Sequences with 10 or fewer tokens are dropped at construction time.
    Sequences longer than ``self.lim`` are cropped to a random contiguous
    window of exactly ``self.lim`` tokens each time they are fetched.

    Args:
        token_sequences: indexable collection of token-id sequences.
        target_sequences: indexable collection of label sequences, aligned
            item-for-item (and position-for-position) with ``token_sequences``.
    """

    def __init__(self, token_sequences, target_sequences):
        # Filter with plain lists rather than np.array(...)[mask]: the
        # sequences are ragged, and recent NumPy versions raise instead of
        # silently building an object array from ragged input.
        keep = [ix for ix, seq in enumerate(token_sequences) if len(seq) > 10]
        self.token_sequences = [token_sequences[ix] for ix in keep]
        self.target_sequences = [target_sequences[ix] for ix in keep]
        # maximum number of tokens handed to the model per sequence
        self.lim = 512

    def __getitem__(self, index):
        """Return ``(tokens, targets)`` as two LongTensors of equal length."""
        token_sequence = self.token_sequences[index]
        target_sequence = self.target_sequences[index]

        # if the sequence is too long for the model to handle,
        # grab a random chunk of acceptable length instead
        if len(token_sequence) > self.lim:
            # Valid window starts are 0 .. len - lim inclusive, so the chunk
            # is always exactly `lim` tokens long. The torch RNG is used
            # because DataLoader seeds it separately in every worker process.
            n_starts = len(token_sequence) - self.lim + 1
            start_ix = int(torch.randint(n_starts, (1,)))
            token_sequence = token_sequence[start_ix : start_ix + self.lim]
            target_sequence = target_sequence[start_ix : start_ix + self.lim]

        tokens = torch.LongTensor(token_sequence)
        targets = torch.LongTensor(target_sequence)
        return tokens, targets

    def __len__(self):
        """Number of sequences that survived the minimum-length filter."""
        return len(self.token_sequences)

In [None]:
def collate_fn(batch):
    """Collate ``(tokens, targets)`` pairs into two zero-padded LongTensors.

    The pairs are reordered from longest to shortest token sequence, then
    tokens and targets are each right-padded with 0 up to the longest
    sequence in the batch (batch dimension first).

    Args:
        batch: sequence of ``(tokens, targets)`` tensor pairs.

    Returns:
        ``(tokens, targets)``: two padded LongTensors with matching row order.
    """
    tokens_by_item, targets_by_item = zip(*batch)

    lengths = torch.LongTensor([len(item) for item in tokens_by_item])
    _, order = lengths.sort(dim=0, descending=True)

    def _pad_in_order(items):
        # apply the shared ordering, then pad every row to the longest one
        ordered = [items[ix] for ix in order]
        padded = pad_sequence(sequences=ordered, padding_value=0, batch_first=True)
        return torch.LongTensor(padded)

    return _pad_in_order(tokens_by_item), _pad_in_order(targets_by_item)

In [None]:
# Hold out 20% of the sequences for testing; random_state=42 fixes the seed
# passed to the split.
train_tokens, test_tokens, train_targets, test_targets = train_test_split(
    token_sequences, target_sequences, test_size=0.20, random_state=42
)

In [None]:
# Training data: shuffled mini-batches of 32 sequences, padded by collate_fn
# and loaded by 5 worker processes.
train_dataset = SequenceDataset(train_tokens, train_targets)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=5,
)

In [None]:
# Test data: one sequence per batch, drawn in shuffled order and loaded by
# 5 worker processes.
test_dataset = SequenceDataset(test_tokens, test_targets)

test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=5,
)

# model

In [None]:
# Flatten every training label into one vector and count the two classes.
stacked = np.hstack(train_targets)
b = stacked.sum()  # sum of the labels, i.e. the number of 1s
a = len(stacked) - b  # everything else, i.e. the number of 0s
# Each class is weighted by the *other* class's share of the labels:
# index 0 gets b / (a + b), index 1 gets a / (a + b).
class_weights = torch.Tensor([b, a]) / (b + a)

In [None]:
class LinkLabeller(nn.Module):
    """Per-token binary classifier: a pretrained BERT backbone plus an MLP head.

    The backbone is ``bert-base-cased``. The head maps each 768-wide token
    representation through 768 -> 128 -> 16 -> 2 linear layers, with ReLU
    between them and dropout (p=0.3) before each linear layer.
    """

    def __init__(self):
        super(LinkLabeller, self).__init__()
        self.backbone = BertModel.from_pretrained("bert-base-cased")
        self.head = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 16),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(16, 2),
        )

    def forward(self, token_sequences):
        """Score every token position of a batch of token-id sequences.

        Args:
            token_sequences: LongTensor of token ids, one row per sequence,
                right-padded with 0 (as produced by ``collate_fn``).

        Returns:
            The head's output for every position of the backbone's last
            encoded layer; the final dimension holds the two class scores.
        """
        # every input is a single segment, so all segment ids are 0
        segments = torch.zeros_like(token_sequences)
        # Batches are padded with id 0. Without a mask BERT attends to the
        # padding as if it were real text, so the real tokens' representations
        # would depend on how much padding the batch happened to need.
        attention_mask = (token_sequences != 0).long()
        x, _ = self.backbone(
            token_sequences,
            token_type_ids=segments,
            attention_mask=attention_mask,
        )
        # the backbone returns one entry per encoder layer; keep the last
        return self.head(x[-1])

In [None]:
# Build the model and move its parameters onto the selected device.
model = LinkLabeller().to(device)

In [None]:
# Cross-entropy with the per-class weights computed from the training labels.
loss_function = nn.CrossEntropyLoss(weight=class_weights.to(device))

# train

In [None]:
# Running record of per-batch training losses (appended to by train()).
losses = []
torch.backends.cudnn.benchmark = True

# Freeze the backbone's embedding block and its first ten encoder layers;
# the remaining encoder layers, the pooler and the head stay trainable.
bert = model.backbone
frozen_modules = [bert.embeddings, *bert.encoder.layer[:10]]
for module in frozen_modules:
    for param in module.parameters():
        param.requires_grad = False

# Only hand the optimiser the parameters that still require gradients.
trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
optimiser = optim.Adam(trainable_parameters, lr=0.0001)

In [None]:
def train(model, train_loader, loss_function, optimiser, n_epochs):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader``.

    Every batch loss is appended to the module-level ``losses`` list, and the
    progress bar shows the mean of the most recent 100 batch losses.

    Args:
        model: module mapping a batch of token ids to per-token class scores.
        train_loader: iterable yielding ``(tokens, targets)`` batches.
        loss_function: criterion called as ``loss_function(scores, targets)``
            with the class dimension of ``scores`` in position 1.
        optimiser: optimiser over the model's trainable parameters.
        n_epochs: number of passes over ``train_loader``.
    """
    model.train()
    # Follow the model onto whichever device it lives on rather than
    # hard-coding .cuda(), so the loop also runs on CPU-only machines.
    model_device = next(model.parameters()).device

    for epoch in range(n_epochs):
        loop = tqdm(train_loader)
        loop.set_description("Epoch {}/{}".format(epoch + 1, n_epochs))
        for tokens, targets in loop:
            tokens = tokens.to(model_device, non_blocking=True)
            targets = targets.to(model_device, non_blocking=True)

            optimiser.zero_grad()
            preds = model(tokens)
            # move the class dimension from last place to position 1
            loss = loss_function(preds.permute(0, 2, 1), targets)
            loss.backward()
            optimiser.step()

            losses.append(loss.item())
            loop.set_postfix(loss=np.mean(losses[-100:]))

In [None]:
# Fine-tune for three passes over the training data.
train(model, train_loader, loss_function, optimiser, n_epochs=3)

In [None]:
# Smooth the per-batch losses with a 100-batch rolling mean and plot them,
# anchoring the y-axis at 0.
loss_data = pd.Series(losses).rolling(window=100).mean()
ax = loss_data.plot()
ax.set_ylim(0);

In [None]:
def clean(output_string):
    """Tidy the token-by-token HTML built by ``format_output``.

    Applied in order: adjacent bold spans are merged into one, a ``##``
    prefix directly after an opening ``<b>`` is removed, and every remaining
    `` ##`` is removed so the piece joins the token before it.
    """
    substitutions = (
        ("</b> <b>", " "),
        ("<b>##", "<b>"),
        (" ##", ""),
    )
    for old, new in substitutions:
        output_string = output_string.replace(old, new)
    return output_string

In [None]:
def format_output(tokens, targets, preds):
    """Render predicted and target labels for one sequence as an HTML snippet.

    Each token id is converted back to its token string; tokens whose label
    equals 1 are wrapped in ``<b>`` tags. The predicted rendering comes first,
    followed by the target rendering and a separator line.

    Args:
        tokens: iterable of token ids, each exposing ``.item()``.
        targets: per-token target labels, aligned with ``tokens``.
        preds: per-token predicted labels, aligned with ``tokens``.

    Returns:
        The HTML string.
    """
    pred_parts, target_parts = [], []

    for token_id, target, pred in zip(tokens, targets, preds):
        token = tokenizer.convert_ids_to_tokens([token_id.item()])[0]
        bold = "<b>" + token + "</b> "
        plain = token + " "

        target_parts.append(bold if target == 1 else plain)
        pred_parts.append(bold if pred == 1 else plain)

    sections = [
        "PRED:<br>",
        clean("".join(pred_parts)),
        "<br><br>TARG:<br>",
        clean("".join(target_parts)),
        "<br><br>--------<br><br>",
    ]
    return "".join(sections)

In [None]:
# Render predictions next to targets for the first ten test batches.
output = ""

# Inference: switch dropout off, and send each batch to whichever device the
# model's parameters live on (the inputs previously stayed on the CPU).
model.eval()
model_device = next(model.parameters()).device

with torch.no_grad():
    for i, (tokens, targets) in enumerate(test_loader):
        if i >= 10:
            break
        tokens = tokens.to(model_device)

        # test_loader uses batch_size=1, so row 0 is the whole batch; argmax
        # over the class scores gives the predicted label for each token
        preds = model(tokens)[0].argmax(dim=1)

        output += format_output(tokens[0], targets[0], preds)

display(HTML(output))

# save model

In [None]:
# Location of the model's state_dict (read back below with torch.load).
# NOTE(review): no torch.save(...) call is visible in this notebook -- confirm
# that the checkpoint already exists at this path.
PATH = "/mnt/efs/models/20190222_bert_link_labeller.pt"

# load model for use on cpu

In [None]:
# Rebuild the architecture on the CPU and restore the saved weights into it.
model = LinkLabeller()
model.load_state_dict(torch.load(PATH, map_location="cpu"))
# The cells below only run inference, so disable dropout.
model.eval();

In [None]:
# NOTE(review): repeats the load_state_dict call from the previous cell with
# the same PATH -- looks redundant; confirm before removing.
model.load_state_dict(torch.load(PATH, map_location="cpu"))

In [None]:
# Draw one test sequence and render the CPU model's predictions for it.
tokens, targets = next(iter(test_loader))

# Disable dropout and gradient tracking: this is pure inference.
model.eval()
with torch.no_grad():
    preds = model(tokens)
# argmax over the class scores of the only sequence in the batch
preds = preds[0].argmax(dim=1)
output_html = format_output(tokens[0], targets[0], preds)
display(HTML(output_html))

In [None]:
# Re-run the model on the same `tokens` / `targets` as the previous cell.
# Disable dropout and gradient tracking: this is pure inference.
model.eval()
with torch.no_grad():
    preds = model(tokens)
# argmax over the class scores of the only sequence in the batch
preds = preds[0].argmax(dim=1)
output_html = format_output(tokens[0], targets[0], preds)
display(HTML(output_html))

# try with new text from wellcome domain

In [None]:
# Free-text passage used below as new input for the trained model.
text = """
Last week I attended a colloquium in Berlin, Das Erbe der Berliner Sexualwissenschaft: Eine Fachtagung sexualwissenschaftlicher Archive, commemorating the 80th anniversary of destruction of Magnus Hirschfeld‘s Institut für Sexualwissenschaft by the Nazis on 6 May 1933.

I had been asked to talk about the material we hold in the Wellcome Library relating to Hirschfeld and his legacy and the impact of continental sexual science on British sexologists. There is a small amount of material specifically relating to Hirschfeld in Archives and Manuscripts: like Havelock Ellis, he was a respondent to Dr Josef Strasser’s questionnaire on his career decisions, c. 1930, and his 3-page letter to Strasser and a pamphlet can be found in MS.7042.

There is also a group of photographs of the World League for Sexual Reform (founded by Hirschfeld) Congress in Brno, 1932 among the archives of the Family Planning Association. Charlotte Wolff worked with Hirschfeld in her younger days in Berlin, and her papers among the archives of the British Psychological Society include her research files for her 1986 biography of him, the first to be published in English. The Library also holds copies of several of his works.

I was also able to mention that we hold the papers of Hirschfeld’s important precursor, Richard von Krafft-Ebing, as well as some material on Havelock Ellis, and important early printed works of sexology, including the first edition of Krafft-Ebing’s Psychopathia Sexualis and the German, and first English, editions of Ellis and J A Symond’s Sexual Inversion (the latter is very rare since Symonds’ executor bought up the entire edition to protect the family from scandal and distress). There is also a significant amount in A&M and the Library more generally pertaining to Hirschfeld’s leading British disciple, the Australian doctor Norman Haire.

"""

# Tokenise the passage and wrap the ids as a batch holding one sequence.
# bert-base models have 512 position embeddings, so longer inputs are clipped
# to the same 512-token limit the training dataset uses.
text_tokens = tokenizer.tokenize(text)[:512]
tokens = torch.LongTensor([tokenizer.convert_tokens_to_ids(text_tokens)])

# Disable dropout and gradient tracking: this is pure inference.
model.eval()
with torch.no_grad():
    preds_continuous = model(tokens)
# argmax over the class scores of the only sequence in the batch
preds = preds_continuous[0].argmax(dim=1)
# all-zero tensor passed in place of targets for this passage
output_html = format_output(tokens[0], torch.zeros_like(tokens[0]), preds)
display(HTML(output_html))

In [None]:
# Sum the class scores (axis 2) for every token of the first sequence in the batch.
preds_continuous.detach().numpy().sum(axis=2)[0]

In [None]:
# Bar chart of the same per-token summed class scores, one bar per token position.
pd.Series(preds_continuous.detach().numpy().sum(axis=2)[0]).plot.bar()