In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import datetime
import pathlib as pl
from random import randrange, shuffle, randint
import random
from collections import Counter
import itertools
import statistics

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import DataLoader

import spacy
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer, Encoding
from transformers import RobertaTokenizerFast, BertTokenizerFast, BatchEncoding

import seqeval
from seqeval import metrics

import datasets

from tqdm.auto import tqdm  # for our loading bar
import plotly.express as px

# Local modules
from masking_dataset import MaskingDataset
from ner_dataset import NerDataset


In [3]:
today = datetime.date.today().strftime("%Y%m%d")

In [4]:
data_dir = pl.Path.cwd() / "data"
model_dir = pl.Path.cwd() / "model"
plot_dir = pl.Path.cwd() / "plot"
plot_dir.mkdir(parents=True, exist_ok=True)

oscar_dir = data_dir / "oscar"
oscar_dir.mkdir(parents=True, exist_ok=True)

tokenizer_dir = model_dir / "tokenizer"
tokenizer_dir.mkdir(parents=True, exist_ok=True)

In [5]:
def flatten(l):
    return itertools.chain.from_iterable(l)

def get_line_count(file_path):
    with open(file_path, 'r') as file:
        line_count = sum(1 for _ in file)
    return line_count

In [6]:
torch.manual_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
vocab_size = 3000
batch_size = 12
block_size = 256  # Number of continues tokens (history) to use for training
beta_1 = 0.9
beta_2 = 0.999
epsilon = 1e-6
learning_rate = 1e-4
weight_decay = 0.01
warm_up = 10000
drop_out = 0.1

In [8]:
%%script echo skipping
dataset = datasets.load_dataset('oscar', 'unshuffled_deduplicated_da')

skipping


## Clean and split text into filechunks of 5000 samples per file

In [9]:
%%script echo skipping
nlp = spacy.load('da_core_news_sm')
def split_sentences(string):
    return (' '.join(str(s).strip().split()) for s in nlp(string).sents)

def remove_samples_with_invalid_chars(samples, approved_chars):
    return (split_sentences(s) for s in samples if all(char in approved_chars for char in s))

def remove_sentences_with_invalid_chars(sample, approved_chars):
    sentences = split_sentences(sample)
    valid_sentences = (sentence for sentence in sentences if len(sentence) > 5 and all(char in approved_chars for char in sentence))
    return valid_sentences

def chunkify(iterable, size):
    iterator = iter(iterable)
    for first in iterator:
        yield itertools.chain([first], itertools.islice(iterator, size - 1))

skipping


In [10]:
%%script echo skipping
approved_chars = {' ', 'e', 'r', 'n', 't', 'i', 'd', 'a', 's', 'l', 'o', 'g', 'k', 'm', 'v', 'f', 'u', 'p', 'b', 'h', '.', ',', 'å', 'æ', 'ø', 'j', 'y', 'D', 'c', '\n', 'S', '1', '0', '2', '-', 'E', 'A', 'H', 'I', 'F', 'M', 'K', 'B', 'T', 'V', 'N', 'R', 'L', 'P', ':', 'J', 'O', '3', '5', 'G', '4', ')', 'U', '9', '(', 'C', '8', '6', '7', 'x', 'w', '/', '?', '!', '"', '–', '”', 'z', 'Ø', 'W', 'é', 'Å', 'Y', '|', '…', '&', 'X', "'", ';', '’', 'q', '“', 'Æ', '<', '[', '>', ']', '%', 'Z', 'Q', '@', '+', 'ö', '´', '·', '_', 'ä', '*', '‘', 'ü', '§', 'Ã', '一', '½', '—', '$', '=', 'á', '°', '{', '}', 'à', '^', 'è', '~', 'É', 'Ö', 'â', 'ã', 'ß', 'ô', '€', 'ò', 'Ž', '`', 'Ä', 'ù', 'õ', 'Ü', '£', 'ë', '¼'}

skipping


In [11]:
%%script echo skipping

s = "Jeg havde en god tid med Hr. Tim. Han var meget omsorgsful, f.eks. købte han mine nye sko. Han er en rigtig gutterman. Jeg har ikke nogen gode ideer til aftensmad. Quinn har taget en god lur. Emil er god til at bygge med LEGO."
#ss = split_sentences(s)
#fss = str(next(ss))
rss = remove_sentences_with_invalid_chars(s, approved_chars)
chunkify(rss, 3)
list(rss)

skipping


In [12]:
%%script echo skipping

clean_samples = remove_samples_with_invalid_chars(dataset['train'][:100_000]['text'], approved_chars)

oscar_today_dir = oscar_dir / today
oscar_today_dir.mkdir(exist_ok=True)
#'</s>'.join(next(chunkfied_samples))
for i, text_data in tqdm(enumerate(clean_samples)):
    with open(oscar_today_dir / f'clean_samples_{i}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))

skipping


In [13]:
%%script echo skipping

clean_text = (remove_sentences_with_invalid_chars(sample, approved_chars) for sample in tqdm(dataset['train'][:100_000]['text']))
samples = chunkify(flatten(clean_text), 5000)

for i, text_data in tqdm(enumerate(samples)):
    with open(oscar_dir / f'clean_text_{i}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data).lower())

skipping


In [14]:
oscar_20230630_dir = oscar_dir / "20230630"
paths = list((oscar_20230630_dir).glob('**/clean_samples_*.txt'))

In [None]:
from tiktoken import _educational as ttedu


str_pattern = (
    r""" ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

t = ttedu.

In [15]:
%%script echo skipping
tokenizer = ByteLevelBPETokenizer()
d = {'<pad>': 'padding',
     '<mask>': 'mask',
     '<unk>': 'unknown',
     '<b_addr>': 'begin address',
     '<i_addr>': 'intermediate address',
     '<b_price>': 'begin price',
     '<i_price>': 'intermediate price',
     '<b_bool>': 'begin of yes/no answer to question',
     '<i_bool>': '',
     '<b_type>': 'begin property type',
     '<i_type': '',
     '<b_date>': 'begin date',
     '<i_date>': ''}
tokenizer.train(files=list(map(str, paths)), vocab_size=vocab_size, min_frequency=2,
                special_tokens=list(d.keys()))

tokenizer.save_model(str(model_dir / 'byte_level_tokenizer'))

skipping


In [16]:
%%script echo skipping
tokenizer = BertWordPieceTokenizer()
d = {'<pad>': 'padding',
     '<mask>': 'mask',
     '<unk>': 'unknown',
     '<b_addr>': 'begin address',
     '<i_addr>': 'intermediate address',
     '<b_price>': 'begin price',
     '<i_price>': 'intermediate price',
     '<b_bool>': 'begin of yes/no answer to question',
     '<i_bool>': '',
     '<b_type>': 'begin property type',
     '<i_type': '',
     '<b_date>': 'begin date',
     '<i_date>': ''}
tokenizer.train(files=list(map(str, paths)), vocab_size=vocab_size, min_frequency=2,
                special_tokens=list(d.keys()))

bert_tokenizer_path = model_dir / 'bert_tokenizer'
bert_tokenizer_path.mkdir(parents=True, exist_ok=True)
tokenizer.save_model(str(bert_tokenizer_path))

skipping


In [17]:
roberta = RobertaTokenizerFast.from_pretrained('model/byte_level_tokenizer')
#bert = BertTokenizerFast.from_pretrained('model/bert_tokenizer')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
%%script echo skipping

with open(oscar_dir / "clean_text_1.txt", "r") as file_path:
    lsamples = file_path.readlines()

with open(oscar_dir / "clean_text_2.txt", "r") as file_path:
    lsamples2 = file_path.readlines()

with open(oscar_dir / "clean_samples_0.txt", "r") as file_path:
    lsamples3 = file_path.readlines()

lsamples = lsamples + lsamples2 + lsamples3

first_sample = lsamples[0]
first_sentences = [split_sentences(s) for s in lsamples]
first_sentences = flatten(first_sentences)
counter = Counter(len(t) for t in first_sentences)
print(counter)
print("Minimum length:", min(counter))
print("Maximum length:", max(counter))
avg_all = flatten([[k]*v for k,v in counter.items()])
print("Average length:", sum(avg_all) / len(avg_all))
print("Total count:", sum(counter.values()))

df = pd.DataFrame.from_dict(dict(counter), orient='index').reset_index()
df = df.rename(columns={"index": "sentence_len", 0: "count"})

skipping


In [19]:
test_strs = ["Jeg havde en god tid med Hr. Tim. Han var meget omsorgsful, f.eks. købte han mine nye sko.", "Han er en rigtig gutterman.", "Jeg har ikke nogen gode ideer til aftensmad.","Quinn har taget en god lur.", "Emil er god til at bygge med LEGO."]

In [20]:
samples = []
for file_path in paths:
    with open(file_path, 'r') as f:
        lines = [line.rstrip() for line in f]
        samples.append(lines)

In [21]:
training_data, test_data = train_test_split(samples, test_size=0.20, random_state=42)

In [22]:
training_dataset = MaskingDataset(training_data, roberta, block_size)
validation_dataset = MaskingDataset(test_data, roberta, block_size)

train_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)
training_data = iter(train_dataloader)

In [None]:
for i in tqdm(range(len(training_dataset))):
    sample, mask, attention_mask = training_dataset[i]

In [48]:

ner_samples = [["Hej, mit navn er Tim.", "Jeg bor i Vejle."],["Jeg hedder Emil.", "Jeg bor i min fars og mors hus.", "Det ligger ved Søndermarken."]]
ner_labels = [[[0, 0, 0, 0, 1],[0, 0, 0, 2]], [[0,0,1],[0,0,0,0,0,0,0,0], [0,0,0,2]]]  # 1 - name, 2 - location
ner_training_data, ner_test_data, ner_training_labels, ner_test_labels = train_test_split(ner_samples, ner_labels, test_size=0.20, random_state=42) # Insert NER samples here

ner_training_dataset = NerDataset(ner_training_data, ner_training_labels, roberta, block_size)
ner_validation_dataset = NerDataset(ner_test_data, ner_test_labels, roberta, block_size)

ner_train_dataloader = DataLoader(ner_training_dataset, batch_size=2, shuffle=True)
ner_validation_dataloader = DataLoader(ner_validation_dataset, batch_size=2, shuffle=True)
ner_sample, ner_label, ner_attention_mask = next(iter(ner_train_dataloader))

In [47]:
roberta.decode(ner_sample[1])

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [20]:
def subword_level_alignment(offset_mapping):
    token_idx = []
    count = 0
    prev = 0
    for om in offset_mapping:
        if om == (0,0):
            token_idx.append(-1)
            continue
        start, nprev = om
        # There is a space between prev and current token, if there is an offset of 1
        count += int(start == prev+1)
        token_idx.append(count)
        prev = nprev
    return token_idx, count+1

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Function to tokenize and align labels with respect to the tokens. This function is specifically designed for
    Named Entity Recognition (NER) tasks where alignment of the labels is necessary after tokenization.

    Parameters:
    examples (dict): A dictionary containing the tokens and the corresponding NER tags.
                     - "tokens": list of words in a sentence.
                     - "ner_tags": list of corresponding entity tags for each word.

    label_all_tokens (bool): A flag to indicate whether all tokens should have labels.
                             If False, only the first token of a word will have a label,
                             the other tokens (subwords) corresponding to the same word will be assigned -100.

    Returns:
    tokenized_inputs (dict): A dictionary containing the tokenized inputs and the corresponding labels aligned with the tokens.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `<s>` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
def compute_metrics(eval_preds):
    """
    Function to compute the evaluation metrics for Named Entity Recognition (NER) tasks.
    The function computes precision, recall, F1 score and accuracy.

    Parameters:
    eval_preds (tuple): A tuple containing the predicted logits and the true labels.

    Returns:
    A dictionary containing the precision, recall, F1 score and accuracy.
    """
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
        [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    results = metrics.performance_measure(true_labels, predictions)

    results = metrics.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Head

In [None]:
class Head(nn.Module):
    def __init__(self, head_size, n_embed, dropout):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)  # Stores "what I am/have"
        self.query = nn.Linear(n_embed, head_size, bias=False)  # Stores "what am I looking for/interested in"
        self.value = nn.Linear(n_embed, head_size, bias=False)  # Stores "If you find me interesting, here is what I will communicate to you"
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask = None):
        B, C = x.shape # batch, n_embed,
        query = self.query(x)
        key = self.key(x)
        # Self attention weights
        wei = query @ key.transpose(-2,-1) * C ** -0.5
        wei = wei.masked_fill_(attention_mask, float('-inf'))
        attn = F.softmax(wei, dim=-1)
        attn = self.dropout(attn)

        # Perform the weighted aggregation of values
        value = self.value(x)
        context = attn @ value
        return context

## MultiHead

In [None]:
class MultiHead(nn.Module):

    def __init__(self, num_heads, head_size, n_embed, dropout):
        super().__init__()
        self.heads = nn.ModuleList((Head(head_size, n_embed, dropout) for _ in range(num_heads)))
        self.proj = nn.Linear(n_embed, n_embed)
        #self.norm = nn.LayerNorm(n_embed, n_embed) <- Norm is just a part of Block instead
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.proj(x)
        x = self.dropout(x)
        return x

In [None]:
class FeedForward(nn.Module):

    """
    A simple layer followed by a non-linearity
    """
    def __init__(self, n_embed, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.Dropout(dropout),
            nn.GELU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [None]:
class Block(nn.Module):

    """
    Intersperse communication and computation
    """
    def __init__(self, num_heads, n_embed, dropout):
        super().__init__()
        head_size = n_embed // num_heads
        self.ln1 = nn.LayerNorm(n_embed)
        self.sa_heads = MultiHead(num_heads, head_size, n_embed, dropout)
        self.ln2 = nn.LayerNorm(n_embed)
        self.ffwd = FeedForward(n_embed, dropout)

    def forward(self, x):
        x = self.ln1(x)
        x = x + self.sa_heads(x)
        x = self.ln2(x)
        x = x + self.ffwd(x) # (B, T, C) - each token thinks individually
        return x

In [None]:
class Encoder(nn.Module):
    def __init__(self, num_layers, num_heads, block_size, vocab_size, n_embed, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        self.position_embedding = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*((Block(num_heads, n_embed, dropout)) for _ in range(num_layers)))
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, x, y):
        B, T = x.shape
        token_embed = self.token_embedding(x) # (batch_size, time_size, C)
        pos_embed = self.position_embedding(torch.arange(T, device=x.device)) # (time_size, C)
        x = token_embed + pos_embed # tensors gets batch aligned, so pos_embed: (batch_suze, time_size, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (batch_size, time_size, vocab_size)

        if y is None:
            return logits, None

        batch_size, block_size, vocab_size = logits.shape
        # Change layout to use cross_entropy (as it expects (batch, channel))
        logits = logits.view(batch_size * block_size, vocab_size) # Flatten two first dims
        y = y.view(batch_size*block_size) # Flatten
        loss = F.cross_entropy(logits, y)
        return logits, loss

In [None]:
import torch
import torch.nn as nn
from torch.nn import MultiheadAttention

def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

class Embedding(nn.Module):
    #def __init__(self, num_layers, num_heads, block_size, vocab_size, n_embed, dropout):
    def __init__(self, block_size, vocab_size, n_segments, n_embed):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embed)
        #self.position_embedding = nn.Embedding(block_size, n_embed)
        self.segment_embedding = nn.Embedding(n_segments, n_embed)  # segment(token type) embedding
        self.norm = nn.LayerNorm(n_embed)

    def forward(self, input_tensor):
        sentence_size = input_tensor.size(-1)
        pos_tensor = self.attention_position(self.size, input_tensor)

        segment_tensor = torch.zeros_like(input_tensor).to(device)
        segment_tensor[:, sentence_size // 2 + 1:] = 1

        token_embed = self.token_embedding(input_tensor)
        segment_embed = self.segment_embedding(segment_tensor)
        x = token_embed + segment_embed + pos_tensor
        x = self.norm(x)
        return x

    """
    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)
    """
    def attention_position(self, dim, input_tensor):
        B, T, S = input_tensor.shape
        sentence_size = input_tensor.size(-1)

        pos = torch.arange(sentence_size, dtype=torch.long, device=device)
        d = torch.arange(dim, dtype=torch.long, device=device)
        d = (2 * d / dim)

        pos = pos.unsqueeze(1)
        pos = pos / (1e4 ** d)

        pos[:, ::2] = torch.sin(pos[:, ::2])
        pos[:, 1::2] = torch.cos(pos[:, 1::2])

        return pos.expand(B, *pos.size())

    def numeric_position(self, dim, input_tensor):
        pos_tensor = torch.arange(dim, dtype=torch.long).to(device)
        return pos_tensor.expand_as(input_tensor)


class EncoderLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn


class MultiHeadAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)

    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = F.scaled_dot_product_attention(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)


        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]