# Vector creation
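This notebook shows the implementation of the vector database for word, sentence and document embeddings. The cells below load the case data, clean and re-segment the OCR text of each page, compute an SBERT embedding for each sentence (deduplicated per document), and save the result as a pickled DataFrame.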

In [None]:
import jsonlines
# NOTE: looks like a placeholder — replace with the location of the JSON Lines case file.
path = "path_to_kripos_case.jsonl"
# Read every record of the file into memory as a list.
with jsonlines.open(path) as reader:
    data = list(reader)

In [None]:
# Inspect the first record; its "pages" entry is iterated further down as
# (page, sentences) pairs.
example = data[0]
example["pages"]

In [None]:
import re

class LovdataPreprocessor:
    """Remove legal-reference noise from text and normalise its whitespace.

    The instance exposes three compiled regular expressions:

    * ``law_match``  -- section references such as "§ 12" or "§ 12-3"
    * ``dash_match`` -- a single word enclosed in en dashes ("– ord –")
    * ``num_match``  -- dotted numbering such as "1.2" or "1.2.3"; it is
      compiled here but ``preprocess`` does not apply it
    """

    def __init__(self):
        self.law_match = re.compile(r"§\s*\d+(-\d+)?")
        self.dash_match = re.compile(r"–\s*\w+\s*–")
        self.num_match = re.compile(r"\d+\.\d+(\.\d+)?")

    def preprocess(self, text):
        """Return *text* without section references and en-dash-enclosed words.

        After the removals every run of whitespace is collapsed to a single
        space. Leading and trailing whitespace is collapsed too, not stripped.
        """
        for pattern in (self.law_match, self.dash_match):
            text = pattern.sub("", text)
        return re.sub(r"\s+", " ", text)


preprocessor = LovdataPreprocessor()

In [None]:
import spacy
# Load the "nb_core_news_lg" spaCy pipeline; it is used further down to count
# noun chunks and named entities per sentence.
nlp = spacy.load("nb_core_news_lg")

In [None]:
import sys
# Put the parent directory on the import path so the project-local `models`
# package can be imported from this notebook.
sys.path.append("../")
from models.sbert import load, get_centrality, similarity_search
# Load the sentence-embedding model (presumably from a local checkout of
# nb-sbert-base next to this directory — confirm the path).
sbert = load(model_or_path="../nb-sbert-base")

In [None]:
from tqdm import tqdm
from collections import defaultdict
from nltk.tokenize import sent_tokenize

# Build, for every document, the list of its distinct sentences together with
# where they occur and how many noun phrases / named entities they contain.
#
# parsed_docs maps a document id to a list of records with the keys:
#   id        -- "<doc id>_<page>_<sent_id>" of the first occurrence
#   page_id   -- every page on which the sentence text was found
#   sent_id   -- the sentence's index on the corresponding page (parallel to
#                page_id; indices count the short sentences skipped below too)
#   num_nps   -- number of spaCy noun chunks in the sentence
#   num_ents  -- number of spaCy entities in the sentence
#   sent_text -- the sentence text as returned by spaCy
parsed_docs = defaultdict(list)

# doc id -> {sentence text -> its record in parsed_docs[doc id]}.
# Lets a repeated sentence be found with one dict lookup instead of scanning
# every record stored so far for the document.
seen_by_doc = defaultdict(dict)

whitespace = re.compile(r"\s+")

for doc in tqdm(data):
    _id = doc["id"]
    seen = seen_by_doc[_id]
    for page, page_sentences in doc["pages"].items():
        cleaned = [
            preprocessor.preprocess(s) for s in page_sentences if s is not None
        ]
        # The sentence boundaries in the OCR data are not trusted: rejoin the
        # page into one string and let the tokenizer split it again.
        raw_text = whitespace.sub(" ", " ".join(cleaned))
        for sent_id, sent_str in enumerate(
            sent_tokenize(raw_text, language="norwegian")
        ):
            # Skip fragments of 10 characters or fewer (this covers "" too).
            if len(sent_str) <= 10:
                continue
            sent_doc = nlp(sent_str)
            sent_text = sent_doc.text

            # Sentence already seen in this document: only record the extra
            # page / sentence position on the existing record.
            record = seen.get(sent_text)
            if record is not None:
                record["page_id"].append(page)
                record["sent_id"].append(sent_id)
                continue

            # The noun-phrase and entity counts are stored so that matches can
            # be weighted by them later (e.g. 2x per entity, 1.5x per noun
            # phrase).
            record = {
                "id": f"{_id}_{page}_{sent_id}",
                "page_id": [page],
                "sent_id": [sent_id],
                "num_nps": len(list(sent_doc.noun_chunks)),
                "num_ents": len(sent_doc.ents),
                "sent_text": sent_text,
            }
            seen[sent_text] = record
            parsed_docs[_id].append(record)
            

In [None]:
import pandas as pd
# Flatten the per-document record lists into one table: one row per record.
rows = []
for doc_records in parsed_docs.values():
    rows.extend(doc_records)
df = pd.DataFrame(rows)
df.head()

In [None]:
# Sanity check: (number of sentence records, number of columns).
df.shape

In [None]:
# Compute SBERT embeddings for every sentence record, in DataFrame row order.
all_sentences = df["sent_text"].tolist()
sbert_embeddings = sbert.encode(all_sentences, show_progress_bar=True)

In [None]:
# Store one embedding per row as a plain Python list (presumably
# 768-dimensional, going by the column name — confirm against the model).
df["sbert_768"] = sbert_embeddings.tolist()

In [None]:
# Preview the frame again, now including the "sbert_768" embedding column.
df.head()

In [None]:
import os
# Persist the frame (sentence text, positions, counts and embeddings) as a
# pickle, creating the target directory first if it does not exist yet.
output_dir = "../data/vectors"
os.makedirs(output_dir, exist_ok=True)
df.to_pickle(f"{output_dir}/dataframe.pkl")