In [None]:
import gzip
import json
import os
import shutil
from pathlib import Path

import httpx
import spacy
from tqdm.notebook import tqdm

# download model

In [None]:
# Fetch the spaCy "en_core_web_md" pipeline that a later cell loads with spacy.load.
# NOTE(review): `python` here is whatever is first on PATH, which may not be the
# kernel's interpreter — confirm the model lands in the right environment.
! python -m spacy download en_core_web_md

# demo embedding and similarity

In [None]:
# Load the pipeline once; `nlp` is reused by every later cell that embeds text.
nlp = spacy.load("en_core_web_md")

In [None]:
# Three sentences that all contain the word "Apple", each parsed into a spaCy doc.
apple1, apple2, apple3 = (
    nlp(sentence)
    for sentence in (
        "Apple shares rose on the news.",
        "Apple sold fewer iPhones this quarter.",
        "Apple pie is delicious.",
    )
)

In [None]:
# Pairwise similarity of the three example docs, one score per line.
for left, right in ((apple1, apple2), (apple1, apple3), (apple2, apple3)):
    print(left.similarity(right))

In [None]:
# Display the raw document vector behind the similarity scores.
apple1.vector

- encode all the titles
- produce vectors for all of them
- train an LSH model on those vectors

# download catalogue

In [None]:
# Directory that holds the downloaded catalogue snapshot.
data_dir = Path("../data")

# exist_ok makes the cell safe to re-run without a separate (racy) exists() check;
# parents creates any missing intermediate directories.
data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Location of the gzipped works snapshot to download.
url = "https://data.wellcomecollection.org/catalogue/v2/works.json.gz"
# Last path segment of the URL ("works.json.gz"), reused as the local file name.
filename = Path(url).name
zipped_works_file_path = data_dir / filename
# `.stem` drops only the final suffix, so this is the same name without ".gz".
works_file_path = data_dir / zipped_works_file_path.stem

In [None]:
# Download and decompress the catalogue snapshot, skipping any step whose output
# already exists. Each step writes to a temporary ".part" file and renames it only
# on success, so an interrupted run never leaves a truncated file that a later
# run would mistake for a finished one.
if not works_file_path.exists():
    if not zipped_works_file_path.exists():
        partial_zip_path = zipped_works_file_path.with_name(
            zipped_works_file_path.name + ".part"
        )
        with httpx.stream("GET", url, timeout=999999) as response:
            # Fail loudly on 4xx/5xx instead of saving an error body as the archive.
            response.raise_for_status()
            # Content-Length is optional; tqdm shows an open-ended bar for None.
            content_length = response.headers.get("Content-Length")
            total = int(content_length) if content_length is not None else None
            with open(partial_zip_path, "wb") as download_file:
                with tqdm(
                    total=total,
                    unit_scale=True,
                    unit_divisor=1024,
                    unit="B",
                    desc=filename,
                ) as progress:
                    num_bytes_downloaded = response.num_bytes_downloaded
                    for chunk in response.iter_bytes():
                        download_file.write(chunk)
                        # Advance the bar by the bytes received since the last chunk.
                        progress.update(
                            response.num_bytes_downloaded - num_bytes_downloaded
                        )
                        num_bytes_downloaded = response.num_bytes_downloaded
        # Only now does the archive appear under its final name.
        partial_zip_path.replace(zipped_works_file_path)

    partial_works_path = works_file_path.with_name(works_file_path.name + ".part")
    with gzip.open(zipped_works_file_path, "rb") as f_in:
        with open(partial_works_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    partial_works_path.replace(works_file_path)

In [None]:
def load_records(path):
    """Lazily yield one parsed JSON value per non-blank line of a file.

    Args:
        path: Path of a text file containing one JSON document per line.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                yield json.loads(line)

In [None]:
# Lazy iterator over the catalogue records; nothing is read until it is advanced.
generator = load_records(works_file_path)

In [None]:
# Pull the next record off the stream and preview the vector for its title.
record = next(generator)
nlp(record["title"]).vector

In [None]:
import numpy as np

In [None]:
# Map each record id to its title and the vector spaCy produces for that title.
docs = {}
records = tqdm(load_records(works_file_path), total=1151916)
for record in records:
    title = record["title"]
    docs[record["id"]] = {"title": title, "embedding": nlp(title).vector}

In [None]:
# Stack the per-document vectors along a new leading axis: one row per entry
# in `docs`, in insertion order.
embeddings = np.stack([doc["embedding"] for doc in docs.values()])

In [None]:
# Preview the stacked embeddings.
embeddings

In [None]:
# Check the array dimensions before training on it.
embeddings.shape

# lsh model

In [None]:
import os
import pickle

from sklearn.cluster import KMeans


class LSHEncoder:
    """Turn feature vectors into per-group KMeans cluster tokens.

    The feature columns are split into equal-width groups and one KMeans model
    is fitted per group. Encoding a vector yields one
    "<group index>-<cluster id>" string per group.
    """

    def __init__(self, model_path=None):
        """Create an encoder, optionally restoring previously pickled models.

        Args:
            model_path: Path of a pickle file holding the list of fitted
                models. When falsy, the encoder starts with no models.
        """
        if model_path:
            # SECURITY: pickle.load can execute arbitrary code. Only load model
            # files that come from a trusted source.
            with open(model_path, "rb") as f:
                self.models = pickle.load(f)
        else:
            self.models = []

    @staticmethod
    def encode_for_elasticsearch(clusters):
        """Format one row of cluster ids as "<position>-<cluster id>" strings.

        Args:
            clusters: Iterable of cluster ids, one per feature group.

        Returns:
            A list of strings, one per element of ``clusters``.
        """
        return [f"{i}-{val}" for i, val in enumerate(clusters)]

    def __call__(self, feature_vectors):
        """Encode a batch of feature vectors.

        Args:
            feature_vectors: 2-D array whose column count is divisible by the
                number of models.

        Returns:
            A list with one entry per input row; each entry is the list of
            tokens produced by ``encode_for_elasticsearch``.

        Raises:
            ValueError: If the encoder has no models, or if the columns cannot
                be split evenly between the models (raised by ``np.split``).
        """
        if not self.models:
            # Without this guard np.split fails with an opaque ZeroDivisionError.
            raise ValueError(
                "LSHEncoder has no models; call train() or pass model_path first"
            )

        feature_groups = np.split(feature_vectors, len(self.models), axis=1)

        clusters = np.stack(
            [
                model.predict(feature_group)
                for model, feature_group in zip(self.models, feature_groups)
            ],
            axis=1,
        )

        return [LSHEncoder.encode_for_elasticsearch(c) for c in clusters]

    def train(self, feature_vectors, m, n, random_state=None):
        """Fit one KMeans model per column group and store them on the encoder.

        Args:
            feature_vectors: 2-D array of training vectors.
            m: Number of clusters for each KMeans model.
            n: Number of equal-width column groups to split the features into.
            random_state: Optional seed forwarded to KMeans so that training
                can be made reproducible. Defaults to None (unseeded).

        Raises:
            ValueError: If the columns cannot be split into ``n`` equal groups
                (raised by ``np.split``).
        """
        feature_groups = np.split(feature_vectors, indices_or_sections=n, axis=1)
        # `n_jobs` is deliberately not passed: it was removed from KMeans in
        # scikit-learn 1.0 and passing it raises TypeError there.
        self.models = [
            KMeans(n_clusters=m, random_state=random_state).fit(feature_group)
            for feature_group in tqdm(feature_groups)
        ]

In [None]:
# Start from an empty encoder: no model_path is given, so nothing is unpickled.
lsh = LSHEncoder()

In [None]:
# Positional arguments are (feature_vectors, m, n): m=20 clusters per KMeans
# model, n=10 equal-width column groups.
lsh.train(embeddings, 20, 10)

In [None]:
# Encode the first ten embeddings as a quick look at the encoder's output.
lsh(embeddings[:10])