In [1]:
import logging
from collections import namedtuple
from pathlib import Path

import yaml
import numpy as np
import pandas as pd

from sklearn.metrics import pairwise_distances

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

PATH = Path("data")

## Load Data

In [2]:
df_keywords = pd.read_csv(PATH / "keywords.csv")
df_keywords["keywords"] = df_keywords["keywords"].apply(yaml.safe_load)

df_meta = pd.read_csv(PATH / "movies_metadata.csv")
df_meta = df_meta[df_meta["id"].str.isnumeric()]
df_meta["id"] = df_meta["id"].astype("int")

  df_meta = pd.read_csv(PATH / "movies_metadata.csv")


In [3]:
df = df_meta.merge(df_keywords, on="id", how="left")
df = df.dropna(subset="keywords").copy()
df["keywords"] = df["keywords"].apply(lambda x: " ".join(x["name"] for x in x))

In [4]:
x = df.iloc[4]
x["original_title"], x["overview"], x["keywords"]

('Father of the Bride Part II',
 "Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.",
 'baby midlife crisis confidence aging daughter mother daughter relationship pregnancy contraception gynecologist')

## Prototyping

In [5]:
SearchResult = namedtuple("SearchResult", "result,embedding")


class EmbeddingSearch:
    def __init__(self, vectors: np.ndarray, embedder: callable):
        self._vectors = vectors
        self.embedder = embedder

    @classmethod
    def from_texts(cls, inputs: list[str], embedder: callable):
        _vectors = cls._create_db(inputs, embedder)
        return cls(_vectors, embedder)

    @staticmethod
    def _create_db(inputs, embedder):
        logging.debug("creating db")
        result = []
        total = len(inputs)
        step = total // 100
        for i, text in enumerate(inputs):
            vec = embedder(text)
            result.append(vec)
            if i % step == 0:
                logging.debug("%s/%s", i, total)
        return result

    def from_pickle(self, path):
        pass

    def get_closest(self, query: str, n: int = 1000) -> list[dict]:
        query_vec = self.embedder(query)

        dist = pairwise_distances(query_vec[None, ...], self._vectors, "cosine")
        dist = dist.ravel()
        idx = np.argsort(dist)[:n]
        result = [{"id": _id, "distance": dist} for _id, dist in zip(idx, dist[idx])]

        return SearchResult(result, query_vec)

    def get_rerank(self, labeling: list[dict]):
        pass

In [6]:
import fasttext

# Load pre-trained model
model_path = "fasttext/cc.en.300.bin"
model = fasttext.load_model(model_path)

AttributeError: module 'fasttext' has no attribute 'load_model'

In [None]:
engine = EmbeddingSearch.from_texts(df["keywords"], model.get_sentence_vector)

In [None]:
result = engine.get_closest("programmer internet lifehack killer", 1000)

df.iloc[[x["id"] for x in result.result]]