In [1]:
import logging
from collections import namedtuple
from pathlib import Path

import yaml
import numpy as np
import pandas as pd

from sklearn.metrics import pairwise_distances

logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

PATH = Path("data")

## Load Data

In [2]:
df_keywords = pd.read_csv(PATH / "keywords.csv")
df_keywords["keywords"] = df_keywords["keywords"].apply(yaml.safe_load)

df_meta = pd.read_csv(PATH / "movies_metadata.csv")
df_meta = df_meta[df_meta["id"].str.isnumeric()]
df_meta["id"] = df_meta["id"].astype("int")

  df_meta = pd.read_csv(PATH / "movies_metadata.csv")


In [3]:
df = df_meta.merge(df_keywords, on="id", how="left")
df = df.dropna(subset="keywords").copy()
df["keywords"] = df["keywords"].apply(lambda x: " ".join(x["name"] for x in x))

In [4]:
x = df.iloc[4]
x["original_title"], x["overview"], x["keywords"]

('Father of the Bride Part II',
 "Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.",
 'baby midlife crisis confidence aging daughter mother daughter relationship pregnancy contraception gynecologist')

## Prototyping

In [5]:
SearchResult = namedtuple("SearchResult", "result,embedding")


class EmbeddingSearch:
    def __init__(self, vectors: np.ndarray, embedder: callable):
        self._vectors = vectors
        self.embedder = embedder

    @classmethod
    def from_texts(cls, inputs: list[str], embedder: callable):
        _vectors = cls._create_db(inputs, embedder)
        return cls(_vectors, embedder)

    @staticmethod
    def _create_db(inputs, embedder):
        logging.debug("creating db")
        result = []
        total = len(inputs)
        step = total // 100
        for i, text in enumerate(inputs):
            vec = embedder(text)
            result.append(vec)
            if i % step == 0:
                logging.debug("%s/%s", i, total)
        return result

    def from_pickle(self, path):
        pass

    def get_closest(self, query: str, n: int = 1000) -> list[dict]:
        query_vec = self.embedder(query)

        dist = pairwise_distances(query_vec[None, ...], self._vectors, "cosine")
        dist = dist.ravel()
        idx = np.argsort(dist)[:n]
        result = [{"id": _id, "distance": dist} for _id, dist in zip(idx, dist[idx])]

        return SearchResult(result, query_vec)

    def get_rerank(self, labeling: list[dict]):
        pass

In [6]:
import fasttext

# Load pre-trained model
model_path = "fasttextmodel/cc.en.300.bin"
model = fasttext.load_model(model_path)



In [7]:
fasttext

<module 'fasttext' from 'd:\\pytorchprojects\\filmsearcher\\venv\\Lib\\site-packages\\fasttext\\__init__.py'>

In [8]:
engine = EmbeddingSearch.from_texts(df["keywords"], model.get_sentence_vector)

2023-05-08 01:24:05,538 - DEBUG - creating db
2023-05-08 01:24:05,565 - DEBUG - 0/46482
2023-05-08 01:24:05,588 - DEBUG - 464/46482
2023-05-08 01:24:05,610 - DEBUG - 928/46482
2023-05-08 01:24:05,637 - DEBUG - 1392/46482
2023-05-08 01:24:05,660 - DEBUG - 1856/46482
2023-05-08 01:24:05,688 - DEBUG - 2320/46482
2023-05-08 01:24:05,713 - DEBUG - 2784/46482
2023-05-08 01:24:05,736 - DEBUG - 3248/46482
2023-05-08 01:24:05,763 - DEBUG - 3712/46482
2023-05-08 01:24:05,786 - DEBUG - 4176/46482
2023-05-08 01:24:05,810 - DEBUG - 4640/46482
2023-05-08 01:24:05,833 - DEBUG - 5104/46482
2023-05-08 01:24:05,854 - DEBUG - 5568/46482
2023-05-08 01:24:05,876 - DEBUG - 6032/46482
2023-05-08 01:24:05,896 - DEBUG - 6496/46482
2023-05-08 01:24:05,923 - DEBUG - 6960/46482
2023-05-08 01:24:05,946 - DEBUG - 7424/46482
2023-05-08 01:24:05,967 - DEBUG - 7888/46482
2023-05-08 01:24:05,986 - DEBUG - 8352/46482
2023-05-08 01:24:06,005 - DEBUG - 8816/46482
2023-05-08 01:24:06,023 - DEBUG - 9280/46482
2023-05-08 01:

In [9]:
result = engine.get_closest("programmer internet lifehack killer", 1000)

df.iloc[[x["id"] for x in result.result]]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
9172,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,10429,tt0159784,en,Takedown,Kevin Mitnick is quite possibly the best hacke...,...,0.0,92.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The world has a right to know.,Takedown,False,6.3,56.0,hacker internet hacking computer hacker
167,False,,20000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,10428,tt0113243,en,Hackers,"Along with his new friends, a teenager who was...",...,7563728.0,107.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"There is no right or wrong, only fun and boring",Hackers,False,6.2,406.0,female nudity hacker nudity computer virus vir...
18330,False,,90000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",http://dragontattoo.com/,65754,tt1568346,en,The Girl with the Dragon Tattoo,This English-language adaptation of the Swedis...,...,232617430.0,158.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Evil shall with evil be expelled.,The Girl with the Dragon Tattoo,False,7.2,2479.0,rape journalist based on novel journalism hack...
43364,False,,0,[],,177979,tt0091742,de,Peng! Du bist tot!,,...,0.0,0.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,,Peng! Du bist tot!,False,0.0,0.0,hacker computer murder
9328,False,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,27324,tt0286751,ja,回路,"After one of their friends commits suicide, st...",...,0.0,118.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Pulse,False,6.1,89.0,suicide computer internet student
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,False,"{'id': 257571, 'name': 'C.H.U.D. Collection', ...",0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,53150,tt0097001,en,C.H.U.D. II: Bud the Chud,A military experiment to create a race of supe...,...,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,This C.H.U.D.'s for you!,C.H.U.D. II: Bud the Chud,False,3.5,25.0,monster experiment mutant mutation halloween b...
6234,False,,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...",,10720,tt0309530,en,Down with Love,"In 1962 New York City, love blossoms between a...",...,20298207.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The ultimate catch has met his match,Down with Love,False,6.1,202.0,sex journalist empowerment womanizer playboy f...
3708,False,"{'id': 166376, 'name': 'Hollow Man Collection'...",95000000,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na...",,9383,tt0164052,en,Hollow Man,"Cocky researcher, Sebastian Caine is working o...",...,190213455.0,112.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,What would you do if you knew you couldn't be ...,Hollow Man,False,5.6,645.0,killing human experimentation scientist invisi...
20458,False,"{'id': 133352, 'name': 'Resident Evil: Biohaza...",0,"[{'id': 16, 'name': 'Animation'}]",http://www.sonypictures.com/homevideo/resident...,133121,tt1753496,en,Biohazard: Damnation,U.S. federal agent Leon S. Kennedy sneaks into...,...,0.0,100.0,"[{'iso_639_1': 'ja', 'name': '日本語'}, {'iso_639...",Released,The worst evil has been unleashed.,Resident Evil: Damnation,False,6.3,186.0,dystopia blood zombie based on video game adul...
