In [None]:
import polars as pl
import numpy as np

In [None]:
# Base directory (relative path) for the data files this notebook reads and writes.
data_base_path = "../data"

## Import Original Dataset

In [None]:
# Load the original movie-plots dataset from Parquet into a Polars DataFrame.
raw_dataset_path = f"{data_base_path}/movies_plots_dataset.parquet"
movies_data = pl.read_parquet(raw_dataset_path)

In [None]:
# Preview the first five rows to sanity-check what was loaded.
movies_data.head(n=5)

## Calculating and Adding Plot Embeddings to the Dataset

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the same encoder is used further down to embed
# the search query, so plots and queries share one vector space.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every plot in a single encode() call, then convert the result to
# plain nested Python lists so it can be stored as a Polars column.
plot_texts = movies_data.get_column("Plot").to_list()
plot_vectors = encoder.encode(plot_texts).tolist()

In [None]:
# Attach the plot embeddings to the dataset as a "vector" column.
vector_column = pl.Series("vector", plot_vectors)
movies_data = movies_data.with_columns(vector_column)

In [None]:
# Persist the dataset together with its embeddings as Parquet.
embedded_dataset_path = f"{data_base_path}/movies_plots_dataset_embd_minilm.parquet"
movies_data.write_parquet(embedded_dataset_path)

## Create LanceDB Table

In [None]:
import lancedb

# Location of the LanceDB database, kept under the shared data directory.
uri = f"{data_base_path}/movies_embeddings"

# Connection handle used below to create the table.
db = lancedb.connect(uri=uri)

In [None]:
# Create the "movies" table from the embedded dataset. exist_ok=True is passed
# so that an already existing table does not make this cell fail on re-run.
movies_table = db.create_table(
    name="movies",
    data=movies_data,
    exist_ok=True,
)

In [None]:
# Build one native (non-tantivy) full-text-search index per text column that
# should be searchable. With use_tantivy=False each call indexes a single column.
#
# replace=True makes this cell re-runnable: the table is opened with
# exist_ok=True, so on a second run the indexes may already exist, and
# the default replace=False would not overwrite them.
for fts_column in ("Title", "Cast"):
    movies_table.create_fts_index(fts_column, use_tantivy=False, replace=True)

## Query Tests

In [None]:
# Sample query text used by the searches below.
query = "Star Wars"

# Embed the query with the same encoder that produced the plot vectors.
query_vector = encoder.encode(query)

In [None]:
# Pure vector search: the 10 rows whose vectors are closest to the query
# embedding, returning only the columns needed to eyeball the results.
(
    movies_table.search(query_vector)
    .limit(10)
    .select(["Title", "Director", "Plot"])
    .to_list()
)

In [None]:
# Hybrid search: combines vector similarity on the query embedding with a
# full-text match on the raw query string, returning the top 20 rows.
# NOTE(review): the text side presumably relies on the FTS indexes created
# on "Title" and "Cast" above — confirm which columns are actually searched.
(
    movies_table.search(query_type="hybrid")
    .vector(query_vector)
    .text(query)
    .limit(20)
    .select(["Title", "Director"])
    .to_list()
)