In [None]:
import pandas as pd

In [None]:
# Data
df = pd.read_csv("../data/test.csv")

# Postprocessing
# Concatenate description & subtitles into the text that will be embedded.
# `+` propagates NaN, so the plain concatenation is NaN whenever either column
# is missing, and the null filter below would then also drop rows that still
# have a description or a subtitle. Fall back to whichever field is present;
# rows that have both get exactly "<description> <subtitle>" as before.
df["info_to_embed"] = (
    (df["mediacontent_page_description"] + " " + df["subtitle"])
    .fillna(df["mediacontent_page_description"])
    .fillna(df["subtitle"])
)
# Only rows with neither a description nor a subtitle are left without text.
df = df[df["info_to_embed"].notna()]
# TODO: maybe remove "\n" and other annoying artefacts

In [None]:
# Embed
from langchain.document_loaders import DataFrameLoader
# CharacterTextSplitter is no longer used in this cell; the import is kept in
# case other cells rely on it.
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Keep the text to embed plus the columns that should accompany each document.
# NOTE(review): DataFrameLoader is expected to attach the non-content columns
# as document metadata — confirm against the installed langchain version.
df = df[["mediacontent_page_description","mediacontent_pagetitle_program","info_to_embed","mediacontent_pageid"]]

loader = DataFrameLoader(df, page_content_column="info_to_embed")
catalog = loader.load()

# RecursiveCharacterTextSplitter falls back through progressively finer
# separators, so chunk_size is actually enforced. CharacterTextSplitter only
# splits on its single separator ("\n\n" by default) and would leave text
# without blank lines as one oversized chunk.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
catalog_chunks = text_splitter.split_documents(catalog)

# Fail early with a readable message instead of an opaque error from inside
# the FAISS index construction when there is nothing to embed.
if not catalog_chunks:
    raise ValueError("No document chunks to embed: the catalog is empty.")

model_name = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
encode_kwargs = {'normalize_embeddings': False}
hf_embedding = HuggingFaceEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs
)
vector_store = FAISS.from_documents(documents=catalog_chunks,
                                    embedding=hf_embedding)

# index.ntotal is the number of vectors held by the FAISS index; use it rather
# than reaching into the docstore's private `_dict`.
print(f"The vectore store contains {vector_store.index.ntotal} documents chunks in total.")

In [None]:
# Persist the vector store to the 'faiss_vector_store' folder, resolved
# relative to the current working directory.
vector_store.save_local(folder_path='faiss_vector_store')