In [1]:
from opensearchpy import OpenSearch, RequestsHttpConnection,  AWSV4SignerAuth
import boto3
import pandas as pd


In [2]:
# --- Connection to the Amazon OpenSearch Serverless ("aoss") collection ---
# Requests are SigV4-signed with credentials read from a named local AWS profile.
region = "eu-west-2"
service = "aoss"
credentials = boto3.Session(
    profile_name="vrt-analytics-engineer-nonsensitive"
).get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

# Low-level opensearch-py client; later cells use it for index administration.
vectorstore = OpenSearch(
    hosts=[dict(host="epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com", port=443)],
    http_auth=auth,
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
    pool_maxsize=20,
)

In [39]:
# Create the k-NN index. The creation call is guarded so that re-running this
# cell does not fail with "resource already exists" once the index is in place.
INDEX_NAME = 'aoss-index'

index_body = {
    "settings": {
        "index.knn": True
    },
    "mappings": {
        "properties": {
            # Vector field searched with HNSW (faiss engine, L2 distance).
            # `dimension` must equal the output size of the embedding model.
            "vrtmax_catalog_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "method": {
                    "engine": "faiss",
                    "name": "hnsw",
                    "space_type": "l2"
                }
            },
            # Catalog columns, all mapped as free text.
            # NOTE(review): LangChain's OpenSearchVectorSearch appears to write
            # page content under `text` and the remaining columns under a nested
            # `metadata` object, so these top-level mappings may not apply to the
            # documents indexed further down -- confirm against the stored docs.
            "mediacontent_page_description_program": {"type": "text"},
            "mediacontent_page_description": {"type": "text"},
            "mediacontent_page_editorialtitle_program": {"type": "text"},
            "mediacontent_pagetitle_program": {"type": "text"},
            "mediacontent_pagetitle_season": {"type": "text"},
            "mediacontent_pagetitle": {"type": "text"},
            "offering_publication_planneduntil": {"type": "text"},
            "brand_contentbrand": {"type": "text"},
            "mediacontent_pageurl": {"type": "text"},
            "mediacontent_imageurl": {"type": "text"},
            "mediacontent_programimageurl": {"type": "text"},
            "info_to_embed": {"type": "text"}
        }
    }
}

if vectorstore.indices.exists(index=INDEX_NAME):
    # Index already present: leave it untouched (delete it first to change the mapping).
    response = None
else:
    response = vectorstore.indices.create(index=INDEX_NAME, body=index_body)

In [38]:
# Manual reset (destructive): uncomment to drop the index before re-creating it.
# vectorstore.indices.delete('aoss-index')

{'acknowledged': True}

In [40]:
# Sentence-embedding model used to vectorise catalog text before it is written
# to OpenSearch. Vectors are left un-normalised.
# NOTE(review): the index mapping declares dimension 768 -- confirm this model
# produces vectors of that size.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

model_name = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)

In [41]:
from langchain.vectorstores import OpenSearchVectorSearch

# LangChain vector-store wrapper pointing at the same collection host and index
# as the low-level client; it embeds text with `embeddings` and signs requests
# with the shared SigV4 `auth` object.
vector = OpenSearchVectorSearch(
    opensearch_url="https://epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com",
    index_name="aoss-index",
    embedding_function=embeddings,
    http_auth=auth,
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
)

In [42]:
# Load the catalog export and replace every missing value with an empty string.
df = pd.read_csv("vrtmax_catalog.csv").fillna('')

In [44]:
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter

# Turn each DataFrame row into a Document whose page content is the
# `info_to_embed` column (the recorded output shows the remaining columns as
# metadata), then split the documents with a 1000-character, no-overlap splitter.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
loader = DataFrameLoader(df, page_content_column="info_to_embed")

catalog = loader.load()
catalog_chunks = text_splitter.split_documents(catalog)

In [36]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
catalog_chunks[:3]

[Document(metadata={'mediacontent_page_description_program': 'Superworm is een superlange, supersterke regenworm die insecten en dieren in nood altijd te hulp schiet. Maar wie zal Superworm helpen als hij wordt gevangengenomen door een gemene tovenaarshagedis?', 'mediacontent_page_description': 'Superworm is een superlange, supersterke regenworm die insecten en dieren in nood altijd te hulp schiet. Maar wie zal Superworm helpen als hij wordt gevangengenomen door een gemene tovenaarshagedis?', 'mediacontent_page_editorialtitle_program': 'Superworm is een lange, sterke regenworm die insecten in noodhelpt', 'mediacontent_pagetitle_program': 'Superworm', 'mediacontent_pagetitle_season': '2022', 'mediacontent_pagetitle': 'Superworm', 'offering_publication_planneduntil': '2027-03-30 21:55:00.000', 'brand_contentbrand': 'ketnet', 'mediacontent_pageurl': 'https://www.vrt.be/vrtmax/a-z/superworm/2022/superworm/', 'mediacontent_imageurl': 'https://images.vrt.be/orig/2023/03/24/ca44b6d5-c9ce-11ed

In [45]:
# Embed every chunk and bulk-index it into the k-NN field of the index.
# `bulk_size` has to cover the number of documents actually sent. That number is
# the chunk count, which can exceed the row count of `df` once documents are
# split, so it is derived from `catalog_chunks` rather than from `df`.
vector.add_documents(
    documents=catalog_chunks,
    vector_field="vrtmax_catalog_vector",
    bulk_size=len(catalog_chunks),
)

['a36b528f-6bba-4b73-9ba5-6ca55153d8f3',
 '48d39a15-13f1-458e-97b5-890883c22b39',
 'a5ce3e28-a76b-43f2-9c96-08786dcedb54']

In [24]:
# Spot-check: season title of the second row in the catalog.
df["mediacontent_pagetitle_season"].iloc[1]

'Seizoen 2023'