In [1]:
from opensearchpy import OpenSearch, RequestsHttpConnection,  AWSV4SignerAuth
import boto3
import pandas as pd


In [2]:
# Target AWS region and the SigV4 service name ('aoss') used to sign requests.
region = 'eu-west-2'
service = 'aoss'

# Credentials are taken from a named profile in the local AWS configuration.
credentials = boto3.Session(
    profile_name='vrt-analytics-engineer-nonsensitive'
).get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

# Low-level OpenSearch client; used further down to create the index.
vectorstore = OpenSearch(
    hosts=[dict(host="epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com", port=443)],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    pool_maxsize=20,
)

In [39]:
# Create the k-NN index, unless it already exists.
#
# The existence check makes this cell safe to re-run (e.g. after a kernel
# restart): calling indices.create() on an existing index raises an error.
index_name = 'aoss-index'

index_body = {
    'settings': {
        # Enable k-NN search on this index.
        "index.knn": True
    },
    "mappings": {
        "properties": {
            # Vector field that receives the embeddings; "dimension" has to
            # match the output size of the embedding model used at ingestion.
            "vrtmax_catalog_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "method": {
                    "engine": "faiss",
                    "name": "hnsw",
                    "space_type": "l2"
                }
            },
            # NOTE(review): the text fields below are named after catalog
            # columns; confirm they match the document layout that the
            # ingestion step actually writes.
            "mediacontent_page_description_program": {"type": "text"},
            "mediacontent_page_description": {"type": "text"},
            "mediacontent_page_editorialtitle_program": {"type": "text"},
            "mediacontent_pagetitle_program": {"type": "text"},
            "mediacontent_pagetitle_season": {"type": "text"},
            "mediacontent_pagetitle": {"type": "text"},
            "offering_publication_planneduntil": {"type": "text"},
            "brand_contentbrand": {"type": "text"},
            "mediacontent_pageurl": {"type": "text"},
            "mediacontent_imageurl": {"type": "text"},
            "mediacontent_programimageurl": {"type": "text"},
            "info_to_embed": {"type": "text"}
        }
    }
}

if vectorstore.indices.exists(index=index_name):
    # Index is already there; nothing was created on this run.
    response = None
else:
    response = vectorstore.indices.create(index=index_name, body=index_body)

In [38]:
# Maintenance only: uncomment to drop the index (e.g. before recreating it with a new mapping).
# vectorstore.indices.delete('aoss-index')

{'acknowledged': True}

In [40]:
# Embedding model used to turn catalog text into vectors before writing to OpenSearch.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

model_name = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
# Ask the encoder not to normalise the embeddings it returns.
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)

In [41]:
from langchain.vectorstores import OpenSearchVectorSearch

# LangChain vector-store wrapper for the 'aoss-index' index. It reuses the
# SigV4 auth object and the embedding model set up in the earlier cells.
vector = OpenSearchVectorSearch(
    opensearch_url="https://epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com",
    index_name='aoss-index',
    embedding_function=embeddings,
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [46]:
# Load the catalog export and replace missing values with empty strings.
df = pd.read_csv("vrtmax_catalog.csv").fillna('')

In [47]:
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter

# Loader: builds documents from df, using the "info_to_embed" column as page content.
# Splitter: breaks documents into chunks (chunk_size=1000, no overlap).
loader = DataFrameLoader(df, page_content_column="info_to_embed")
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

catalog = loader.load()
catalog_chunks = text_splitter.split_documents(catalog)

In [49]:
# Embed the chunks and write them to the index, storing vectors in "vrtmax_catalog_vector".
# bulk_size is derived from the chunk list rather than from df: the splitter
# can emit more chunks than there are rows, and the vector store rejects a
# call whose document count exceeds bulk_size.
vector.add_documents(
    documents=catalog_chunks,
    vector_field="vrtmax_catalog_vector",
    bulk_size=len(catalog_chunks),
)

['582d584b-afbc-4460-ac20-b7969eb52e0e',
 '5f871a2d-4310-44df-ae86-6f088fd667d1',
 '6cc7c6f9-4b58-4429-bb08-34387375eb01',
 '3cb7e092-35b5-4a6d-9bd6-eff724929a3a',
 '56223730-274e-4922-84fa-f13758a5e17b',
 'e5300068-edb8-489d-9a26-8c16a3bc0281',
 '6494b6b1-272f-4275-b1f8-a9572a470211',
 '4861f076-af30-4b83-8cef-0501f684f310',
 '8cda68b3-23db-499b-9c1a-d81ad2aeaf46',
 'b5f5ec93-e8ac-4eb9-81bd-673241aa55b2',
 '774b385d-47bf-49c7-b26c-07ab76648e39',
 '2d32b7fb-753c-4366-9e53-d9ee8bd68692',
 '2cc540d7-0c5c-47e9-951b-5a3ffb34f1db',
 'd8d581bc-cb90-44eb-be2c-395c478338b5',
 '60c76e05-2998-4711-b79a-3ac112c22b4a',
 '6538cdaa-b12e-492e-83e9-007fca1180e2',
 '98698b88-3a48-4fb4-93f9-ca2722556665',
 'e389bf5b-7eb7-43ab-b00e-2d4797b8bace',
 'e0ddda7d-2903-4b94-99de-928b91a0f51c',
 '5a483ae1-44e8-491f-889e-b7e1ccfd3cab',
 '644cd0fb-8926-42b4-8aad-d0e8f68766ae',
 '5fcee771-873f-47a0-b616-bc7a4b1a6c82',
 'dbf804bf-c8b5-489f-ad43-b3f195f24ef1',
 'd911b1bb-260e-4560-b0a5-1171c07611d9',
 'f3a16156-1fd0-