In [6]:
from opensearchpy import OpenSearch, RequestsHttpConnection,  AWSV4SignerAuth
import boto3
import pandas as pd

In [7]:
region = 'eu-west-2'
service = 'aoss'
credentials = boto3.Session(profile_name='vrt-analytics-engineer-nonsensitive').get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

vectorstore = OpenSearch(
   hosts = [{'host': "epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com", 'port': 443}],
   http_auth = auth,
   use_ssl = True,
   verify_certs = True,
   connection_class = RequestsHttpConnection,
   pool_maxsize = 20
)

In [3]:
credentials

<botocore.credentials.DeferredRefreshableCredentials at 0x7ac814095e70>

In [8]:
# Create index
index_body = {
  'settings': {
    "index.knn": True
  },
  "mappings": {
    "properties": {
      "vrtmax_catalog_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
          "engine": "faiss",
          "name": "hnsw",
          "space_type": "l2"
        }
      },
    "mediacontent_page_description_program": {"type": "text"},
    "mediacontent_page_description": {"type": "text"},
    "mediacontent_page_editorialtitle_program": {"type": "text"},
    "mediacontent_pagetitle_program": {"type": "text"},
    "mediacontent_pagetitle_season": {"type": "text"},
    "mediacontent_pagetitle": {"type": "text"},
    "offering_publication_planneduntil": {"type": "text"},
    "brand_contentbrand": {"type": "text"},
    "mediacontent_pageurl": {"type": "text"},
    "mediacontent_imageurl": {"type": "text"},
    "mediacontent_programimageurl": {"type": "text"},
    "mediacontent_episode_castlist": {"type": "nested"},
    "mediacontent_media_content_type": {"type": "text"},
    "mediacontent_page_editorialtags": {"type": "nested"},
    "info_to_embed": {"type": "text"}
    }
  }
}

response = vectorstore.indices.create('vrtmax-catalog-index-2', body=index_body)

RequestError: RequestError(400, 'resource_already_exists_exception', 'OpenSearch exception [type=resource_already_exists_exception, reason=index [vrtmax-catalog-index/-esv3ZEBsTPNrKd5BweK] already exists]- server : [envoy]')

In [38]:
# vectorstore.indices.delete('aoss-index')

{'acknowledged': True}

In [9]:
# embed stuff and write to opensearch
from langchain_huggingface import HuggingFaceEmbeddings
model_name = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [13]:
from langchain.vectorstores import OpenSearchVectorSearch


vector = OpenSearchVectorSearch(
  embedding_function = embeddings,
  index_name = 'vrtmax-catalog-index-2',
  http_auth = auth,
  use_ssl = True,
  verify_certs = True,
  connection_class = RequestsHttpConnection,
  opensearch_url="https://epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com"
)

In [14]:
df = pd.read_csv("vrtmax_catalog.csv")
df.fillna('', inplace=True)

In [15]:
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter

loader = DataFrameLoader(df, page_content_column="info_to_embed")
catalog = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
catalog_chunks = text_splitter.split_documents(catalog)

In [16]:
vector.add_documents(documents=catalog_chunks,vector_field = "vrtmax_catalog_vector",bulk_size=len(df)+1)

['f368f8d3-d76a-4e83-86a4-a979196de2de',
 '7aae17e5-29d7-41c0-b76e-0dd50aee8ff5',
 'febe192a-abe5-43b6-98cb-9cdcfa415a07',
 'a731419a-2a93-4679-aee2-9e1bc7e44e5b',
 'aa686a5c-017d-4d9e-9a4e-bf99ec9f40cc',
 'c1fbaa99-7592-43b2-952a-70e419c1f917',
 'ed85d37e-ebc8-424f-9471-fb06d141e5e0',
 '523a9177-b688-4f30-a3db-375b6065dda8',
 'f80f45cf-65e8-4d81-8fa4-d2e6133b293f',
 '89384169-041d-441e-9551-661930514cba',
 'ce0f33b9-18c7-4268-9f4b-641bcb564b6c',
 'ed49f2aa-4162-4761-88ad-148f4473d772',
 '470cb606-bf93-423e-9c0f-a4827f8366e9',
 '7c88f3df-1754-4769-911c-6c9a5ce59ce5',
 'e0963e28-df3a-473f-961d-c17263e57736',
 'fc7d2aad-3ffc-41d8-b413-1ad0cc22ad3d',
 '5e5cfc80-b610-4f76-9017-dc3f7ca2290d',
 '28495fd4-4d6e-4457-be31-2af4d9b6ac33',
 '9bcfb30a-0ae3-4b00-a4d9-512ab0682ed2',
 '04b9674d-2c96-48d9-8417-b8e9a48151d6',
 'f48beeff-e5b5-4d3d-a708-c97c6dc1ae13',
 '9894c592-6f12-4d3b-86d9-986a3ebc4ad4',
 '36cd5220-f2cd-4d2f-bd43-29692b7401c0',
 '74155b6c-6bcc-4eef-8309-f7ad3beeab9b',
 '157b65c1-4a7e-