In [15]:
from opensearchpy import OpenSearch, RequestsHttpConnection,  AWSV4SignerAuth
import boto3
import pandas as pd


In [10]:
# Build request-signing auth for OpenSearch Serverless ('aoss') from a named local AWS profile.
service = 'aoss'
region = 'eu-west-2'
session = boto3.Session(profile_name='vrt-analytics-engineer-nonsensitive')
credentials = session.get_credentials()
auth = AWSV4SignerAuth(credentials, region, service)

# Low-level client for the collection endpoint: HTTPS on port 443 with certificate verification.
collection_endpoint = {'host': "epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com", 'port': 443}
vectorstore = OpenSearch(
    hosts=[collection_endpoint],
    connection_class=RequestsHttpConnection,
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    pool_maxsize=20,
)

In [29]:
# Create the k-NN index. Safe to re-run: if the index already exists it is left
# untouched and its current definition is fetched instead of raising.
from opensearchpy.exceptions import RequestError

index_name = 'aoss-index'

index_body = {
    'settings': {
        # Enable k-NN search on this index.
        'index.knn': True,
    },
    'mappings': {
        'properties': {
            # Vector field that the ingestion step writes embeddings into.
            'vrtmax_catalog_vector': {
                'type': 'knn_vector',
                # NOTE(review): presumably the output size of the embedding model
                # used for ingestion -- confirm before changing the model.
                'dimension': 768,
                'method': {
                    'engine': 'faiss',
                    'name': 'hnsw',
                    'space_type': 'l2',
                },
            },
        },
    },
}

try:
    # Pass `index` by keyword so the call does not depend on positional order.
    response = vectorstore.indices.create(index=index_name, body=index_body)
except RequestError as err:
    # Only swallow the "index already exists" case; re-raise every other 400.
    if err.error != 'resource_already_exists_exception':
        raise
    response = vectorstore.indices.get(index=index_name)

In [13]:
# Embedding model used to vectorise documents before they are written to OpenSearch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

model_name = "NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers"
# Encoder options forwarded to the model: embeddings are NOT normalised.
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(encode_kwargs=encode_kwargs, model_name=model_name)

  from tqdm.autonotebook import tqdm, trange


In [30]:
from langchain.vectorstores import OpenSearchVectorSearch

# LangChain vector-store wrapper around the 'aoss-index' index, reusing the signed
# auth object and the embedding model defined in earlier cells.
vector = OpenSearchVectorSearch(
    opensearch_url="https://epcavlvwitam2ivpwv4k.eu-west-2.aoss.amazonaws.com",
    index_name='aoss-index',
    embedding_function=embeddings,
    connection_class=RequestsHttpConnection,
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
)

In [23]:
# Load the catalogue export; the path is resolved relative to the working directory.
catalog_csv = "vrtmax_catalog.csv"
df = pd.read_csv(catalog_csv)

In [24]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DataFrameLoader

# Turn each DataFrame row into a document whose text comes from the
# "info_to_embed" column.
loader = DataFrameLoader(df, page_content_column="info_to_embed")
catalog = loader.load()

# Split the documents into chunks (target size 1000, no overlap between chunks).
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
catalog_chunks = text_splitter.split_documents(catalog)

In [32]:
# Embed every chunk and write it to the index's vector field.
# bulk_size is taken from the chunk list itself rather than the DataFrame:
# splitting can produce more chunks than there are rows, and LangChain refuses
# to ingest when the number of documents exceeds bulk_size.
vector.add_documents(
    documents=catalog_chunks,
    vector_field="vrtmax_catalog_vector",
    bulk_size=len(catalog_chunks),
)