In [1]:
import pandas as pd
import numpy as np
import requests
import json
from opensearchpy import OpenSearch

In [10]:
df = pd.read_csv('../data/vgenome_sample_1k.csv', index_col=0)
df.head()

Unnamed: 0,image_id,url,tags
0,1,https://cs.stanford.edu/people/rak248/VG_100K_...,"['windows', 'car', 'man', 'trees', 'back', 'tr..."
1,2,https://cs.stanford.edu/people/rak248/VG_100K/...,"['sign', 'sidewalk', 'window', 'backpack', 'sn..."
2,3,https://cs.stanford.edu/people/rak248/VG_100K/...,"['photos', 'wireless phone', 'chain', 'compute..."
3,4,https://cs.stanford.edu/people/rak248/VG_100K/...,"['carpet', 'chair', 'cloths', 'colour', 'frame..."
4,5,https://cs.stanford.edu/people/rak248/VG_100K/...,"['windows', 'chair', 'colour', 'paper', 'cable..."


In [11]:
df.shape

(1081, 3)

In [12]:
import ast

df = df.rename(columns={'tags': 'tags_list'})

df['tags_list'] = df['tags_list'].apply(ast.literal_eval)

df['tags'] = df['tags_list'].apply(lambda x: " ".join(x))
df.head()

Unnamed: 0,image_id,url,tags_list,tags
0,1,https://cs.stanford.edu/people/rak248/VG_100K_...,"[windows, car, man, trees, back, tree trunk, a...",windows car man trees back tree trunk arm bike...
1,2,https://cs.stanford.edu/people/rak248/VG_100K/...,"[sign, sidewalk, window, backpack, sneakers, t...",sign sidewalk window backpack sneakers tree bu...
2,3,https://cs.stanford.edu/people/rak248/VG_100K/...,"[photos, wireless phone, chain, computer tower...",photos wireless phone chain computer tower hai...
3,4,https://cs.stanford.edu/people/rak248/VG_100K/...,"[carpet, chair, cloths, colour, frame, drape, ...",carpet chair cloths colour frame drape table t...
4,5,https://cs.stanford.edu/people/rak248/VG_100K/...,"[windows, chair, colour, paper, cables, color,...",windows chair colour paper cables color shelf ...


In [13]:
json_file_path = '../configs/schema.json'

with open(json_file_path, 'r') as file:
    schema = json.load(file)

new_settings = {
  "settings": {
    "index.knn": True,
    "default_pipeline": "tags-ingest-pipeline"
  }
}

new_mappings = {
  "mappings": {
    "properties": {
      "tag_embedding": {
        "type": "knn_vector",
        "dimension": 384,
        "method": {
          "engine": "lucene",
          "space_type": "l2",
          "name": "hnsw",
          "parameters": {}
        }
      }      
    }
  }
}

settings = {**new_settings['settings'], **schema['settings']}
properties = {**new_mappings['mappings']['properties'], **schema['mappings']['properties']}

schema['settings'] = settings
schema['mappings']['properties'] = properties
mr.JSON(schema, level=2)

In [14]:
url = "http://localhost:9200/tags_db"

headers = {
    'Content-Type': 'application/json'
}

payload = schema
response = requests.delete(url, headers=headers)

response = requests.put(url, headers=headers, data=json.dumps(payload))
mr.JSON(response.json(), level=4)

In [24]:
host = 'localhost'
port = 9200

client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # Enable HTTP compression
    use_ssl=False,       # Set to True if SSL is enabled on your cluster
    verify_certs=False   # Set to True if SSL certificates should be verified
)

# Indexing DataFrame into OpenSearch
index_name = 'tags_db'

docs = df[['image_id', 'tags']].to_dict(orient='records')

actions = [
    f'{{"index": {{"_index": "{index_name}", "_id": "{doc["tags"]}"}}}}\n{json.dumps(doc)}'
    for doc in docs
]

len(actions)

1081

In [26]:
actions[:2]

['{"index": {"_index": "tags_db", "_id": "windows car man trees back tree trunk arm bike clock sign pants van lamp post tree glasses jacket street shoes chin shirt wall headlight sidewalk building shade parking meter"}}\n{"image_id": 1, "tags": "windows car man trees back tree trunk arm bike clock sign pants van lamp post tree glasses jacket street shoes chin shirt wall headlight sidewalk building shade parking meter"}',
 '{"index": {"_index": "tags_db", "_id": "sign sidewalk window backpack sneakers tree building crosswalk pole car walk sign man street light bike road lights"}}\n{"image_id": 2, "tags": "sign sidewalk window backpack sneakers tree building crosswalk pole car walk sign man street light bike road lights"}']

In [28]:
import json

# Prepare JSON lines
json_lines = []
for _, row in df.iterrows():
    index_entry = {"index": {"_index": "tags_db", "_id": row["image_id"]}}
    tags_entry = {"tags": row["tags"]}
    json_lines.append(json.dumps(index_entry))
    json_lines.append(json.dumps(tags_entry))

# Join JSON lines into a single string
result = "\n".join(json_lines)

In [32]:
response = client.bulk(index=index_name, body=result)

In [34]:
url = "http://localhost:9200/tags_db/_search"

payload = {
  "query": {
    "match_all": {}
  },
  "track_total_hits": "true",
  "size": 0
}

response = requests.post(url, headers=headers, data=json.dumps(payload))
mr.JSON(response.json(), level=3)

In [36]:
df.shape[0], response.json()['hits']['total']['value']

(1081, 903)

For some reason, not all images are loaded, but that's fine.