In [5]:
import os
import json
import uuid
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    CorsOptions,
    SearchIndex,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchField,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
)
import re

In [None]:
# Service credentials are taken from the environment so that no secret has to
# be pasted into (and accidentally committed with) the notebook. Both fall
# back to an empty string, matching the previous placeholder values.
admin_key = os.environ.get("AZURE_SEARCH_API_KEY", "")
endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT", "")

In [None]:
# Name of the search index this notebook creates and populates.
index_name = 'ct_miner_multi_lingual_index'

# Index schema: a string key, two searchable + filterable text fields, and a
# 1536-dimensional float vector field bound to the "my-vector-config"
# vector-search configuration.
# NOTE(review): `content` is declared filterable; long chunk text in a
# filterable string field may run into the service's per-term size limit --
# confirm this attribute is really needed.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
    ),
    SearchableField(
        name="pdf_name",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration="my-vector-config",
    ),
]

# CORS: every origin is allowed; max age is set to 60 seconds.
cors_options = CorsOptions(
    allowed_origins=["*"],
    max_age_in_seconds=60,
)

In [None]:
# HNSW algorithm configuration; its name must match the
# `vector_search_configuration` set on the `content_vector` field.
hnsw_algorithm = VectorSearchAlgorithmConfiguration(
    name='my-vector-config',
    kind='hnsw',
    hnsw_parameters={
        "m": 4,
        "efConstruction": 400,
        "efSearch": 500,
        "metric": 'cosine',
    },
)

# Vector-search section of the index definition (a single algorithm).
vector_search = VectorSearch(algorithm_configurations=[hnsw_algorithm])

In [None]:
# Full index definition: name, field schema, CORS options and vector-search setup.
index = SearchIndex(
    name=index_name, fields=fields,
    cors_options=cors_options, vector_search=vector_search,
)

In [None]:
# A single credential object is shared by both clients.
credential = AzureKeyCredential(admin_key)

# Service-level client used to create/update index definitions.
# SearchIndexClient is not bound to one index, so it takes no `index_name`
# argument (the index is identified by the SearchIndex object passed to it).
admin_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Index-level client used to upload and query documents in `index_name`.
search_client = SearchClient(
    endpoint=endpoint,
    index_name=index_name,
    credential=credential,
)

In [None]:
# Send the index definition to the search service; the call's return value is
# displayed as this cell's output.
admin_client.create_or_update_index(index)

In [None]:
# Directory holding the chunked JSON files (one file per source document).
data_dir = '../data/scraped_data/all_data_chunked/'

# Keep only the `.json` files -- os.listdir also returns stray entries such as
# `.ipynb_checkpoints` -- and sort them so batches are the same on every run.
files = sorted(name for name in os.listdir(data_dir) if name.endswith('.json'))

# Number of files whose chunks are sent in one upload request.
batch_size = 10

In [None]:
# Total number of chunks across all listed files.
count = 0

for file in files:
    # Read from the same directory the file names were listed from
    # (the previous path, ../data/all_chunked_data/, did not match it).
    with open(f"../data/scraped_data/all_data_chunked/{file}") as f:
        d = json.load(f)
    count += len(d['content'])

# Display the total as the cell output.
count

In [None]:
ids = set()   # every document key generated during this run
count = []    # number of documents built per batch

for start in range(0, len(files), batch_size):
    batch = files[start:start + batch_size]
    res = []
    for file in batch:
        with open(f'../data/scraped_data/all_data_chunked/{file}') as f:
            d = json.load(f)

        # Key prefix is the file name up to its first dot.
        stem = file.split('.')[0]

        # One search document per chunk: the file-level fields are copied and
        # `content` / `content_vector` are replaced by the j-th entries.
        for j, chunk_text in enumerate(d['content']):
            doc_id = f"{stem}_{j}"

            doc = d.copy()
            doc['id'] = doc_id
            doc['content'] = chunk_text
            doc['content_vector'] = d['content_vector'][j]
            res.append(doc)

            # Two files sharing a stem would silently overwrite each other.
            if doc_id in ids:
                print('error: duplicate document id', doc_id)
            ids.add(doc_id)

    # Nothing to send if none of the files in this batch had any chunks.
    if res:
        upload_results = search_client.upload_documents(documents=res)
        # Per-document failures are reported in the returned results rather
        # than raised, so surface them here instead of dropping them.
        rejected = [r.key for r in upload_results if not r.succeeded]
        if rejected:
            print(len(rejected), 'documents rejected:', rejected)

    count.append(len(res))
    print(len(res), 'done')

In [None]:
# Wildcard query that selects only the `id` field of each hit.
results = search_client.search(search_text="*", select=["id"])

# Keys of all documents returned by the query.
item_ids = []
for hit in results:
    item_ids.append(hit["id"])

In [None]:
# Keys found in the index, as a set for fast membership tests.
s = set(item_ids)

# Keys that were generated for upload but did not come back from the query.
failed_ids = [doc_key for doc_key in ids if doc_key not in s]

print(failed_ids)

In [None]:
# Rebuild the documents whose keys are listed in `failed_ids`.
res = []

for doc_id in failed_ids:
    # Keys were built as "<file stem>_<chunk index>". Split on the LAST
    # underscore so the stem is recovered intact even when it contains
    # underscores itself (splitting on '.' left the chunk suffix attached
    # and pointed at a file that does not exist).
    file, chunk_suffix = doc_id.rsplit('_', 1)
    chunk_id = int(chunk_suffix)
    print(doc_id, file, chunk_id)

    with open(f'../data/scraped_data/all_data_chunked/{file}.json') as f:
        d = json.load(f)

    a = d.copy()
    a['id'] = doc_id
    # Collapse every run of six or more spaces down to four spaces
    # (presumably to shrink documents that were rejected -- TODO confirm).
    a['content'] = re.sub(r'\ {6,}', '    ', d['content'][chunk_id])
    a['content_vector'] = d['content_vector'][chunk_id]
    res.append(a)

In [None]:
# Upload the documents currently held in `res` and keep the call's return
# value in `ans` for inspection.
ans=search_client.upload_documents(documents=res)