In [None]:
import os
from dotenv import load_dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.storage.blob import BlobServiceClient
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
from lib.common import create_search_index
from lib import utils


# Path to the dotenv file that holds the Azure settings read through
# os.environ / os.getenv in the cells that follow.
env_path = '.env'

# load_dotenv returns False when it found nothing to load (for example the file
# is missing or empty). Surface that immediately instead of letting the Azure
# clients fail later on settings that resolve to None.
if not load_dotenv(env_path):
    print(f"Warning: no environment variables were loaded from {env_path!r}.")

#### 1. Set up Azure clients and create AI Search index

In [None]:

# Admin connection to Azure AI Search. Both settings are required, so they are
# read with os.environ: a missing variable raises a KeyError that names it,
# rather than passing None into the client.
search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]
search_credential = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])
index_name = "ai-search-10k-reports"
search_index_client = SearchIndexClient(endpoint=search_endpoint,
                                        credential=search_credential)

# Build the index definition with the project helper.
# NOTE(review): the positional arguments look like (index name, Azure OpenAI
# endpoint, embedding deployment/model name, Azure OpenAI key) -- confirm
# against lib.common.create_search_index.
rts_searchindex = create_search_index(
    index_name,
    os.getenv("AZURE_OPENAI_ENDPOINT"),
    "text-embedding-ada-002",
    os.getenv("AZURE_OPENAI_API_KEY")
)

try:
    search_index_client.create_or_update_index(rts_searchindex)
    print("Created recursive text splitter index")

except Exception as e:
    print("Error creating recursive text splitter index")
    print(e)
    # The upload step later in the notebook targets this index, so stop here
    # instead of continuing a "Run All" against an index that may not exist.
    raise


In [None]:
# Authenticate against Azure Storage with DefaultAzureCredential.
credential = DefaultAzureCredential()

# The storage account URL is required; os.environ raises a KeyError naming the
# variable when it is missing instead of handing None to the client.
blob_service_client = BlobServiceClient(account_url=os.environ["AZURE_STORAGE_ACCOUNT_URL"],
                                        credential=credential)

# Container that holds the source documents.
container_name = "gbb-hackathon"
container_client = blob_service_client.get_container_client(container_name)

# list_blobs() returns a lazy, single-pass pager. Materialise it into a list so
# the processing cell can be re-run without silently iterating over an already
# exhausted iterator.
blobs = list(container_client.list_blobs())


In [None]:
# Azure OpenAI client; it is handed to the embedding helper in the processing step.
openai_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
)

#### 2. Process documents

In [None]:
for blob in blobs:
    # Guard clause: ignore anything that is not a PDF, or whose path already
    # exists under ./tmp (treated as "already processed").
    if not blob.name.lower().endswith('.pdf') or os.path.exists(f"./tmp/{blob.name}"):
        print(f"skipping {blob.name} as it is not a pdf or it has already been processed.")
        continue

    # Company label: the part of the file name before the first underscore, upper-cased.
    company = blob.name.rsplit("/", 1)[-1].partition("_")[0].upper()
    print(f"Processing {blob.name} for company: {company}")

    local_path = utils.download_blob_content(blob_service_client, container_name, blob.name, "./tmp")
    print(f"Parsing PDF at {local_path}...")

    pages = utils.parse_pdf(local_path)
    if len(pages) == 0:
        # Nothing was extracted from this document; move on to the next blob.
        print(f"Error parsing {local_path}. Skipping...")
        continue

    print(f"Splitting text for {company}...")
    chunks = utils.split_text(pages)

    print(f"Generating embeddings for {company}, nb chunks: {len(chunks)}...")
    embeddings = utils.generate_chunk_embeddings(openai_client, chunks)

    print(f"Uploading documents for {local_path}...")
    utils.upload_embeddings_to_search(search_index_client, index_name, embeddings, chunks, company, blob.name)

#### 3. Clean up

In [None]:
import shutil

# Local scratch folder used for the downloaded PDFs.
folder_path = "./tmp"

# Only remove the folder when it is actually there: it is absent when no PDF was
# downloaded or when this cell is re-run, and shutil.rmtree would otherwise
# raise FileNotFoundError.
if os.path.isdir(folder_path):
    shutil.rmtree(folder_path)
    print("The tmp folder has been deleted.")
else:
    print("The tmp folder does not exist; nothing to delete.")
