In [None]:
import os
import re
from datetime import datetime, timedelta
from datetime import timezone
from urllib.parse import quote

import streamlit as st
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, generate_blob_sas
from azure.storage.blob import BlobSasPermissions
from azure.storage.blob import ContentSettings
from dotenv import load_dotenv
from openai import AzureOpenAI


Update the container environment variables (for example `CONTAINER_NAME` in your `.env` file) to match the course data before running the cells below.

In [None]:
# Load environment variables from .env file
load_dotenv()

# Set up Azure Blob Storage credentials
AZURE_CONNECTION_STRING = os.getenv("AZURE_CONNECTION_STRING")
AZURE_ACCOUNT_NAME = os.getenv("AZURE_ACCOUNT_NAME")
AZURE_ACCOUNT_KEY = os.getenv("AZURE_ACCOUNT_KEY")
CONTAINER_NAME = os.getenv("CONTAINER_NAME")

# Set up Azure Document Intelligence credentials
FORM_RECOGNIZER_ENDPOINT = os.getenv("AZURE_FORM_RECOGNIZER_ENDPOINT")
FORM_RECOGNIZER_KEY = os.getenv("AZURE_FORM_RECOGNIZER_KEY")

# Set up Azure OpenAI credentials
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT_NAME")

# Set up Azure AI Search credentials
SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
SEARCH_INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME")
SEARCH_API_KEY = os.getenv("AZURE_SEARCH_API_KEY")

# Fail fast: os.getenv() returns None for an unset variable, which would only
# surface later as an obscure error inside one of the SDK calls. Every setting
# above is used by this notebook, so report all missing names at once.
_REQUIRED_SETTINGS = {
    "AZURE_CONNECTION_STRING": AZURE_CONNECTION_STRING,
    "AZURE_ACCOUNT_NAME": AZURE_ACCOUNT_NAME,
    "AZURE_ACCOUNT_KEY": AZURE_ACCOUNT_KEY,
    "CONTAINER_NAME": CONTAINER_NAME,
    "AZURE_FORM_RECOGNIZER_ENDPOINT": FORM_RECOGNIZER_ENDPOINT,
    "AZURE_FORM_RECOGNIZER_KEY": FORM_RECOGNIZER_KEY,
    "AZURE_OPENAI_ENDPOINT": AZURE_OPENAI_ENDPOINT,
    "AZURE_OPENAI_API_KEY": AZURE_OPENAI_KEY,
    "AZURE_OPENAI_EMB_DEPLOYMENT_NAME": AZURE_OPENAI_DEPLOYMENT_NAME,
    "AZURE_SEARCH_ENDPOINT": SEARCH_ENDPOINT,
    "AZURE_SEARCH_INDEX_NAME": SEARCH_INDEX_NAME,
    "AZURE_SEARCH_API_KEY": SEARCH_API_KEY,
}
_missing_settings = [name for name, value in _REQUIRED_SETTINGS.items() if not value]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

In [None]:
# Build one client per Azure service from the settings loaded above.

# Blob Storage: authenticated through the connection string.
blob_service_client = BlobServiceClient.from_connection_string(
    conn_str=AZURE_CONNECTION_STRING,
)

# Document Intelligence (Form Recognizer): endpoint + API key.
document_analysis_client = DocumentAnalysisClient(
    endpoint=FORM_RECOGNIZER_ENDPOINT,
    credential=AzureKeyCredential(FORM_RECOGNIZER_KEY),
)

# Azure OpenAI: the REST API version is pinned explicitly.
openai_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
    api_version="2023-05-15",
)

# Azure AI Search: bound to a single index.
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=SEARCH_INDEX_NAME,
    credential=AzureKeyCredential(SEARCH_API_KEY),
)

In [None]:
# Function to sanitize document keys
def sanitize_key(key):
    """
    Return `key` with every character outside letters, digits, `_`, `=`
    and `-` replaced by an underscore.

    Parameters:
    - key (str): Raw key text

    Returns:
    - str: Key containing only the allowed characters
    """
    disallowed = re.compile(r"[^a-zA-Z0-9_=-]")
    return disallowed.sub("_", key)

In [None]:
from azure.storage.blob import generate_blob_sas, BlobSasPermissions
import os
from datetime import datetime, timedelta
from azure.storage.blob import BlobServiceClient

def upload_to_blob_storage(file_path, expiry_days=365):
    """
    Uploads a file to Azure Blob Storage and generates a read-only SAS URL.

    The blob is named after the file's base name, uploaded with the
    'application/pdf' content type, and overwrites any existing blob of the
    same name. The container is created first if it does not exist.

    Parameters:
    - file_path (str): Local file path
    - expiry_days (int): Lifetime of the SAS token in days (default 365)

    Returns:
    - tuple: (SAS URL, document_name) if successful, else (None, None).
      Any exception is caught, printed, and reported as (None, None).
    """
    try:
        # Extract file name from the path
        document_name = os.path.basename(file_path)

        # Get the target container, creating it on first use
        container_client = blob_service_client.get_container_client(CONTAINER_NAME)
        if not container_client.exists():
            container_client.create_container()

        blob_client = container_client.get_blob_client(document_name)

        # Upload the file
        with open(file_path, "rb") as file_data:
            blob_client.upload_blob(
                file_data,
                overwrite=True,
                content_settings=ContentSettings(content_type='application/pdf'),
            )

        print(f"‚úÖ File '{document_name}' uploaded successfully.")

        # Timezone-aware expiry; datetime.utcnow() is deprecated and naive.
        expiry_time = datetime.now(timezone.utc) + timedelta(days=expiry_days)

        sas_token = generate_blob_sas(
            account_name=AZURE_ACCOUNT_NAME,
            container_name=CONTAINER_NAME,
            blob_name=document_name,  # the SAS is computed over the raw (unencoded) name
            account_key=AZURE_ACCOUNT_KEY,
            # Read is all a consumer of the URL needs; "list" is a
            # container-level permission and is not valid for a blob SAS.
            permission=BlobSasPermissions(read=True),
            expiry=expiry_time,
        )

        # Percent-encode the blob name so names containing spaces, '#', '?'
        # etc. still produce a valid URL.
        blob_url = (
            f"https://{AZURE_ACCOUNT_NAME}.blob.core.windows.net/"
            f"{CONTAINER_NAME}/{quote(document_name)}"
        )
        sas_url = f"{blob_url}?{sas_token}"
        print(f"‚úÖ SAS URL generated successfully! Expires on: {expiry_time}")
        # The SAS token is a bearer secret: never echo it into notebook output.
        print(f"‚úÖ SAS URL: {blob_url}?<redacted>")

        return sas_url, document_name

    except Exception as e:
        print(f"‚ùå Error uploading file: {e}")
        return None, None

In [None]:
def analyze_layout(document_url):
    """
    Extract text from a document using Azure Form Recognizer.

    Runs the "prebuilt-layout" model against the document at `document_url`
    and waits for the analysis to finish.

    Parameters:
    - document_url (str): URL of the document to analyze (e.g. a SAS URL)

    Returns:
    - The analysis result from the poller, or None if any error occurred
      (the error is printed).
    """
    try:
        # Log only the part before the query string: for a SAS URL the query
        # carries the access token, which must not end up in notebook output.
        loggable_url = str(document_url).split("?", 1)[0]
        print(f"üîπ Debug: Trying to analyze document from URL: {loggable_url}")
        poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-layout", document_url)
        return poller.result()
    except Exception as e:
        print(f"‚ùå Error analyzing document layout: {e}")
        return None

In [None]:
# Function to generate embeddings using Azure OpenAI
def generate_embedding(text):
    """
    Embed `text` with the configured Azure OpenAI embedding deployment.

    Parameters:
    - text (str): Text to embed

    Returns:
    - The embedding of the first item in the response, or None when the
      request (or reading the response) raises; the error is printed.
    """
    try:
        api_result = openai_client.embeddings.create(
            model=AZURE_OPENAI_DEPLOYMENT_NAME,
            input=text,
        )
        first_item = api_result.data[0]
        vector = first_item.embedding
    except Exception as err:
        print(f"‚ùå Error generating embedding: {err}")
        return None
    return vector

In [None]:
def process_and_store_document(document_url, document_name):
    """
    Process a document, extract text, generate embeddings, and store in Azure AI Search
    along with the SAS URL.

    One search document is uploaded per page that has text. A page is counted
    as failed when its embedding could not be generated or when the search
    service reports that the upload did not succeed; pages without any text
    are skipped. Uploading is keyed by document name and page number, so the
    function can be re-run after a partial failure.

    Parameters:
    - document_url (str): The SAS URL of the document.
    - document_name (str): Name of the document.

    Returns:
    - str: Success or error message.
    """
    try:
        # Step 1: Analyze the document layout (extract text)
        result = analyze_layout(document_url)
        if not result:
            return "‚ùå Failed to analyze document."

        failed_pages = []

        # Step 2: Iterate through pages and process each
        for page in result.pages:
            page_text = " ".join(line.content for line in page.lines)
            if not page_text.strip():
                # Blank page: there is nothing to embed or search.
                continue

            embedding = generate_embedding(page_text)
            if embedding is None:
                # generate_embedding() signals failure with None; indexing the
                # page without a vector would hide the problem.
                failed_pages.append(page.page_number)
                continue

            # Sanitize the document key
            document_key = sanitize_key(f"{document_name}_{page.page_number}")

            # Step 3: Construct the document for Azure AI Search
            document = {
                "id": document_key,
                "document_name": document_name,
                "page_number": page.page_number,
                "content": page_text,
                "embedding": embedding,
                "sas_url": document_url,
            }

            # Step 4: Upload the document to Azure AI Search. A rejected
            # document is reported in the returned results rather than raised,
            # so the per-document status has to be checked explicitly.
            indexing_results = search_client.upload_documents(documents=[document])
            if not all(item.succeeded for item in indexing_results):
                failed_pages.append(page.page_number)

        # Report the index size once per document instead of once per page.
        print("üîπ Debug: Checking Azure Search Index Status")
        print(search_client.get_document_count())

        if failed_pages:
            return (
                "‚ùå Error processing and storing document: "
                f"could not index page(s) {failed_pages} of '{document_name}'"
            )

        return f"‚úÖ Document '{document_name}' processed and stored successfully!"

    except Exception as e:
        return f"‚ùå Error processing and storing document: {e}"

In [None]:
import os

# Folder containing the PDFs to ingest. Set the PDF_FOLDER_PATH environment
# variable to point somewhere else; the previous hard-coded location is kept
# as the default so existing setups keep working.
folder_path = os.getenv(
    "PDF_FOLDER_PATH",
    "/Users/aakashwalavalkar/Desktop/OSHA-safety-insight/OSHA",
)

# Iterate through each PDF in the folder. os.listdir() returns entries in
# arbitrary order, so sort them to make runs (and their logs) reproducible.
for filename in sorted(os.listdir(folder_path)):
    if filename.lower().endswith(".pdf"):  # Process only PDF files
        file_path = os.path.join(folder_path, filename)

        # Upload and get SAS URL
        sas_url, document_name = upload_to_blob_storage(file_path)

        if sas_url:
            # Extract, embed and index the uploaded document page by page
            result = process_and_store_document(sas_url, document_name)
            print(f"Processed {filename}: {result}")
        else:
            print(f"Failed to upload {filename}")