# Python Notebook on Data Ingestion to and Download from Azure Blob Storage

## Importing Libraries

In [None]:
#importing libraries
import os
from azure.storage.blob import BlobServiceClient, ContentSettings
from dotenv import load_dotenv
load_dotenv()

True

## Set environment and folder paths for Azure Blob Storage operations

In [None]:
# 🔧 REQUIRED: configuration for the Azure Blob Storage cells below

# Where the documents live locally, and the blob-name prefix they are uploaded
# under (a logical prefix only, not a real folder structure).
local_folder_path = "/Users/sharathsolomon/Desktop/SSAA Chatbot/Development/Documents"
target_folder = "Documents"

# Azure Storage connection string and target container name, read from the
# process environment (the .env file is loaded by load_dotenv() at import time).
# Either value is None when its variable is not set.
connection_string = os.environ.get("AZURE_CONNECTION_STRING")
container_name = os.environ.get("AZURE_CONTAINER_NAME")


## Functions to upload documents in Azure Blob Storage

In [None]:
def upload_to_documents_folder(connection_string, container_name, folder_path, target_folder):
    """
    Uploads all files from a local folder (recursively) to a specified virtual folder within an Azure Blob container.

    Args:
        connection_string (str): Azure Storage account connection string.
        container_name (str): Name of the blob container where files will be uploaded.
        folder_path (str): Local folder path containing the documents to upload.
        target_folder (str): Virtual folder path (prefix) inside the blob container.

    Raises:
        ValueError: If `connection_string` or `container_name` is empty or None
            (typically because the corresponding environment variable is not set).

    Notes:
        - Automatically creates the container if it does not exist. A failure to
          check or create the container is printed as a warning, not raised.
        - Determines content type from the file extension (case-insensitive),
          falling back to the standard `mimetypes` database and finally to
          "application/octet-stream".
        - Overwrites existing blobs with the same path.
        - A failed upload is printed and the remaining files are still processed.
        - If `folder_path` is not an existing directory, an error is printed and
          nothing is uploaded.
    """
    # Fail early with a clear message instead of an obscure SDK error.
    if not connection_string or not container_name:
        raise ValueError(
            "connection_string and container_name must be set "
            "(check AZURE_CONNECTION_STRING / AZURE_CONTAINER_NAME)."
        )

    # os.walk() silently yields nothing for a missing folder; say so explicitly.
    if not os.path.isdir(folder_path):
        print(f"❌ Local folder not found: {folder_path}")
        return

    # Local import keeps this cell runnable on its own.
    import mimetypes

    # Explicit table for the document formats this notebook handles, so the
    # result does not depend on the platform's mimetypes database.
    known_content_types = {
        ".pdf": "application/pdf",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".txt": "text/plain",
    }

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    # Create the container if it doesn't already exist. Problems here (bad
    # credentials, network) are reported instead of silently swallowed; the
    # per-file uploads below will then report their own failures.
    try:
        if not container_client.exists():
            container_client.create_container()
    except Exception as e:
        print(f"⚠️ Could not verify or create container '{container_name}': {e}")

    # Walk through all files in the folder recursively
    for root, _dirs, files in os.walk(folder_path):
        for filename in files:
            file_path = os.path.join(root, filename)

            # Get path relative to base folder to maintain structure
            relative_path = os.path.relpath(file_path, folder_path)

            # Azure uses '/' as a path separator in blob names. Only the
            # platform separator is converted, so a literal backslash in a
            # POSIX filename is preserved.
            blob_path = os.path.join(target_folder, relative_path).replace(os.sep, "/")

            # Detect content type (case-insensitive extension match)
            extension = os.path.splitext(filename)[1].lower()
            content_type = (
                known_content_types.get(extension)
                or mimetypes.guess_type(filename)[0]
                or "application/octet-stream"
            )

            # Upload the file to Azure Blob Storage
            try:
                with open(file_path, "rb") as data:
                    container_client.upload_blob(
                        name=blob_path,
                        data=data,
                        overwrite=True,
                        content_settings=ContentSettings(content_type=content_type),
                        timeout=300
                    )
                print(f"✅ Uploaded: {blob_path}")
            except Exception as e:
                print(f"❌ Failed to upload {blob_path}: {e}")


In [None]:
# ✅ Run the recursive upload using the configuration defined above
upload_to_documents_folder(
    connection_string=connection_string,
    container_name=container_name,
    folder_path=local_folder_path,
    target_folder=target_folder,
)

✅ Uploaded: Documents/0685i00000CR2gfAAD.docx
✅ Uploaded: Documents/0685i00000IxmTCAAZ.docx
✅ Uploaded: Documents/0685i00000ED4iNAAT.docx
✅ Uploaded: Documents/0685i00000KG4zZAAT.pdf
✅ Uploaded: Documents/0685i00000GMzwSAAT.pdf
✅ Uploaded: Documents/068Mo00000OA0dhIAD.pdf
✅ Uploaded: Documents/0685i00000COnTYAA1.pdf
✅ Uploaded: Documents/0685i00000CPMiuAAH.pdf
✅ Uploaded: Documents/0685i00000JTytPAAT.pdf
✅ Uploaded: Documents/068Mo00000TExCQIA1.pptx
✅ Uploaded: Documents/0685i00000CQRm5AAH.docx
✅ Uploaded: Documents/068J3000004AFZ8IAO.pdf
✅ Uploaded: Documents/0685i00000CRB13AAH.docx
✅ Uploaded: Documents/Final_Member Webinar_Storage Agreements 2023.pdf
✅ Uploaded: Documents/0685i00000CPMvQAAX.pdf
✅ Uploaded: Documents/AU-Privacy-Collection-Statement-220822-.docx
✅ Uploaded: Documents/0685i00000CQGYpAAP.docx
✅ Uploaded: Documents/0685i00000GKMkpAAH.docx
✅ Uploaded: Documents/0685i00000CR2nFAAT.docx
✅ Uploaded: Documents/0685i00000CQRgJAAX.docx
✅ Uploaded: Documents/0685i00000CQINQAA5.d

In [None]:
# Name the file is stored under in the container, and the local file to upload
blob_name = "indexer"
file_path = "/Users/sharathsolomon/Desktop/SSAA Chatbot/Development/vector_index.faiss"

def upload_file_to_blob(file_path, blob_name, connection_string, container_name):
    """
    Upload one local file to Azure Blob Storage under the given blob name.

    Args:
        file_path (str): Local path of the file to upload.
        blob_name (str): Name the blob is stored under in the container.
        connection_string (str): Azure Storage account connection string.
        container_name (str): Name of the blob container.

    Notes:
        - An existing blob with the same name is overwritten.
        - The container is not created here; it is expected to exist already.
        - Errors raised while opening or uploading the file are caught and
          printed rather than propagated. Errors from building the clients
          happen outside the try block and do propagate.
    """
    container = BlobServiceClient.from_connection_string(
        connection_string
    ).get_container_client(container_name)

    try:
        # Client bound to the destination blob
        destination = container.get_blob_client(blob_name)

        # Stream the file in binary mode; overwrite=True replaces any existing blob
        with open(file_path, "rb") as source:
            destination.upload_blob(source, overwrite=True)
    except Exception as e:
        print(f"❌ Error uploading file: {e}")
    else:
        print(f"✅ File '{file_path}' uploaded to Blob Storage as '{blob_name}'.")

# ✅ Upload the single file configured at the top of this cell
upload_file_to_blob(
    file_path=file_path,
    blob_name=blob_name,
    connection_string=connection_string,
    container_name=container_name,
)


File '/Users/sharathsolomon/Desktop/SSAA Chatbot/Development/vector_index.faiss' uploaded to Blob Storage as 'indexer'.


## Function to delete files from Azure Blob Storage

In [None]:
def delete_all_blobs_in_container(connection_string, container_name):
    """
    Remove every blob from the given Azure Blob Storage container.

    Args:
        connection_string (str): Azure Storage account connection string.
        container_name (str): Name of the container whose blobs are deleted.

    Notes:
        - This operation is irreversible. Use with caution in production.
        - Only the blobs are deleted; the container itself is kept.
        - There is no error handling here: an exception from listing or
          deleting propagates to the caller and stops the loop.
    """
    container = BlobServiceClient.from_connection_string(
        connection_string
    ).get_container_client(container_name)

    # Delete blobs one at a time, reporting each name as it is removed
    for entry in container.list_blobs():
        name = entry.name
        container.delete_blob(name)
        print(f"🗑️ Deleted: {name}")

    print("✅ All blobs deleted.")

# ✅ Re-read the Azure settings from the environment, then empty the container
connection_string = os.environ.get("AZURE_CONNECTION_STRING")
container_name = os.environ.get("AZURE_CONTAINER_NAME")
delete_all_blobs_in_container(
    connection_string=connection_string,
    container_name=container_name,
)


✅ All blobs deleted.


## Function to download files from Azure Blob Storage

In [None]:
# 📁 Azure Blob Storage: Model & Index Download Script

# ✅ Azure clients shared by the download helpers defined in this cell
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

# Blob prefix that holds the model files, and the local directory they are saved to
model_folder = "Models"
local_model_folder = "./models"

# 📥 Function to download a single file from Azure Blob Storage
def download_blob_from_storage(blob_name, download_path):
    """
    Downloads a single blob from Azure Blob Storage to a specified local path.

    Uses the module-level `container_client`.

    Args:
        blob_name (str): The name (path) of the blob in Azure Blob Storage.
        download_path (str): The full local file path to save the downloaded file.

    Notes:
        - The download is requested before the local file is opened, so a
          failure to start the download (for example a missing blob) does not
          create an empty file or truncate an existing one.
        - The blob is streamed into the file rather than loaded fully into
          memory first, which matters for large files such as model weights.
        - Errors are caught and printed, not raised.
    """
    try:
        blob_client = container_client.get_blob_client(blob_name)

        # Start the download before touching the local file.
        downloader = blob_client.download_blob()

        with open(download_path, "wb") as download_file:
            # Write the blob to disk chunk by chunk instead of via readall().
            downloader.readinto(download_file)

        print(f"✅ Downloaded '{blob_name}' to '{download_path}'.")

    except Exception as e:
        print(f"❌ Error downloading '{blob_name}': {e}")

# 📥 Function to download all files within a specific folder in Blob Storage
def download_all_files_in_folder():
    """
    Downloads all blobs from a specified folder (prefix) in Azure Blob Storage to the local directory.
    Maintains the folder structure.

    Uses the module-level `container_client`, `model_folder` (blob prefix) and
    `local_model_folder` (local destination root).

    Notes:
        - Blobs are matched on "<model_folder>/" so that sibling prefixes which
          merely start with the same text (e.g. "Models_old/") are not
          downloaded into, or outside of, the local folder.
        - Blobs whose name ends with "/" (directory markers) are skipped.
        - Blobs whose relative path contains ".." are skipped with a warning so
          nothing is written outside `local_model_folder`.
        - Errors while listing or preparing directories are printed, not raised.
    """
    try:
        # Normalise the prefix to end with exactly one '/' (or be empty for the
        # whole container).
        folder = model_folder.strip("/")
        prefix = f"{folder}/" if folder else ""

        for blob in container_client.list_blobs(name_starts_with=prefix):
            blob_name = blob.name

            # Blob names always use '/' as separator; split on it explicitly
            # rather than relying on the local platform's path rules.
            parts = [p for p in blob_name[len(prefix):].split("/") if p not in ("", ".")]

            # Directory markers carry no file content to write.
            if blob_name.endswith("/") or not parts:
                continue

            # Refuse names that would climb out of the destination folder.
            if ".." in parts:
                print(f"⚠️ Skipping '{blob_name}': path would escape '{local_model_folder}'.")
                continue

            local_file_path = os.path.join(local_model_folder, *parts)

            # Ensure local directory exists
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

            # Download each blob
            download_blob_from_storage(blob_name, local_file_path)

    except Exception as e:
        print(f"❌ Error downloading files from folder '{model_folder}': {e}")

# ✅ Make sure the local model directory is present before downloading into it
os.makedirs(local_model_folder, exist_ok=True)

# 🚀 Fetch every model file under the configured blob prefix
download_all_files_in_folder()

# 📦 The vector index is a separate blob outside the model prefix; fetch it on its own
blob_name, download_path = 'indexer', './vector_index.faiss'
download_blob_from_storage(blob_name, download_path)


Downloaded 'Models/my_miniLM_model/1_Pooling/config.json' to './models/my_miniLM_model/1_Pooling/config.json'.
Downloaded 'Models/my_miniLM_model/README.md' to './models/my_miniLM_model/README.md'.
Downloaded 'Models/my_miniLM_model/config.json' to './models/my_miniLM_model/config.json'.
Downloaded 'Models/my_miniLM_model/config_sentence_transformers.json' to './models/my_miniLM_model/config_sentence_transformers.json'.
Downloaded 'Models/my_miniLM_model/model.safetensors' to './models/my_miniLM_model/model.safetensors'.
Downloaded 'Models/my_miniLM_model/modules.json' to './models/my_miniLM_model/modules.json'.
Downloaded 'Models/my_miniLM_model/sentence_bert_config.json' to './models/my_miniLM_model/sentence_bert_config.json'.
Downloaded 'Models/my_miniLM_model/special_tokens_map.json' to './models/my_miniLM_model/special_tokens_map.json'.
Downloaded 'Models/my_miniLM_model/tokenizer.json' to './models/my_miniLM_model/tokenizer.json'.
Downloaded 'Models/my_miniLM_model/tokenizer_conf