# 1. Text Chunking and Embedding with Azure OpenAI  

In this notebook, we process raw text data to prepare it for indexing in Azure AI Search. This involves four steps: loading the raw text files, splitting each file into manageable chunks, generating an embedding for each chunk with an Azure OpenAI embedding model, and saving the chunks together with their embeddings to JSON files. These JSON files are used in the subsequent notebook to create and populate an Azure AI Search index.

## 1.1 Import Libraries and Load Environment Variables

In [1]:
# Import necessary libraries
import os
import json
from dotenv import load_dotenv
from openai import AzureOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Populate the process environment from the local .env file
load_dotenv('.env')

# Azure OpenAI connection settings; each is None when its variable is unset
azure_openai_endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT')
azure_openai_key = os.environ.get('AZURE_OPENAI_KEY')
azure_openai_embedding_model = os.environ.get('AZURE_OPENAI_EMBEDDING_MODEL_NAME')
azure_openai_embedding_deployment = os.environ.get('AZURE_OPENAI_EMBEDDING_DEPLOYMENT')
azure_openai_api_version = os.environ.get('AZURE_OPENAI_API_VERSION')

# Shared client used by the embedding helper defined later in the notebook
client = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version=azure_openai_api_version,
    azure_deployment=azure_openai_embedding_deployment,
)


## 1.2 Define Helper Functions

In [2]:
# Function to read text files from a folder
def read_text_files(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A list of ``(file_name, text)`` tuples sorted by file name, where
        ``text`` is the file's full contents decoded as UTF-8. The list is
        empty when the folder contains no ``.txt`` files.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    text_files = []
    # os.listdir() yields entries in arbitrary order; sorting keeps runs reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # A directory can also be named '*.txt'; opening it would raise, so skip non-files.
        if not (file_name.endswith('.txt') and os.path.isfile(file_path)):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            text_files.append((file_name, file.read()))
    return text_files

# Function to chunk text using LangChain
def chunk_text(text, chunk_size=500, chunk_overlap=0):
    """Split ``text`` into chunks with LangChain's recursive character splitter.

    Args:
        text: The string to split.
        chunk_size: Upper bound on the size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 0 (no overlap).

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_text(text)

# Function to get embeddings using Azure OpenAI Service
def get_embeddings(text_chunks):
    """Embed each chunk with the notebook-level Azure OpenAI ``client``.

    One embeddings request is issued per chunk, using the model named by the
    module-level ``azure_openai_embedding_model``.

    Args:
        text_chunks: Iterable of strings to embed.

    Returns:
        A list with one embedding per input chunk, in input order. Each item
        is whatever ``response.data[0].embedding`` holds for that request.
    """
    def embed_one(piece):
        # Each request carries a single-item input list, so the first
        # (and only) entry of the response data is this chunk's embedding.
        result = client.embeddings.create(input=[piece], model=azure_openai_embedding_model)
        return result.data[0].embedding

    return [embed_one(piece) for piece in text_chunks]

# Function to save results to a JSON file
def save_to_json(file_name, chunked_data, output_folder):
    """Write ``chunked_data`` as indented UTF-8 JSON into ``output_folder``.

    The output file reuses the stem of ``file_name`` and swaps only its final
    extension for ``.json`` (``report.txt`` -> ``report.json``).

    Args:
        file_name: Name of the source file the data was derived from.
        chunked_data: JSON-serialisable object to write.
        output_folder: Destination directory; created if it does not exist.

    Raises:
        OSError: If the folder cannot be created or the file cannot be written.
        TypeError: If ``chunked_data`` is not JSON-serialisable.
    """
    os.makedirs(output_folder, exist_ok=True)
    # splitext only touches the trailing extension; str.replace would rewrite
    # every '.txt' occurrence in the name and add no '.json' if none is present.
    stem, _ = os.path.splitext(file_name)
    output_path = os.path.join(output_folder, stem + '.json')
    with open(output_path, 'w', encoding='utf-8') as json_file:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(chunked_data, json_file, ensure_ascii=False, indent=4)

## 1.3 Process Each Text File

In [3]:
# Input and output locations (relative paths)
documents_folder = 'documents'
embeddings_folder = 'embeddings'

# Load all source documents up front as (file name, contents) pairs
text_files = read_text_files(documents_folder)

# Chunk, embed, and persist each document in turn
for file_name, text in text_files:
    # Split the document, then embed every resulting chunk
    text_chunks = chunk_text(text)
    chunk_embeddings = get_embeddings(text_chunks)

    # Pair each chunk with its embedding; chunk_id is the chunk's position in the document
    chunked_data = [
        {'chunk_id': idx, 'chunk_text': chunk, 'chunk_embedding': embedding}
        for idx, (chunk, embedding) in enumerate(zip(text_chunks, chunk_embeddings))
    ]

    # One JSON file per source document, written to the embeddings folder
    save_to_json(file_name, chunked_data, embeddings_folder)