In [1]:
# import installed libraries
from dotenv import load_dotenv
import os
import json
from openai import AzureOpenAI
import re
import openai
import asyncio
import nest_asyncio


# Load environment variables from a .env file. override=True is passed so that
# (per python-dotenv) values from the file replace variables already set in the
# process environment.
load_dotenv(override=True)

True

In [2]:
# Azure OpenAI connection settings, read from the process environment.
# Each value is None when the corresponding variable is not set.
aoai_api_key = os.environ.get("AZURE_OPENAI_KEY")
aoai_api_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
aoai_api_deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")

# Client object for the Azure OpenAI service, pinned to a fixed API version.
client = AzureOpenAI(
    azure_endpoint=aoai_api_endpoint,
    api_key=aoai_api_key,
    api_version="2024-08-01-preview",
)

In [3]:
# Locate the two markdown documents to compare under ./data (relative to the
# current working directory). Paths get their own names so that
# original_text / translated_text only ever hold file contents.
data_dir = os.path.join(os.getcwd(), "data")
original_path = os.path.join(data_dir, "DRAFT_Acolad_2023 - English.md")
translated_path = os.path.join(data_dir, "DRAFT_Acolad_2023_V2 - English.md")

# Read with an explicit encoding: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters.
# UTF-8 matches the encoding used when the results are written out.
with open(original_path, "r", encoding="utf-8") as file:
    original_text = file.read()

with open(translated_path, "r", encoding="utf-8") as file:
    translated_text = file.read()
    
    

In [19]:
import numpy as np
from langchain.text_splitter import MarkdownTextSplitter

def get_embedding(text, deployment="text-embedding-ada-002"):
    """Request an embedding for ``text`` through the module-level ``client``.

    Args:
        text: Input forwarded unchanged as ``input`` to
            ``client.embeddings.create``.
        deployment: Value passed as ``model``; in Azure this is the
            deployment name.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    result = client.embeddings.create(model=deployment, input=text)
    first_item = result.data[0]
    return first_item.embedding

# Function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (any array-like accepted by ``np.asarray``).
        vec2: Second vector, of the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector
        has zero norm the ratio is undefined, so ``0.0`` is returned instead
        of NaN.
    """
    # Convert once so list inputs are not re-converted by every numpy call.
    a = np.asarray(vec1)
    b = np.asarray(vec2)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (plus a RuntimeWarning); NaN would also end up as
        # an invalid token if the score is later serialized to JSON.
        return np.float64(0.0)
    return np.dot(a, b) / denominator

# Chunk markdown
def chunk_markdown(text, chunk_size=1000):
    """Split ``text`` into chunks with ``MarkdownTextSplitter``.

    Args:
        text: The markdown text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.

    Returns:
        Whatever ``MarkdownTextSplitter.split_text`` returns for ``text``;
        the splitter is configured with ``chunk_overlap=0``.
    """
    return MarkdownTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
    ).split_text(text)


In [21]:
# Chunk both texts
original_chunks = chunk_markdown(original_text)
translated_chunks = chunk_markdown(translated_text)

# Chunks are compared by position, so both lists are cut to the same length.
# Report a mismatch instead of silently discarding the surplus chunks: a
# differing count also means later pairs may no longer line up.
min_len = min(len(original_chunks), len(translated_chunks))
if len(original_chunks) != len(translated_chunks):
    print(
        f"⚠️ Chunk count mismatch: {len(original_chunks)} original vs "
        f"{len(translated_chunks)} translated; comparing only the first {min_len}."
    )
original_chunks = original_chunks[:min_len]
translated_chunks = translated_chunks[:min_len]

# Compute embeddings for each chunk (one get_embedding call per chunk)
original_embeddings = [get_embedding(chunk) for chunk in original_chunks]
translated_embeddings = [get_embedding(chunk) for chunk in translated_chunks]

# Compute similarity scores per chunk and prepare JSON output
chunk_results = []
for i, (orig, trans, orig_emb, trans_emb) in enumerate(
    zip(original_chunks, translated_chunks, original_embeddings, translated_embeddings)
):
    similarity = cosine_similarity(orig_emb, trans_emb)
    chunk_results.append({
        "chunk_id": i + 1,  # 1-based id for readability in the report
        "original_chunk": orig,
        "translated_back_chunk": trans,
        # Cast to a plain float so the value is JSON-serializable regardless
        # of which numeric scalar type cosine_similarity returns.
        "similarity_score": round(float(similarity), 4)
    })

# Write results to a JSON file (ensure_ascii=False keeps non-ASCII text readable)
output_filename = "chunk_similarity_results.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(chunk_results, f, indent=4, ensure_ascii=False)

print(f"✅ Results saved to {output_filename}")

✅ Results saved to chunk_similarity_results.json
