In [1]:
%pip install embeddingstore --extra-index-url https://azuremlsdktestpypi.azureedge.net/embeddingstore/

Looking in indexes: https://pypi.org/simple, https://azuremlsdktestpypi.azureedge.net/embeddingstore/
Collecting embeddingstore
  Downloading https://azuremlsdktestpypi.blob.core.windows.net/repo/embeddingstore/embeddingstore-0.0.100488717-py3-none-any.whl?sv=2021-10-04&st=2023-07-24T03%3A30%3A19Z&se=2024-07-24T03%3A30%3A19Z&sr=b&sp=rl&sig=tFN%2FTuwEYnbACYTgyNiq8mHai6nzDQHu4h1aMkbfqwU%3D (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.4/84.4 kB[0m [31m225.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting flask>=2.2.3 (from embeddingstore)
  Using cached Flask-2.3.2-py3-none-any.whl (96 kB)
Collecting Werkzeug>=2.3.3 (from flask>=2.2.3->embeddingstore)
  Downloading Werkzeug-2.3.6-py3-none-any.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting Jinja2>=3.1.2 (from flask>=2.2.3->embeddingstore)
  Using cached Jinja2-3.1.2-py3-none-any

In [1]:
import os
from typing import List
from dotenv import load_dotenv
import urllib.request
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
from embeddingstore.core.contracts import (
    EmbeddingModelType,
    StorageType,
    StoreCoreConfig,
)
from embeddingstore.core.embeddingstore_core import EmbeddingStoreCore

In [2]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Azure OpenAI connection settings; each is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_VERSION = os.environ.get("OPENAI_DEPLOYMENT_VERSION")

# Default deployment / model pair.
OPENAI_DEPLOYMENT_NAME = os.environ.get("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME")

# Embedding (ada) deployment / model pair.
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.environ.get(
    "OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME"
)
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.environ.get("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Completion (davinci) deployment / model pair.
OPENAI_DAVINCI_DEPLOYMENT_NAME = os.environ.get("OPENAI_DAVINCI_DEPLOYMENT_NAME")
OPENAI_DAVINCI_MODEL_NAME = os.environ.get("OPENAI_DAVINCI_MODEL_NAME")

# Point the openai client at the Azure endpoint using the values read above.
openai.api_type = "azure"
openai.api_key = OPENAI_API_KEY
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_version = OPENAI_DEPLOYMENT_VERSION

In [3]:
# Base URL that each page name below is appended to when downloading.
URL_PREFIX = "https://learn.microsoft.com/en-us/azure/machine-learning/"
# Page names, relative to URL_PREFIX; each is also used as the local file name.
URL_NAME_LIST = [
    "tutorial-azure-ml-in-a-day",
    "overview-what-is-azure-machine-learning",
    "concept-v2",
]

In [4]:
# Download every page in URL_NAME_LIST into a "data" folder under the current
# working directory; each page is saved under its page name.
local_file_path = os.path.join(os.getcwd(), "data")
print(local_file_path)
os.makedirs(local_file_path, exist_ok=True)
for url_name in URL_NAME_LIST:
    # Build the URL with plain string handling: os.path.join follows the host
    # OS path rules and is not meant for URLs.
    url = URL_PREFIX.rstrip("/") + "/" + url_name
    destination_path = os.path.join(local_file_path, url_name)
    urllib.request.urlretrieve(url, destination_path)

/Users/vladfeigin/myprojects/openai/workshops/dataai/openaiworkshop-new/openaiworkshop/Lab #3 - Create embeddings in pandas/data


In [5]:
# Vector size handed to the store as `dimension`.
# NOTE(review): presumably the output size of the ada embedding model — confirm.
DIMENSION = 1536

# Configure a local embedding store whose identifier is the
# "faiss_index_store" folder under the current working directory.
store_path = os.path.join(os.getcwd(), "faiss_index_store")
config = StoreCoreConfig.create_config(
    storage_type=StorageType.LOCAL,
    store_identifier=store_path,
    model_type=EmbeddingModelType.AOAI,
    model_api_base=OPENAI_DEPLOYMENT_ENDPOINT,
    model_api_key=OPENAI_API_KEY,
    model_api_version=OPENAI_DEPLOYMENT_VERSION,
    # NOTE(review): the deployment name (not OPENAI_ADA_EMBEDDING_MODEL_NAME)
    # is passed as model_name — confirm this is what the store expects.
    model_name=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    dimension=DIMENSION,
    create_if_not_exists=True,
)
# Store instance used by later cells to insert text chunks.
store = EmbeddingStoreCore(config)

In [6]:
def get_file_chunks(
    file_name: str, chunk_size: int = 1024, chunk_overlap: int = 10
) -> List[str]:
    """Read an HTML file and split its text content into chunks.

    Args:
        file_name: Path of the HTML file to read; decoded as UTF-8.
        chunk_size: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_size``. Defaults to the previously hard-coded 1024.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``. Defaults to the previously hard-coded 10.

    Returns:
        The chunks produced by the splitter, in the order it yields them.
    """
    # Only the read needs the file handle; close it before parsing.
    with open(file_name, "r", encoding="utf-8") as f:
        page_content = f.read()

    # Use BeautifulSoup to strip the markup and keep the text, with the
    # extracted strings joined by single spaces.
    soup = BeautifulSoup(page_content, "html.parser")
    text = soup.get_text(" ", strip=True)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return list(splitter.split_text(text))

In [8]:
# NOTE(review): os.walk visits every file under the data directory, not only
# the pages listed in URL_NAME_LIST — confirm that indexing any other files
# found there is intended.
for root, _, files in os.walk(local_file_path):
    for file in files:
        each_file_path = os.path.join(root, file)

        # Split the file into chunks.
        chunks = get_file_chunks(each_file_path)
        count = len(chunks)

        # Build one independent metadata dict per chunk. `[d] * count` would
        # repeat the SAME dict object, so mutating one entry would change all.
        if URL_PREFIX is not None:
            # Plain string handling: os.path.join is not meant for URLs.
            source = URL_PREFIX.rstrip("/") + "/" + file
            metadatas = [{"title": file, "source": source} for _i in range(count)]
        else:
            metadatas = [{"title": file} for _i in range(count)]

        # Embed chunks into embeddings, generate index in embedding store.
        # If your data is large, inserting too many chunks at once may cause
        # a rate limit error; see the following link for quota guidance:
        # https://learn.microsoft.com/en-us/azure/cognitive-services/openai/quotas-limits
        store.batch_insert_texts(chunks, metadatas)
        print(f"Create index for {file} file successfully.\n")

Create index for recipes_onecol_with_embeddings.csv file successfully.

Create index for tutorial-azure-ml-in-a-day file successfully.

Create index for concept-v2 file successfully.

Create index for recipes_onecol.csv file successfully.

Create index for overview-what-is-azure-machine-learning file successfully.

