# Prerequisites
- Running SynapseML in Microsoft Fabric requires an F64 capacity or higher.


In [1]:
import synapse.ml.services

# Report which SynapseML Services build this session is running against.
servicesVersion = synapse.ml.services.__version__
print(f"SynapseML Services version: {servicesVersion}")

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 3, Finished, Available, Finished)

SynapseML Services version: 1.0.4


In [2]:
# Import
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.functions as F

import synapse.ml.core
import synapse.ml.services
from synapse.ml.services.openai import *

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 4, Finished, Available, Finished)

In [3]:
# Configuration
keyVaultURI = "https://sbdemokvfabic.vault.azure.net/"
openAIServiceName = "sbdemoaoai"
aiSearchServiceName = "sbdemoass002"
embeddingModelName = "demo-text-e3s" # Model: text-embedding-3-small

# Retrieve the OpenAI and AI Search keys from Key Vault.
# Each secret is looked up under the name of the service it belongs to.
openAIkey = mssparkutils.credentials.getSecret(keyVaultURI, openAIServiceName)
aiSearchKey = mssparkutils.credentials.getSecret(keyVaultURI, aiSearchServiceName)

# OneLake
oneLake = "CS_Lakehouse"

# Never echo secret values into cell output: notebook output is saved and shared
# with the notebook. Only confirm that each lookup returned a non-empty value.
print(f"OpenAI key retrieved: {bool(openAIkey)}")
print(f"AI Search key retrieved: {bool(aiSearchKey)}")

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 5, Finished, Available, Finished)

[REDACTED]
[REDACTED]


In [4]:
# Read data from CS OneLake
# Load the processed purchase table and keep only orders dated 2024-05-01 or later.
df = (
    spark.read.table(f"{oneLake}.Processed_CustomerPurchase")
    .where(F.col("OrderDate") >= "2024-05-01")
)

# Row count first, then a 10-row preview of the filtered data.
print(df.count())
display(df.limit(10))

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 6, Finished, Available, Finished)

1793


SynapseWidget(Synapse.DataFrame, 8bd1fc57-bac6-453d-a6b5-97a902d32109)

In [5]:
# Create Embedding text
# Columns generated by this notebook carry a "p_" prefix.

# One sentence per order line; the placeholders are filled, in order, with the
# formatted order date, the four raw columns, and the two formatted money columns.
embeddingTextFormat = "On %s, Order ID (%s) - Order Line ID (%s) sold %s quantity of '%s' at $%s each. The Line total is $%s."

dfCombined = df.withColumn(
    "p_TextForEmbedding",
    F.format_string(
        embeddingTextFormat,
        F.date_format(df["OrderDate"], "MMM dd yyyy"),
        # Passed through unformatted.
        *[df[name] for name in ("OrderID", "OrderLineID", "Quantity", "Description")],
        # Monetary values are rendered with two decimal places.
        *[F.format_number(df[name], 2) for name in ("UnitPrice", "ItemSoldTotal")],
    ),
)

# Adding a column must not change the row count.
print(dfCombined.count())

display(dfCombined.select("p_TextForEmbedding").limit(10))

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 7, Finished, Available, Finished)

1793


SynapseWidget(Synapse.DataFrame, c1e7c957-047d-43e6-ba1d-3e0f01f9e2ec)

In [6]:
# OpenAI - Embedding

# Transformer that sends "p_TextForEmbedding" to the Azure OpenAI embedding
# deployment and writes the vector to "p_Embeddings" (errors to "p_EmbeddingError").
embeddingService = (
    OpenAIEmbedding()
        .setCustomServiceName(openAIServiceName)
        .setSubscriptionKey(openAIkey)
        .setDeploymentName(embeddingModelName)
        .setTextCol("p_TextForEmbedding")
        .setErrorCol("p_EmbeddingError")
        .setOutputCol("p_Embeddings")
)

# transform() is lazy, so every action on its result would call the embedding
# service again. Run it exactly once by writing straight to the Delta table,
# keeping only the key fields plus the embedding columns.
(
    embeddingService.transform(dfCombined)
        .select("OrderLineID", "OrderDate", "OrderID", "Quantity", "Description", "UnitPrice", "ItemSoldTotal", "p_TextForEmbedding", "p_Embeddings", "p_EmbeddingError")
        .write.format("delta")
        .mode("overwrite")  # Overwriting for demo purpose.
        .saveAsTable("Embedded_CustomerPurchase")
)

# Rebind dfEmbedded to the persisted table so the preview below (and any later
# use of dfEmbedded) reads the stored vectors instead of re-invoking OpenAI.
dfEmbedded = spark.read.table("Embedded_CustomerPurchase")

display(dfEmbedded.select("p_TextForEmbedding", "p_EmbeddingError", "p_Embeddings").limit(10))

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 3cc58d14-b905-4b26-adbb-2a1c6db5eb02)

In [7]:
# Show every row for which the embedding step recorded an error.
display(dfEmbedded.filter(dfEmbedded["p_EmbeddingError"].isNotNull()))

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a80fb726-6d94-4268-b132-963ea466ede6)

In [8]:
# Reload the persisted embeddings from the "Embedded_CustomerPurchase" Lakehouse table.
dfEmbedded = spark.read.table("Embedded_CustomerPurchase")

# Print the number of rows read back from the table.
print(dfEmbedded.count())

# Preview the first 10 rows.
display(dfEmbedded.limit(10))

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 10, Finished, Available, Finished)

1793


SynapseWidget(Synapse.DataFrame, 5488c7b4-aaec-4a07-bb2c-f1573643b5ce)

In [9]:
# AI Search Index Config

import requests
import json

# REST API version sent as the `api-version` query parameter on AI Search requests.
aiSearchApiVersion = "2023-11-01"
# Name of the AI Search index that is created and then populated.
aiSearchIndexName = "demo-customer-purchase-index"
# Used as the `dimensions` of the vector field in the index definition.
# NOTE(review): presumably must equal the vector length returned by the
# embedding deployment (text-embedding-3-small) — confirm before changing models.
embeddingLength = 1536

# AI Search - Document config
# Step size used to slice rows into batches for each indexing request.
docBatchSize = 250

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 11, Finished, Available, Finished)

In [10]:
# Using AI Search API to create index

# Target the index resource by name; the API key authenticates the request.
url = f"https://{aiSearchServiceName}.search.windows.net/indexes/{aiSearchIndexName}?api-version={aiSearchApiVersion}"
headers = {"Content-Type": "application/json", "api-key": aiSearchKey}

# Index definition with fields OrderLineId (key), OrderId, OrderDate, Content
# and Embeddings. Note the data type declared for each field: documents sent to
# the index have to match them.
payload = json.dumps(
    {
        "name": aiSearchIndexName,
        "fields": [
            {"name": "OrderLineId", "type": "Edm.String", "key": True, "filterable": True},
            {"name": "OrderId", "type": "Edm.Int32", "filterable": True},
            {"name": "OrderDate", "type": "Edm.String", "filterable": True},
            {"name": "Content", "type": "Edm.String", "searchable": True, "retrievable": True},
            {
                # Vector field; it references the "vectorConfig" profile defined below.
                "name": "Embeddings",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "retrievable": True,
                "dimensions": embeddingLength,
                "vectorSearchProfile": "vectorConfig",
            },
        ],
        "vectorSearch": {
            # HNSW algorithm configuration using the cosine metric.
            "algorithms": [{"name": "hnswConfig", "kind": "hnsw", "hnswParameters": {"metric": "cosine"}}],
            "profiles": [{"name": "vectorConfig", "algorithm": "hnswConfig"}],
        },
    }
)

response = requests.request("PUT", url, headers=headers, data=payload)

# 201 is reported as a new index, 204 as an update; anything else is a failure
# and the response body is printed for diagnosis.
if response.status_code not in (201, 204):
    print(f"HTTP request failed with status code {response.status_code}")
    print(f"HTTP response body: {response.text}")
elif response.status_code == 201:
    print("Index created!")
else:
    print("Index updated!")

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 12, Finished, Available, Finished)

Index created!


In [11]:
# Using AI Search API to add data to index

def insertIntoIndex(documents):
    """Send one batch of documents to the AI Search index.

    Args:
        documents: list of JSON-serialisable dicts, one per document, each
            including its "@search.action".

    Returns:
        "Success" when the service answers 200 or 201, otherwise
        "Failure: <response body>".
    """
    payload = json.dumps({"value": documents})

    url = f"https://{aiSearchServiceName}.search.windows.net/indexes/{aiSearchIndexName}/docs/index?api-version={aiSearchApiVersion}"
    headers = {"Content-Type": "application/json", "api-key": aiSearchKey}

    response = requests.request("POST", url, headers=headers, data=payload)

    if response.status_code == 200 or response.status_code == 201:
        return "Success"
    else:
        return f"Failure: {response.text}"


def uploadRows(rows):
    """Upload the rows of one partition in batches of at most `docBatchSize`.

    Args:
        rows: iterator of Rows exposing OrderLineId, OrderId, OrderDate,
            Content and Embeddings.

    Yields:
        [first OrderLineId in the batch, last OrderLineId in the batch, status]
        for every batch sent. An empty partition yields nothing.
    """
    rows = list(rows)

    for i in range(0, len(rows), docBatchSize):
        row_batch = rows[i : i + docBatchSize]

        # Build documents from the current batch only. Iterating over `rows`
        # here would resend the whole partition with every request.
        documents = [
            {
                "OrderLineId": row["OrderLineId"],
                "OrderId": row["OrderId"],
                "OrderDate": row["OrderDate"],
                "Content": row["Content"],
                "Embeddings": row["Embeddings"].tolist(),
                "@search.action": "upload",
            }
            for row in row_batch
        ]
        status = insertIntoIndex(documents)
        yield [row_batch[0]["OrderLineId"], row_batch[-1]["OrderLineId"], status]


# Shape the columns to match the index definition and upload the rows.
# OrderLineId is the string key; OrderId is declared Edm.Int32 in the index, so
# it is sent as an integer rather than a string.
result = dfEmbedded \
            .withColumn("OrderLineId", col("OrderLineId").cast(StringType())) \
            .withColumn("OrderId", col("OrderId").cast(IntegerType())) \
            .withColumn("OrderDate", date_format('OrderDate', 'yyyy-MM-dd')) \
            .withColumn("Content", col('p_TextForEmbedding')) \
            .withColumn("Embeddings", col('p_Embeddings')) \
            .select("OrderLineId", "OrderId", "OrderDate", "Content", "Embeddings") \
            .rdd.mapPartitions(uploadRows)

# `result` is lazy and has side effects (HTTP uploads). Materialise it exactly
# once; building the display frame from the RDD would evaluate it again for
# schema inference and again for rendering, repeating uploads.
uploadStatuses = result.collect()

# An explicit schema also keeps this working when no batch was uploaded.
display(spark.createDataFrame(uploadStatuses, "startId string, endId string, uploadStatus string"))

StatementMeta(, f3ed240c-deab-4da3-95e2-6d34f945df8f, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 7d5b1182-b3e8-4191-b9a2-5e41a796f780)

# Reference
- [Microsoft Fabric - Embedding with SynapseML library](https://learn.microsoft.com/en-us/fabric/data-science/ai-services/how-to-use-openai-sdk-synapse?tabs=synapseml&wt.mc_id=MVP_365600#embeddings)
- [Azure AI Search - Create Index](https://learn.microsoft.com/en-us/rest/api/searchservice/create-index?wt.mc_id=MVP_365600#bkmk_indexAttrib)
- [Azure AI Search - Index Data Type](https://learn.microsoft.com/en-us/rest/api/searchservice/data-type-map-for-indexers-in-azure-search?wt.mc_id=MVP_365600)


