<h1>Text Embedding Generation</h1>

In [1]:
import io
import os
import pandas as pd
import json
import teradataml as tdml
import getpass
import embeddingSQL as sqlGen
import time
from sentence_transformers import SentenceTransformer


In [2]:
# Read the notebook configuration file.
# The context manager closes the file handle as soon as parsing is done, and
# the explicit encoding avoids depending on the platform default.
with open('./manConfig.json', encoding="utf-8") as config_file:
    pdfParserConfig = json.load(config_file)

# Database connection settings
td_host = pdfParserConfig["db"]["hostName"]
u_name = pdfParserConfig["db"]["userName"]
db_name = pdfParserConfig["db"]["dbName"]
prefix_name = pdfParserConfig["db"]["tablePrefix"]
db_logmech = pdfParserConfig["db"].get("logmech", "LDAP")  # default to LDAP

# Embedding model settings
model_name_short = pdfParserConfig["model"]["modelNameShort"]
fixedEmbeddingDim = pdfParserConfig["model"]["fixedEmbeddingDim"]

# Table names are derived from the configured prefix
table_name_source = f"{prefix_name}_source_docs"
table_name_contents = f"{prefix_name}_doc_contents"

## Notebook parameters
content_path = pdfParserConfig["data"]["contentPath"]
debug = pdfParserConfig["log"]["debug"]

# Never store credentials in the notebook. The password is taken from the
# TD_PASSWORD environment variable when it is set; otherwise it stays empty so
# that the connection cell prompts for it interactively with getpass.
u_password = os.environ.get("TD_PASSWORD", "")

In [3]:
## Connection setup: prompt for whichever credential is still blank
if not len(u_name):
    u_name = input("User Name: ")

if not len(u_password):
    u_password = getpass.getpass(prompt="Password:  ")

# Open the teradataml context with the settings gathered above. As the last
# expression of the cell, the call's return value is what the notebook shows.
tdml.create_context(
    host=td_host,
    username=u_name,
    password=u_password,
    database=db_name,
    logmech=db_logmech,
)


Engine(teradatasql://td01:***@192.168.100.20?DATABASE=td01&LOGDATA=%2A%2A%2A&LOGMECH=%2A%2A%2A)

<div style="background-color:coral; padding:8px; border-radius:15px">
<h2>Read text segments and generate embeddings</h2>
<b>Server Process</b>
<ol>
    <li>Save the model to the database.  <a href="https://huggingface.co/Teradata/bge-small-en-v1.5" target=_new>Teradata/bge-small-en-v1.5</a></li>
    <li>Ask the database to <a href="https://docs.teradata.com/r/Enterprise_IntelliFlex_Lake_VMware/Teradata-VantageTM-Bring-Your-Own-Model-User-Guide/BYOM-Functions/ONNXEmbeddings" target=_new>generate embeddings</a></li>
</ol>
</div>

In [None]:

# Wall-clock timer for the whole embedding step; the elapsed time is reported
# once the statement has been executed.
start = time.time()

# 1) Source rows to embed (must yield: doc_id, text_id, txt)
# NOTE(review): status_cd = 1 presumably marks documents that still need
# embeddings — confirm against the table definition.
sqlSourceData = f"SELECT doc_id, text_id, txt FROM {table_name_contents} tc JOIN {table_name_source} ts ON tc.doc_id = ts.id WHERE status_cd = 1"

# If you know the schema where ONNXEmbeddings is registered, set it here.
# If you are not sure yet, leave empty "" to try unqualified.
byom_schema = "mldb"                 # change if mapped elsewhere; or set "" to try unqualified
schema_prefix = f"{byom_schema}." if byom_schema else ""

# 2) BYOM SELECT (returns emb_0..emb_xxx FLOAT32 columns).
# ONNXEmbeddings is invoked directly in the FROM clause with three inputs:
# the rows to embed, the model, and the tokenizer (the last two as DIMENSION).
byom_select = f"""
SELECT *
FROM 
    {schema_prefix}ONNXEmbeddings(
    ON ({sqlSourceData}) AS InputTable
    ON (SELECT model_id, model FROM embeddings_models WHERE model_id = '{model_name_short}') DIMENSION
    ON (SELECT model_id, model AS tokenizer FROM embeddings_tokenizers WHERE model_id = '{model_name_short}') DIMENSION
    USING
      Accumulate('doc_id','text_id','txt')
      ModelOutputTensor('sentence_embedding')
      OutputFormat('FLOAT32({fixedEmbeddingDim})')
 ) AS emb
"""

# 3) Target table setup: CREATE or INSERT
# The table is probed by building a DataFrame on it; any failure is taken to
# mean the table does not exist yet.
# NOTE(review): this also treats connection/permission errors as "missing";
# in that case the CREATE TABLE below is expected to surface the real error.
table_name = f"{prefix_name}_doc_embeddings_{fixedEmbeddingDim}"
createTable = False
try:
    _ = tdml.DataFrame(table_name)
except Exception:
    createTable = True

# 4) Build final SQL
if createTable:
    embedSQL = f"CREATE TABLE {table_name} AS (\n{byom_select}\n) WITH DATA;"
else:
    embedSQL = f"INSERT INTO {table_name}\n{byom_select}"

# Sanity check — should show the ONNXEmbeddings(...) call with the resolved
# schema, table, and model names.
print(embedSQL)

# 5) Execute and report how long the embedding step took
tdml.execute_sql(embedSQL)
print(f"Embedding step finished in {time.time() - start:.1f} s")


CREATE TABLE GA_doc_embeddings_384 AS (

SELECT *
FROM 
    mldb.ONNXEmbeddings(
    ON (SELECT doc_id, text_id, txt FROM GA_doc_contents tc JOIN GA_source_docs ts ON tc.doc_id = ts.id WHERE status_cd = 1) AS InputTable
    ON (SELECT model_id, model FROM embeddings_models WHERE model_id = 'bge-small-en-v1.5') DIMENSION
    ON (SELECT model_id, model AS tokenizer FROM embeddings_tokenizers WHERE model_id = 'bge-small-en-v1.5') DIMENSION
    USING
      Accumulate('doc_id','text_id','txt')
      ModelOutputTensor('sentence_embedding')
      OutputFormat('FLOAT32(384)')
 ) AS emb

) WITH DATA;


In [None]:
# Tear down the teradataml context that was opened with tdml.create_context
# earlier in this notebook, so the session is not left open.
tdml.remove_context()