## Best Practices for Creating Embeddings in HANA Cloud using GenAI Hub


This notebook demonstrates how to create an embedding for each text record and store the embeddings in the HANA Vector Store for subsequent use.

### Import Section

In [1]:
# Import section
import os
import json
import math
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv(override=True)

from hana_ml import ConnectionContext
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.native.openai import embeddings
from hdbcli import dbapi

### Dataset

A scientifically grounded nutrition dataset, licensed under the MIT license, was sourced from HuggingFace and enhanced with synthetic data generated by a large language model (LLM).

HuggingFace dataset: [Link](https://huggingface.co/datasets/Aashi/Science-Text-Data)

In [2]:
# Load CSV file
# The path is relative, so it resolves against the notebook's working directory.
csv_path = '../sample_files/science-data-sample.csv'
# low_memory=False asks pandas to parse the file in one pass instead of in
# internal chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(csv_path, low_memory=False)
# Preview the first rows of the raw dataset.
df.head()

Unnamed: 0,Topic,Difficulty Level,Category
0,What indicates the presence of proteins in a ...,Easy,Nutrition
1,What indicates the presence of starch in food...,Easy,Nutrition
2,How can you test for fats in food? To test fo...,Medium,Nutrition
3,What is a test for proteins in food items? A ...,Medium,Nutrition
4,What are deficiency diseases? Deficiency dise...,Easy,Health


In [3]:
# Define columns
# Source columns whose values are serialized into a JSON metadata string
# that accompanies every chunk.
METADATA_COLS = ["Difficulty Level", "Category"]  # Metadata columns
# Source column holding the document text that gets chunked and embedded.
TEXT_COL = "Topic"  # Document text
# DataFrame column that receives the embeddings; it has the same name as the
# vector column of the HANA table created later in this notebook.
VECTOR_COL = "MY_VECTOR"  # Embedding column

### Chunking Strategy

In [4]:
# Function to split text into smaller chunks
# Based on the document structure, the chunking strategy can be changed.
def chunk_text(text, chunk_size=500, overlap=0):
    """Split text into fixed-length chunks, optionally overlapping.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters each chunk shares with the previous one.
            Must satisfy 0 <= overlap < chunk_size. The default of 0 yields
            back-to-back chunks with no shared characters.

    Returns:
        A list of chunks in document order. An empty string yields an empty list.

    Raises:
        ValueError: If chunk_size is not positive or overlap is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # Stop once a window reaches the end of the text; with overlap > 0 any
        # further window would only repeat characters already emitted.
        if start + chunk_size >= len(text):
            break
    return chunks

In [5]:
# Process data for embeddings
# Every source row yields one output row per chunk; each chunk carries the
# row's metadata as a JSON string.
processed_rows = []
for _, row in df.iterrows():
    text = row[TEXT_COL]
    # A missing value would otherwise be stringified to the literal "nan" and
    # embedded as if it were real content, so such rows are skipped.
    if pd.isna(text):
        continue
    # Serialize the metadata once per row instead of once per chunk.
    metadata_json = json.dumps({col: str(row[col]) for col in METADATA_COLS})
    for chunk in chunk_text(str(text)):
        processed_rows.append([chunk, metadata_json])  # Store text & metadata JSON


In [6]:
# Create processed DataFrame
# One row per chunk. The column names mirror the MY_TEXT / MY_METADATA columns
# of the HANA table created later in this notebook.
processed_df = pd.DataFrame(processed_rows, columns=["MY_TEXT", "MY_METADATA"])

# Preview the first chunked rows.
processed_df.head()

Unnamed: 0,MY_TEXT,MY_METADATA
0,What indicates the presence of proteins in a ...,"{""Difficulty Level"": ""Easy"", ""Category"": ""Nutr..."
1,ter accuracy in scientific investigations. Thi...,"{""Difficulty Level"": ""Easy"", ""Category"": ""Nutr..."
2,"iscovery of thousands of exoplanets, particula...","{""Difficulty Level"": ""Easy"", ""Category"": ""Nutr..."
3,of experimental techniques allows for more pre...,"{""Difficulty Level"": ""Easy"", ""Category"": ""Nutr..."
4,What indicates the presence of starch in food...,"{""Difficulty Level"": ""Easy"", ""Category"": ""Nutr..."


In [7]:
# Initialize GenAI Hub Proxy Client to access models
# NOTE(review): proxy_client is not passed to any call in the cells shown here;
# presumably the SDK resolves a default client on its own — confirm whether it
# should be handed to embeddings.create(...).
proxy_client = get_proxy_client('gen-ai-hub')

# Function for batch-wise embedding generation for better performance.
def get_batch_embeddings(text_list, model="text-embedding-ada-002"): # You may choose a different embedding model available on GenAI Hub
    """Generate embeddings for a batch of texts with a single request.

    Args:
        text_list: Texts to embed, in the order the vectors should come back.
        model: Name of the embedding model to use.

    Returns:
        A list with one embedding per input text. An empty input returns an
        empty list without contacting the service.

    Raises:
        RuntimeError: If the number of embeddings in the response does not
            match the number of input texts.
    """
    # Nothing to embed: skip the remote call entirely.
    if len(text_list) == 0:
        return []

    response = embeddings.create(model_name=model, input=text_list)
    vectors = [item.embedding for item in response.data]

    # A short or long response would silently misalign vectors with rows once
    # the result is attached to a DataFrame. A bare string counts as one input,
    # so the length comparison only applies to real batches.
    if not isinstance(text_list, str) and len(vectors) != len(text_list):
        raise RuntimeError(
            f"Expected {len(text_list)} embeddings but received {len(vectors)}."
        )
    return vectors

### Embeddings

In [8]:
# Embed every chunk, sending the texts to the model BATCH_SIZE at a time
BATCH_SIZE = 100  # Number of texts per embedding request
vectors = []

# Materialize the text column once, then walk it in fixed-size windows.
all_texts = processed_df["MY_TEXT"].tolist()
for start in range(0, len(all_texts), BATCH_SIZE):
    vectors.extend(get_batch_embeddings(all_texts[start:start + BATCH_SIZE]))

# Attach the embeddings to the DataFrame, one vector per chunk row
processed_df[VECTOR_COL] = vectors

### Set up HANA Vector Store

In [9]:
# Connect to SAP HANA
# Fail fast with a readable message when the environment is incomplete,
# instead of handing None values to the driver.
_missing_env = [
    name
    for name in ("HANA_ADDRESS", "HANA_PORT", "HANA_USER", "HANA_PASSWORD")
    if not os.environ.get(name)
]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

cc = ConnectionContext(
    address=os.environ["HANA_ADDRESS"],
    port=int(os.environ["HANA_PORT"]),  # environment values are strings; the port is numeric
    user=os.environ["HANA_USER"],
    password=os.environ["HANA_PASSWORD"],
    encrypt=True
)
print(cc.hana_version())
print(cc.get_current_schema())

# Cursor used by the table-creation cell that follows.
cursor = cc.connection.cursor()

4.00.000.00.1715685275 (fa/CE2024.2)
USR_336RA2ZQ5LAGTHKHCKIYB945E


In [10]:
# Create table in SAP HANA
TABLE_NAME = "NUTRITION_SCIENCE_DATA"

sql_command = f'''
CREATE TABLE {TABLE_NAME} (
    MY_TEXT NCLOB,
    MY_METADATA NCLOB,
    MY_VECTOR REAL_VECTOR
);
'''
# CREATE TABLE fails when the table already exists, which would break a
# "Restart & Run All". Only create it when it is missing; an existing table
# (and its rows) is left untouched.
try:
    if cc.has_table(TABLE_NAME):
        print(f"Table {TABLE_NAME} already exists; skipping creation.")
    else:
        cursor.execute(sql_command)
finally:
    # Release the cursor even if the statement fails.
    cursor.close()

### Add data and vectors to HANA Vector Store

In [11]:
# Prepare data for insertion
# Build the rows without modifying processed_df so the cell can be re-run:
# applying json.dumps to an already-serialized column would wrap each vector
# string in another layer of quoting. Selecting the columns by name also keeps
# the row layout aligned with the INSERT column list regardless of column order.
data = [
    [text, metadata, json.dumps(vector)]  # JSON array text is what TO_REAL_VECTOR(?) receives
    for text, metadata, vector in zip(
        processed_df["MY_TEXT"], processed_df["MY_METADATA"], processed_df[VECTOR_COL]
    )
]

# Define batch size
BATCH_SIZE = 100

# Insert data into SAP HANA
cursor = cc.connection.cursor()
sql_insert = f'''
    INSERT INTO {TABLE_NAME}
    (MY_TEXT, MY_METADATA, MY_VECTOR)
    VALUES (?, ?, TO_REAL_VECTOR(?))
'''

# Process insertion in batches for better performance
total_batches = math.ceil(len(data) / BATCH_SIZE)
failed_batches = []

try:
    for i in range(total_batches):
        batch_data = data[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
        try:
            cursor.executemany(sql_insert, batch_data)
            cc.connection.commit()
            print(f"Inserted batch {i + 1}/{total_batches}")
        except Exception as e:
            # Discard whatever part of the failed batch is still pending so it
            # cannot be committed together with the next batch, then carry on.
            cc.connection.rollback()
            failed_batches.append(i + 1)
            print(f"Error inserting batch {i + 1}: {e}")
finally:
    # Release the cursor even when a batch raises something unexpected.
    cursor.close()

# Make partial loads visible instead of leaving them buried in the log above.
if failed_batches:
    print(f"{len(failed_batches)} of {total_batches} batches failed: {failed_batches}")

Inserted batch 1/2
Inserted batch 2/2


### Retrieval

In [12]:
# Function to get embeddings for a query
def get_embedding(query, model="text-embedding-ada-002"):
    """
    Get the embedding vector for a single text.

    Args:
        query: The text to embed.
        model: Name of the embedding model. It should be the same model that
            produced the stored document vectors; vectors from different
            models are not comparable.

    Returns:
        The embedding of the first (and only) item in the response.
    """
    embeds = embeddings.create(
        model_name=model,
        input=query
    )
    return embeds.data[0].embedding

In [13]:
# Function to perform vector search
def run_vector_search(query, cursor, table_name, metric="COSINE_SIMILARITY", k=4):
    """
    Return the k stored chunks closest to the query text.

    Args:
        query: Natural-language query to embed and search with.
        cursor: Open database cursor used to run the query.
        table_name: Table holding MY_TEXT, MY_METADATA and MY_VECTOR. It is
            interpolated into the SQL text (identifiers cannot be bound as
            parameters), so it must come from trusted code, never user input.
        metric: "COSINE_SIMILARITY" or "L2DISTANCE" (case-insensitive).
        k: Number of rows to return; must be a positive integer.

    Returns:
        A list of (MY_TEXT, MY_METADATA) rows, best match first. Any failure
        is printed and an empty list is returned.
    """
    # Similarity ranks best-first in descending order, distance in ascending.
    # The mapping doubles as a whitelist for the function name put into the SQL.
    sort_orders = {"COSINE_SIMILARITY": "DESC", "L2DISTANCE": "ASC"}
    try:
        metric_name = str(metric).upper()
        if metric_name not in sort_orders:
            raise ValueError(f"Unsupported metric: {metric!r}")

        top_k = int(k)
        if top_k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")

        query_vector = get_embedding(query)
        if not query_vector:
            raise ValueError("Failed to generate query embedding.")

        # Only validated values are interpolated; the vector itself is bound
        # as a parameter, the same way the insertion statement passes vectors.
        sql_query = f'''
        SELECT TOP {top_k} MY_TEXT, MY_METADATA
        FROM {table_name}
        ORDER BY {metric_name}(MY_VECTOR, TO_REAL_VECTOR(?)) {sort_orders[metric_name]}
        '''
        cursor.execute(sql_query, (json.dumps(query_vector),))
        return cursor.fetchall()
    except Exception as e:
        print(f"Error during vector search: {e}")
        return []


In [14]:
# Execute vector search
# The cursors opened earlier in this notebook were closed after table creation
# and after insertion, so open a fresh one for the query and release it afterwards.
cursor = cc.connection.cursor()
try:
    context = run_vector_search("How to check presence of starch in food?", cursor, TABLE_NAME, 'COSINE_SIMILARITY', 4)
finally:
    cursor.close()

for c in context:
    print(c)
    print()

(' How can you test for the presence of starch in food? By putting 2-3 drops of dilute iodine solution on the food item. A blue-black color change indicates the presence of starch. putting 2-3 drops of dilute iodine solution on the food item The search for extraterrestrial life is primarily focused on exoplanets—planets orbiting stars outside our solar system. The discovery of thousands of exoplanets, particularly those within the habitable zone of their stars, has raised hopes of finding life bey', '{"Difficulty Level": "Easy", "Category": "Nutrition"}')

(' What indicates the presence of starch in food? The presence of starch in food is indicated when a food item turns blue-black upon the iodine test. A food item turns blue-black upon the iodine test Astrobiologists use various methods to detect potential biosignatures, including spectroscopy, which analyzes the atmospheric composition of exoplanets. The presence of oxygen, methane, and other organic compounds in an exoplanet’s atmos