In [0]:
%pip install sentence-transformers==2.2.2 torch huggingface_hub==0.24.0 --quiet

In [0]:
import os
import gc
import torch
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf
from delta.tables import DeltaTable
from typing import Iterator
import traceback

# Source table read for the rows to embed (job_id, vector_text_input, ingestion_date are selected from it).
SILVER_TABLE = "cvee.jobs_silver"
# Target Delta table: receives the embeddings via merge and is used to skip job_ids it already holds.
GOLD_TABLE = "cvee.jobs_gold"
# Staging table that is overwritten with the freshly computed embeddings before the merge.
# NOTE(review): unlike the two tables above, this name has no schema qualifier — confirm that is intended.
TEMP_TABLE_NAME = "pyspark_tmp_embeddings"
# Column/type description string for (job_id, ingestion_date, embedding).
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
OUTPUT_SCHEMA = "job_id string, ingestion_date string, embedding array<float>"

In [0]:

@pandas_udf("array<float>")
def get_embeddings_iterator(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """
    Iterator-style pandas UDF that turns batches of text into embedding vectors.

    The SentenceTransformer model ("BAAI/bge-small-en", on CPU) is created once
    per call of this function and reused for every batch pulled from
    ``batch_iter``.

    Args:
        batch_iter: Iterator of pandas Series containing the input strings.
            Missing values are replaced by "" and each string is cut to its
            first 5000 characters before encoding.

    Yields:
        One pandas Series per input batch; each element is the embedding of
        the corresponding input string, as a list of floats.
    """

    # Environment overrides are set before sentence_transformers is imported.
    os.environ.update(
        {
            'USER': 'ubuntu',
            'HOME': '/tmp',
            'TRANSFORMERS_CACHE': '/tmp/huggingface',
        }
    )

    from sentence_transformers import SentenceTransformer

    # Restrict torch to a single intra-op thread.
    torch.set_num_threads(1)

    # Load the model once, outside the batch loop.
    encoder = SentenceTransformer("BAAI/bge-small-en", device="cpu")

    for series in batch_iter:
        inputs = series.fillna("").str[:5000].tolist()

        # Inference only: no gradients are needed.
        with torch.no_grad():
            vectors = encoder.encode(
                inputs,
                batch_size=16,
                show_progress_bar=False,
                convert_to_numpy=True,
            )

        yield pd.Series(vectors.tolist())

        # Drop the per-batch objects before the next batch is pulled.
        del inputs, vectors
        gc.collect()


In [0]:
# Data preparation
# Silver provides the text to embed; gold is read only for the job_ids it already contains.
df_silver = spark.table(SILVER_TABLE).select(
    "job_id",
    "vector_text_input",
    "ingestion_date",
)
df_gold = spark.table(GOLD_TABLE).select("job_id")

# Left anti join: keep the silver rows whose job_id is absent from gold.
df_to_process = df_silver.join(df_gold, on="job_id", how="left_anti")

# Null texts become empty strings; the result is redistributed over 100 partitions.
text_input_col = F.coalesce(F.col("vector_text_input"), F.lit("")).alias("text_input")
df_prepared = (
    df_to_process
    .select("job_id", "ingestion_date", text_input_col)
    .repartition(100)
)

# count() is a Spark action, so the anti join is executed here to report the backlog size.
new_offer_count = df_to_process.count()
display(f"{new_offer_count} new offers")


In [0]:
# Execution

# Attach the embedding column produced by the pandas UDF and keep only the output columns.
embedding_col = get_embeddings_iterator(F.col("text_input"))
df_final = (
    df_prepared
    .withColumn("embedding", embedding_col)
    .select("job_id", "ingestion_date", "embedding")
)

# Nothing has been computed yet; the write below is what triggers the UDF.
print("Generating vectors...")

# Overwrite the staging table with the new embeddings in Delta format.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(TEMP_TABLE_NAME)
)

print(f"Embeddings successfully saved to {TEMP_TABLE_NAME}")

In [0]:
# Merge the staged embeddings from the temp table into the gold Delta table.
print(f"Merging data from {TEMP_TABLE_NAME} to {GOLD_TABLE}...")

try:
    gold_table = DeltaTable.forName(spark, GOLD_TABLE)
    df_temp = spark.table(TEMP_TABLE_NAME)

    # Insert-only merge keyed on job_id: rows whose job_id already exists in
    # the target are left untouched, so re-running this cell does not
    # overwrite existing embeddings.
    # NOTE(review): source rows sharing a job_id that is absent from the
    # target would presumably each be inserted — confirm job_id is unique
    # upstream.
    (
        gold_table.alias("target")
        .merge(
            df_temp.alias("source"),
            "target.job_id = source.job_id",
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

    print("Merge completed successfully.")
except Exception as e:
    print("\n Merge failed")
    print(f"Error Type : {type(e).__name__}")
    print(f"Error Message : {e}")
    print("\nStacktrace")
    traceback.print_exc()
    # Re-raise after logging: swallowing the error here would let the
    # notebook (or a scheduled job running it) finish as "successful" even
    # though nothing was written to the gold table.
    raise