In [0]:
import html
import json
import re

from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col, concat_ws, lit, when
from pyspark.sql.types import StringType


# S3 location of the raw input; it is read with spark.read.parquet below.
S3_RAW_PATH = "s3://cvee-bucket-eu-north-1/jobs_metadata_raw/"
# NOTE: despite the "_PATH" suffix this is a catalog table name (schema.table),
# not a filesystem path: it is passed to DeltaTable.forName and saveAsTable.
DELTA_TABLE_PATH = "cvee.job_metadata_silver"

# User defined functions

# Compiled once at definition time rather than on every UDF invocation.
# "[^>]*" (unlike ".*?") also matches tags whose attributes span several lines.
_HTML_TAG_RE = re.compile(r"<[^>]*>")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_html(text):
    """Convert an HTML fragment into single-line plain text.

    Args:
        text: HTML/markup string, or ``None``.

    Returns:
        The text with tags removed, HTML entities decoded to the characters
        they stand for, and every run of whitespace (spaces, tabs, newlines,
        non-breaking spaces) collapsed to one space and trimmed.
        ``None`` yields an empty string.
    """
    if text is None:
        return ""
    # Replace each tag with a space instead of nothing, so that adjacent
    # elements such as "<p>a</p><p>b</p>" do not fuse into "ab".
    without_tags = _HTML_TAG_RE.sub(" ", text)
    # Decode entities (&eacute; -> é, &amp; -> &, &#233; -> é) rather than
    # deleting them; deleting silently drops accented letters from the text.
    decoded = html.unescape(without_tags)
    return _WHITESPACE_RE.sub(" ", decoded).strip()


# Expose clean_html to Spark as a column-level UDF that returns a string.
clean_html_udf = udf(clean_html, returnType=StringType())


# Stage 1: load the raw Parquet data and report its size and schema.
print(f"Reading data from S3 {S3_RAW_PATH} ")
df_raw = spark.read.parquet(S3_RAW_PATH)
raw_record_count = df_raw.count()  # action: runs a Spark job over the input
print(f"Number of records read: {raw_record_count}")
df_raw.printSchema()


In [0]:
print("Stage 2: Transform data")

# Derive flat text columns from the raw record:
#   - description_clean: description with HTML markup removed (clean_html_udf)
#   - *_aggregated:      the chosen field of every element of an array column,
#                        joined with single spaces
#   - lieu_aggregated:   the "libelle" field of the lieuTravail struct
# The previous call to aggregate_data_udf for lieu_aggregated was removed:
# that name is not defined in this notebook (NameError on a fresh kernel) and
# its result was overwritten by the very next withColumn anyway.
df_cleaned = (
    df_raw
    .withColumn("description_clean", clean_html_udf(col("description")))
    .withColumn(
        "competences_aggregated",
        F.expr("concat_ws(' ', transform(competences, x -> x.libelle))"),
    )
    .withColumn(
        "formations_aggregated",
        F.expr("concat_ws(' ', transform(formations, x -> x.domaineLibelle))"),
    )
    .withColumn(
        "qualites_aggregated",
        F.expr("concat_ws(' ', transform(qualitesProfessionnelles, x -> x.libelle))"),
    )
    .withColumn("lieu_aggregated", col("lieuTravail.libelle"))
)

# Build the final frame in one expression so that re-running the cell always
# starts from df_cleaned instead of from a previously mutated df_final.
# Compared with the earlier version, the no-op select(col("*")) is gone and the
# chain no longer ends in a dangling "\" line continuation.
df_final = (
    df_cleaned
    # One free-text field per row: title, cleaned description and the
    # aggregated skills / formations / qualities, separated by spaces.
    .withColumn(
        "vector_text_input",
        concat_ws(
            " ",
            col("intitule"),
            col("description_clean"),
            col("competences_aggregated"),
            col("formations_aggregated"),
            col("qualites_aggregated"),
        ),
    )
    # Keep a single row per id; which duplicate survives is not specified.
    .dropDuplicates(["id"])
    # Rename the source columns to the names used by the target table.
    .withColumnRenamed("id", "job_id")
    .withColumnRenamed("intitule", "job_title")
    .withColumnRenamed("lieu_aggregated", "job_location")
    .withColumnRenamed("competences_aggregated", "required_skills")
    .withColumnRenamed("formations_aggregated", "required_formation")
    .withColumnRenamed("qualites_aggregated", "required_qualities")
)


print("Stage 3: Translation with ai_translate")

# Translate the combined text to English with the ai_translate SQL function.
# The translation is written to a temporary column, the original column is
# dropped, and the temporary column takes over its name; as a result
# "vector_text_input" ends up as the last column of df_translated.
# NOTE(review): every row of df_final is translated here, including rows whose
# job_id may already be present in the target table — consider filtering first.
df_translated = (
    df_final
    .withColumn(
        "vector_text_input_en",
        F.expr("ai_translate(vector_text_input, 'en')"),
    )
    .drop("vector_text_input")
    .withColumnRenamed("vector_text_input_en", "vector_text_input")
)

In [0]:
print(f"Merging into Delta table {DELTA_TABLE_PATH}")

# Count the incoming rows once and reuse the number in the log messages.
source_count = df_translated.count()

# Decide explicitly whether the target exists. The previous bare "except:"
# treated *any* failure (schema mismatch, permission error, interrupted merge)
# as "table missing" and then overwrote the whole table with the current batch,
# silently destroying previously loaded rows. Errors now propagate instead.
if spark.catalog.tableExists(DELTA_TABLE_PATH):
    delta_table = DeltaTable.forName(spark, DELTA_TABLE_PATH)
    old_count = delta_table.toDF().count()

    # Insert-only merge: rows whose job_id already exists in the target are
    # left untouched; rows with a new job_id are inserted with all columns.
    (
        delta_table.alias("target")
        .merge(df_translated.alias("source"), "target.job_id = source.job_id")
        .whenNotMatchedInsertAll()
        .execute()
    )

    new_count = delta_table.toDF().count()
    print(f"Number of rows added: {new_count - old_count} / {source_count} read")
else:
    # First run: no table yet, so create it from the current batch.
    df_translated.write.format("delta").mode("overwrite").saveAsTable(DELTA_TABLE_PATH)
    print(f"Table created with {source_count} rows")

In [0]:
%sql
-- Show the column names and types of the table targeted by the merge cell above.
DESCRIBE TABLE cvee.job_metadata_silver