In [0]:
import html
import json
import re

from pyspark.sql.functions import udf, col, concat_ws, lit, when
from pyspark.sql.types import StringType


# Raw CSV file that run_spark_etl() reads (with a header row and inferred schema).
S3_RAW_PATH = "s3://cvee-bucket-eu-north-1/jobs_metadata_raw/jobs_metadata_raw_20251204_160713.csv"
# Directory that run_spark_etl() overwrites with the CSV export of the final dataset.
CSV_OUTPUT_PATH = "/Volumes/workspace/cvee/raw_to_silver_etl/jobs_metadata_silver"
# Table that run_spark_etl() overwrites with the final dataset via saveAsTable().
TARGET_TABLE = "cvee.job_metadata_silver"

# User defined functions

# Matches any HTML/XML tag. Using [^>] instead of '.' lets a tag span newlines.
_HTML_TAG_RE = re.compile(r"<[^>]*>")
# Matches any run of whitespace (spaces, tabs, newlines, non-breaking spaces).
_WHITESPACE_RE = re.compile(r"\s+")


def clean_html(text):
    """Return *text* as single-line plain text with HTML markup removed.

    Processing steps:
      1. Every tag is replaced by a space, so words separated only by markup
         (``<p>a</p><p>b</p>``) do not get glued together.
      2. Character references (``&eacute;``, ``&amp;``, ``&#233;``, ...) are
         decoded to the characters they stand for rather than deleted, so
         accented text and symbols survive.
      3. Runs of whitespace are collapsed to one space and the result is
         stripped.

    Args:
        text: The raw string, or ``None``.

    Returns:
        The cleaned string; ``""`` when *text* is ``None``.
    """
    if text is None:
        return ""
    without_tags = _HTML_TAG_RE.sub(" ", text)
    # Decode only after the tags are gone, so an escaped "&lt;b&gt;" stays
    # literal text instead of being mistaken for markup.
    decoded = html.unescape(without_tags)
    return _WHITESPACE_RE.sub(" ", decoded).strip()

def extract_and_aggregate_struct_data(struct_list, key_to_extract):
    """Extract the values stored under *key_to_extract* as one string.

    Args:
        struct_list: A JSON string, an already-parsed ``dict``/``list``, or
            ``None``.
        key_to_extract: The key to look up in the dict, or in each dict of
            the list.

    Returns:
        * For a dict: the value under the key converted to ``str``; ``""``
          when the key is missing or its value is null.
        * For a list: the non-empty values of its dict items joined with a
          single space. Non-dict items, missing keys and null values are
          skipped; non-string values are converted with ``str``.
        * ``""`` for ``None``, unparsable JSON, or any other data shape.
    """
    if struct_list is None:
        return ""

    data = struct_list
    if isinstance(struct_list, str):
        # Only the parse step can fail on bad input, so only it is guarded.
        try:
            data = json.loads(struct_list)
        except (ValueError, RecursionError):
            return ""

    if isinstance(data, dict):
        value = data.get(key_to_extract)
        return "" if value is None else str(value)

    if isinstance(data, list):
        values = (
            item.get(key_to_extract) for item in data if isinstance(item, dict)
        )
        # A single null or non-string value must not discard the other labels.
        texts = (str(value).strip() for value in values if value is not None)
        return " ".join(text for text in texts if text)

    return ""

# Wrap the plain-Python helpers above as Spark UDFs that return string columns.
clean_html_udf = udf(f=clean_html, returnType=StringType())
aggregate_data_udf = udf(
    f=extract_and_aggregate_struct_data,
    returnType=StringType(),
)


def run_spark_etl():
    """Run the raw-to-silver ETL for the job metadata dataset.

    Steps:
      1. Read the raw CSV at ``S3_RAW_PATH`` (header row, inferred schema),
         then print the record count and the schema.
      2. Derive cleaned and aggregated text columns with the module's UDFs
         and concatenate them into ``vector_text_input``.
      3. Select and rename the output columns, turning the ``alternance``
         and ``accessibleth`` flags into 0/1 integers.
      4. Overwrite ``TARGET_TABLE``, show a five-row preview of it, and
         export the saved table as CSV (with header) to ``CSV_OUTPUT_PATH``.

    Relies on a ``spark`` session being available as a global name; it is
    not created anywhere in the visible part of this file.
    """

    def as_flag(column_name):
        # Only an explicit "True" yields 1. Null or unexpected values yield 0,
        # so a missing value is never reported as a positive flag.
        return when(col(column_name) == "True", lit(1)).otherwise(lit(0))

    print(f"Reading data from S3 {S3_RAW_PATH} ---")
    # NOTE(review): if the raw descriptions contain embedded newlines or
    # doubled quotes, the reader may need the multiLine/escape options --
    # confirm against the raw file.
    df_raw = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(S3_RAW_PATH)
    )
    print(f"Number of records read: {df_raw.count()}")
    df_raw.printSchema()

    print("Stage 2: Transform data")
    df_cleaned = (
        df_raw
        .withColumn("description_clean", clean_html_udf(col("description")))
        .withColumn("competences_aggregated", aggregate_data_udf(col("competences"), lit("libelle")))
        .withColumn("formations_aggregated", aggregate_data_udf(col("formations"), lit("domaineLibelle")))
        .withColumn("qualites_aggregated", aggregate_data_udf(col("qualitesprofessionnelles"), lit("libelle")))
        .withColumn("city_label", aggregate_data_udf(col("lieutravail"), lit("libelle")))
    )

    # Single free-text field built from the title and the cleaned/aggregated
    # columns, separated by spaces.
    df_with_text = df_cleaned.withColumn(
        "vector_text_input",
        concat_ws(
            " ",
            col("intitule"),
            col("description_clean"),
            col("competences_aggregated"),
            col("formations_aggregated"),
            col("qualites_aggregated"),
        ),
    )

    df_final = df_with_text.select(
        col("id").alias("job_id"),
        col("intitule").alias("job_title"),
        col("city_label").alias("job_location"),
        "vector_text_input",
        col("description_clean"),
        col("typecontratlibelle").alias("contract_type_label"),
        col("experiencelibelle").alias("experience_level_label"),
        col("romecode").alias("rome_code"),
        col("secteuractivitelibelle").alias("sector_label"),
        col("datecreation").alias("created_at"),
        as_flag("alternance").alias("is_alternance"),
        as_flag("accessibleth").alias("is_accessible_th"),
    )

    print(f"Load {TARGET_TABLE} ---")
    df_final.write.mode("overwrite").saveAsTable(TARGET_TABLE)

    spark.sql( f"""
        SELECT job_id, job_title, SUBSTRING(vector_text_input, 1, 120) AS preview
        FROM {TARGET_TABLE}
        LIMIT 5
    """).show(truncate=False)

    print(f"--- Writing CSV output to {CSV_OUTPUT_PATH} ---")
    # Export from the table that was just written rather than from df_final,
    # so the S3 read and the Python UDFs are not executed a second time.
    (
        spark.table(TARGET_TABLE)
        .write.format("csv")
        .option("header", "true")
        .mode("overwrite")
        .save(CSV_OUTPUT_PATH)
    )

# Run the ETL only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    run_spark_etl()
