In [None]:
# Creates empty Delta tables for the bronze, silver and gold layers.
# Bronze and silver schemas are built from YAML table definitions; gold schemas are declared inline.


In [2]:
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from notebookutils import mssparkutils
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType


StatementMeta(, 66c675c7-3b26-486a-9bca-811e101f4fc3, 4, Finished, Available, Finished)

In [2]:
# One YAML definition file per source table, stored under the bronze lakehouse Files area.
yaml_file_paths = [
    "abfss://test@onelake.dfs.fabric.microsoft.com/lakehouse_bronze.Lakehouse/Files/employee/employee.yaml",
    "abfss://test@onelake.dfs.fabric.microsoft.com/lakehouse_bronze.Lakehouse/Files/education_level/education_level.yaml",
    "abfss://test@onelake.dfs.fabric.microsoft.com/lakehouse_bronze.Lakehouse/Files/performance_rating/performance_rating.yaml",
    "abfss://test@onelake.dfs.fabric.microsoft.com/lakehouse_bronze.Lakehouse/Files/rating_level/rating_level.yaml",
    "abfss://test@onelake.dfs.fabric.microsoft.com/lakehouse_bronze.Lakehouse/Files/satisfied_level/satisfied_level.yaml",
]


def _load_yaml_config(path):
    """Read one YAML file through Spark's text reader and parse it with yaml.safe_load."""
    # The text reader yields one row per line; the lines are re-joined before parsing.
    text_rows = spark.read.format("text").load(path).collect()
    return yaml.safe_load("\n".join([text_row["value"] for text_row in text_rows]))


# Parsed content of every file, in the same order as yaml_file_paths.
configs = [_load_yaml_config(path) for path in yaml_file_paths]

# Single merged config: the "tables" entries of all files, concatenated in file order.
final_config = {
    "tables": [table for file_config in configs for table in file_config["tables"]]
}



StatementMeta(, b7adeec2-226c-4197-8db4-99fb1ba8ba2e, 4, Finished, Available, Finished)

In [3]:
%run "notebook_function"

StatementMeta(, d84a560a-26e0-4487-8851-169dcbc3199f, 12, Finished, Available, Finished)

In [3]:
def _build_empty_df(columns, name_key):
    """Return an empty DataFrame whose schema is derived from YAML column specs.

    Args:
        columns: iterable of dicts, each read for ``name_key``, ``"type"`` and
            ``"is_nullable"``.
        name_key: which entry holds the column name (``"source_name"`` or
            ``"target_name"``).
    """
    fields = [
        StructField(
            column[name_key],
            # NOTE(security): eval() executes the type expression taken from the YAML
            # file (e.g. "StringType()"). Only load configs from trusted locations.
            eval(column["type"]),
            column["is_nullable"] == 1,
        )
        for column in columns
    ]
    return spark.createDataFrame([], StructType(fields))


def _overwrite_delta_table(df, table_name):
    """Save ``df`` as Delta table ``table_name``, replacing both data and schema."""
    # overwriteSchema lets a re-run succeed after the configured columns have changed.
    df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(table_name)


def create_table(config, lakehouse):
    """Create empty Delta tables for every table declared in ``config``.

    * ``lakehouse == 'lakehouse_bronze'``: for each table entry that has a
      ``"bronze"`` key, creates ``<name>`` and ``<name>_reject`` with columns
      named after ``source_name``.
    * ``lakehouse == 'lakehouse_silver'``: for each table entry that has a
      ``"silver"`` key, creates ``<name>`` with columns named after ``target_name``.
    * Any other value only prints the start/end messages.

    In both cases the column list is read from ``table_config["silver"]["columns"]``,
    so a bronze entry without a ``"silver"`` section raises ``KeyError``.

    Tables are written with ``mode("overwrite")``: an existing table of the same
    name is replaced by an empty one.

    NOTE(review): ``lakehouse`` only selects the branch; the table name passed to
    ``saveAsTable`` is not qualified with it, so the target is presumably the
    notebook's default lakehouse -- confirm it matches ``lakehouse`` before running.

    Args:
        config: dict with a ``"tables"`` list; each entry has ``"name"`` and
            optional ``"bronze"`` / ``"silver"`` sections.
        lakehouse: ``'lakehouse_bronze'`` or ``'lakehouse_silver'``.
    """
    print(f" Début de la création des tables pour {lakehouse}")

    for table_config in config["tables"]:
        table_name = table_config["name"]
        print(f"\n Traitement de la table : {table_name}")

        if lakehouse == 'lakehouse_bronze' and "bronze" in table_config:
            print(f" Vérification de l'existence de bronze : {table_config.get('bronze', None)}")

            # Bronze keeps the column names of the source CSV file (`source_name`).
            empty_df = _build_empty_df(table_config["silver"]["columns"], "source_name")

            _overwrite_delta_table(empty_df, table_name)
            print(f" ✅ Table Bronze créée : {table_name}")

            # The reject table shares the same `source_name`-based schema.
            reject_table_name = f"{table_name}_reject"
            _overwrite_delta_table(empty_df, reject_table_name)
            print(f" 🚨 Table de rejet créée : {reject_table_name}")

        elif lakehouse == 'lakehouse_silver' and "silver" in table_config:
            print(f" Vérification de l'existence de silver : {table_config.get('silver', None)}")

            # Silver uses the renamed columns (`target_name`).
            empty_df = _build_empty_df(table_config["silver"]["columns"], "target_name")

            _overwrite_delta_table(empty_df, table_name)
            print(f" ✅ Table Silver créée : {table_name}")

    print(f"\n Fin de la création des tables pour {lakehouse}")


StatementMeta(, b7adeec2-226c-4197-8db4-99fb1ba8ba2e, 5, Finished, Available, Finished)

In [4]:
# Bronze pass: creates each configured table plus its `_reject` companion.
lakehouse = "lakehouse_bronze"
create_table(config=final_config, lakehouse=lakehouse)

StatementMeta(, b7adeec2-226c-4197-8db4-99fb1ba8ba2e, 6, Finished, Available, Finished)

 Début de la création des tables pour lakehouse_bronze

 Traitement de la table : employee
 Vérification de l'existence de bronze : {'source_path': 'Files/employee/inbound/Employee.csv'}
 ✅ Table Bronze créée : employee
 🚨 Table de rejet créée : employee_reject

 Traitement de la table : education_level
 Vérification de l'existence de bronze : {'source_path': 'Files/education_level/inbound/EducationLevel.csv'}
 ✅ Table Bronze créée : education_level
 🚨 Table de rejet créée : education_level_reject

 Traitement de la table : performance_rating
 Vérification de l'existence de bronze : {'source_path': 'Files/performance_rating/inbound/PerformanceRating.csv'}
 ✅ Table Bronze créée : performance_rating
 🚨 Table de rejet créée : performance_rating_reject

 Traitement de la table : rating_level
 Vérification de l'existence de bronze : {'source_path': 'Files/rating_level/inbound/RatingLevel.csv'}
 ✅ Table Bronze créée : rating_level
 🚨 Table de rejet créée : rating_level_reject

 Traitement de

In [4]:
# Silver pass: creates each configured table with its `target_name` columns.
lakehouse = "lakehouse_silver"
create_table(config=final_config, lakehouse=lakehouse)

StatementMeta(, 91c00753-8f9b-4f2a-bd18-d448f1b854e0, 14, Finished, Available, Finished)

 Début de la création des tables pour lakehouse_silver

 Traitement de la table : employee
 Vérification de l'existence de silver : {'path': '/Tables/employee', 'columns': [{'source_name': 'EmployeeID', 'target_name': 'employee_id', 'is_nullable': 0, 'type': 'StringType()'}, {'source_name': 'FirstName', 'target_name': 'first_name', 'is_nullable': 1, 'type': 'StringType()'}, {'source_name': 'LastName', 'target_name': 'last_name', 'is_nullable': 1, 'type': 'StringType()'}, {'source_name': 'Gender', 'target_name': 'gender', 'is_nullable': 1, 'type': 'StringType()'}, {'source_name': 'Age', 'target_name': 'age', 'is_nullable': 1, 'type': 'IntegerType()'}, {'source_name': 'BusinessTravel', 'target_name': 'business_travel', 'is_nullable': 1, 'type': 'StringType()'}, {'source_name': 'Department', 'target_name': 'department', 'is_nullable': 1, 'type': 'StringType()'}, {'source_name': 'DistanceFromHome', 'target_name': 'distance_from_home', 'is_nullable': 1, 'type': 'IntegerType()'}, {'source_na

In [1]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Column specs for the gold employee table: (name, Spark type, nullable).
_gold_employee_columns = [
    ("employee_id", StringType(), False),       # primary key
    ("first_name", StringType(), True),
    ("last_name", StringType(), True),
    ("gender", StringType(), True),
    ("age", IntegerType(), True),
    ("department", StringType(), True),
    ("job_role", StringType(), True),
    ("education_level", StringType(), True),    # joined from silver_education_level
    ("salary", IntegerType(), True),
    ("attrition", StringType(), True),
    ("job_satisfaction", IntegerType(), True),  # taken from silver_performance_rating
    ("manager_rating", IntegerType(), True),    # taken from silver_performance_rating
    ("work_life_balance", IntegerType(), True), # taken from silver_performance_rating
]

schema_gold_employee = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in _gold_employee_columns]
)

# Write an empty DataFrame so the Delta table exists with the expected schema.
df_gold_employee = spark.createDataFrame([], schema_gold_employee)
(
    df_gold_employee.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("lakehouse_gold.gold_employee")
)

StatementMeta(, 66c675c7-3b26-486a-9bca-811e101f4fc3, 3, Finished, Available, Finished)

In [3]:
# Column specs for the gold performance-rating table: (name, Spark type, nullable).
_gold_performance_rating_columns = [
    ("performance_id", StringType(), False),     # primary key
    ("employee_id", StringType(), False),        # foreign key to employee_id
    # NOTE(review): review_date is declared as a string, not DateType -- confirm this is intended.
    ("review_date", StringType(), True),
    ("job_role", StringType(), True),            # joined from silver_employee
    ("salary", IntegerType(), True),             # joined from silver_employee
    ("job_satisfaction", IntegerType(), True),
    ("satisfaction_level", StringType(), True),  # joined from silver_satisfied_level
    ("self_rating", IntegerType(), True),
    ("rating_level", StringType(), True),        # joined from silver_rating_level
    ("manager_rating", IntegerType(), True),
    ("training_opportunities_within_year", IntegerType(), True),
    ("work_life_balance", IntegerType(), True),
]

schema_gold_performance_rating = StructType(
    [
        StructField(name, dtype, nullable)
        for name, dtype, nullable in _gold_performance_rating_columns
    ]
)

# Write an empty DataFrame so the Delta table exists with the expected schema.
df_gold_performance_rating = spark.createDataFrame([], schema_gold_performance_rating)
(
    df_gold_performance_rating.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("lakehouse_gold.gold_performance_rating")
)


StatementMeta(, 66c675c7-3b26-486a-9bca-811e101f4fc3, 5, Finished, Available, Finished)