# Gold: Création du modèle dimensionnel

### **Dimensions :**
1. **dim_date** : Informations calendaires
2. **dim_time** : Informations horaires
3. **dim_turbine** : Caractéristiques des turbines
4. **dim_operational_status** : Statuts opérationnels

### **Fait :**
- **fact_wind_power** : Mesures de production

In [1]:
import pyspark.sql.functions as sf

# Location of the silver-layer wind_power table in OneLake.
silver_table_path = "abfss://79daad76-9b51-4f79-b1a3-5049702b5055@onelake.dfs.fabric.microsoft.com/f584c5bb-e8c1-42fe-ae55-db6de3ba417b/Tables/dbo/wind_power"

# No explicit format is passed, so the reader falls back to its default source.
df_silver = spark.read.load(silver_table_path)

# Quick sanity check: column names/types, then five rows in random order.
print(df_silver.dtypes)
silver_preview = df_silver.orderBy(sf.rand()).limit(5)
display(silver_preview)

StatementMeta(, 63b38421-2c44-416a-8d11-92d938ba1e83, 3, Finished, Available, Finished, False)

[('production_id', 'bigint'), ('date', 'timestamp'), ('time', 'string'), ('turbine_name', 'string'), ('capacity', 'bigint'), ('location_name', 'string'), ('latitude', 'double'), ('longitude', 'double'), ('region', 'string'), ('status', 'string'), ('responsible_department', 'string'), ('wind_speed', 'double'), ('wind_direction', 'string'), ('energy_produced', 'double'), ('day', 'int'), ('month', 'int'), ('quarter', 'int'), ('year', 'int'), ('hour_of_day', 'string'), ('minute_of_hour', 'string'), ('second_of_minute', 'string'), ('time_period', 'string')]


SynapseWidget(Synapse.DataFrame, 22bde030-e2de-46e2-87fd-0309be44e569)

In [2]:
# dim_time: one row per distinct combination of the time value and its
# hour / minute / second / time_period columns from the silver table.
time_columns = [
    "time",
    "hour_of_day",
    "minute_of_hour",
    "second_of_minute",
    "time_period",
]
distinct_times = df_silver.select(*time_columns).distinct()

# The silver "time" column is exposed as the dimension key "time_id".
dim_time_df = distinct_times.withColumnRenamed("time", "time_id")

# Preview five rows in random order.
time_preview = dim_time_df.orderBy(sf.rand()).limit(5)
display(time_preview)

StatementMeta(, 63b38421-2c44-416a-8d11-92d938ba1e83, 4, Finished, Available, Finished, False)

SynapseWidget(Synapse.DataFrame, b821158a-c5a8-49bd-8d7f-75d9f2aa61c3)

In [3]:
# dim_date: one row per distinct combination of the date value and its
# day / month / quarter / year columns from the silver table.
date_columns = ["date", "day", "month", "quarter", "year"]
distinct_dates = df_silver.select(*date_columns).distinct()

# The silver "date" column is exposed as the dimension key "date_id".
dim_date_df = distinct_dates.withColumnRenamed("date", "date_id")

# Preview five rows in random order.
date_preview = dim_date_df.orderBy(sf.rand()).limit(5)
display(date_preview)

StatementMeta(, 63b38421-2c44-416a-8d11-92d938ba1e83, 5, Finished, Available, Finished, False)

SynapseWidget(Synapse.DataFrame, 5835c9e2-4e90-4ca0-8007-339a753aaa16)

In [4]:
from pyspark.sql.window import Window 

# dim_turbine: one row per distinct turbine description found in the silver
# table, with a generated integer surrogate key "turbine_id".

# Columns that define the grain of the dimension (rows are distinct on these).
turbine_columns = [
    "turbine_name",
    "capacity",
    "location_name",
    "latitude",
    "longitude",
    "region",
]

dim_turbine_df = df_silver.select(*turbine_columns).distinct()

# Number the rows using EVERY grain column as the sort key. Ordering on only a
# subset would leave ties between rows that differ in the remaining columns,
# and row_number() breaks ties arbitrarily. Because this DataFrame is lazily
# re-evaluated (preview, fact join, final write), an arbitrary tie-break could
# hand out different ids on each evaluation. The first three sort keys are
# unchanged, so ids are identical to before whenever those already identify a
# row uniquely.
# NOTE: a window without partitionBy pulls all rows into a single partition;
# acceptable here on the assumption that the dimension is small.
turbine_id_window = Window.orderBy(*turbine_columns)
dim_turbine_df = dim_turbine_df.withColumn(
    "turbine_id",
    sf.row_number().over(turbine_id_window),
)

display(dim_turbine_df.limit(5))

StatementMeta(, 63b38421-2c44-416a-8d11-92d938ba1e83, 6, Finished, Available, Finished, False)

SynapseWidget(Synapse.DataFrame, bc87ab6a-2dff-49be-bc50-f23dc3f147db)

In [5]:
# dim_operational_status: one row per distinct (status, responsible_department)
# pair, numbered with a generated integer surrogate key "status_id".
status_columns = ["status", "responsible_department"]

distinct_statuses = df_silver.select(*status_columns).distinct()

# Ids follow the sort order of the two columns the rows are distinct on.
status_id_window = Window.orderBy(*status_columns)
dim_operational_status_df = distinct_statuses.withColumn(
    "status_id",
    sf.row_number().over(status_id_window),
)

display(dim_operational_status_df.limit(5))


StatementMeta(, 63b38421-2c44-416a-8d11-92d938ba1e83, 7, Finished, Available, Finished, False)

SynapseWidget(Synapse.DataFrame, 0b31da5c-2ba1-4fd7-886b-8d5ec1267b65)

In [6]:
# fact_wind_power: one row per silver measurement, carrying the surrogate keys
# of the status and turbine dimensions and the date/time values as
# date_id / time_id.

# Join each dimension on every column it was de-duplicated on. Joining the
# turbine dimension on fewer columns than its grain would match a measurement
# to several dimension rows and duplicate it whenever two turbine rows share
# the narrower key.
status_keys = ["status", "responsible_department"]
turbine_keys = [
    "turbine_name",
    "capacity",
    "location_name",
    "latitude",
    "longitude",
    "region",
]

# Aliases let both sides of each join be referenced unambiguously, since the
# dimensions are derived from df_silver and share its column names.
silver = df_silver.alias("silver")
status_dim = dim_operational_status_df.alias("status_dim")
turbine_dim = dim_turbine_df.alias("turbine_dim")

# eqNullSafe treats NULL as equal to NULL. With a plain equality join a
# measurement whose key column is NULL would never find its dimension row and
# would end up with a NULL surrogate key even though the dimension has the row.
status_match = [
    sf.col(f"silver.{key}").eqNullSafe(sf.col(f"status_dim.{key}"))
    for key in status_keys
]
turbine_match = [
    sf.col(f"silver.{key}").eqNullSafe(sf.col(f"turbine_dim.{key}"))
    for key in turbine_keys
]

fact_wind_power_production_df = (
    silver
    .join(status_dim, on=status_match, how="left")
    .join(turbine_dim, on=turbine_match, how="left")
    .select(
        sf.col("silver.production_id"),
        sf.col("status_dim.status_id"),
        sf.col("turbine_dim.turbine_id"),
        # date/time are renamed here; no further rename step is needed.
        sf.col("silver.date").alias("date_id"),
        sf.col("silver.time").alias("time_id"),
        sf.col("silver.wind_speed"),
        sf.col("silver.wind_direction"),
        sf.col("silver.energy_produced"),
    )
)

display(fact_wind_power_production_df.limit(5))

StatementMeta(, 63b38421-2c44-416a-8d11-92d938ba1e83, 8, Finished, Available, Finished, False)

SynapseWidget(Synapse.DataFrame, 26d4dd34-8b7f-4453-ad7e-036e7ccc1594)

In [7]:
# Root folder of the gold-layer tables in OneLake; each table is saved in a
# sub-folder named after its key in `tables`.
gold_base_path = "abfss://79daad76-9b51-4f79-b1a3-5049702b5055@onelake.dfs.fabric.microsoft.com/a4e2a5aa-4da7-4842-b8e2-79d33bfd5590/Tables/dbo/"

# Target table name -> DataFrame to persist.
tables = dict(
    dim_date=dim_date_df,
    dim_time=dim_time_df,
    dim_turbine=dim_turbine_df,
    dim_operational_status=dim_operational_status_df,
    fact_wind_power=fact_wind_power_production_df,
)

# Write every table in Delta format, replacing any existing data at the target.
for table in tables:
    df = tables[table]
    target_path = gold_base_path + table
    delta_writer = df.write.format("delta").mode("overwrite")
    delta_writer.save(target_path)


StatementMeta(, 63b38421-2c44-416a-8d11-92d938ba1e83, 9, Finished, Available, Finished, False)