In [1]:
import pandas as pd
import numpy as np

# Silver-layer parquet that this cell aggregates into the daily gold table.
silver_path = "Files/silver/openaq/nyc_2025/silver_air_with_taxi"

try:
    # Read with Spark, then collect to the driver as a pandas frame so that
    # the pandas groupby/resample API can be used for the daily roll-up.
    df_spark = spark.read.parquet(silver_path)
    df_silver = df_spark.toPandas()
    print("Файл успешно прочитан!")

    # resample() requires a DatetimeIndex, so make sure the column is datetime64.
    df_silver['datetime'] = pd.to_datetime(df_silver['datetime'])

    # Missing zone ids are replaced with the sentinel 0 so the column can be a
    # plain int; pandas groupby drops rows whose key is NaN by default, so
    # without this fill those measurements would silently disappear.
    df_silver['taxi_zone_id'] = df_silver['taxi_zone_id'].fillna(0).astype(int)

    # Daily mean of `value` per (location, taxi_zone_id, parameter).
    # NOTE: resample("D") emits one row for every calendar day between a
    # group's first and last timestamp; days without any reading get a NaN mean.
    # NOTE(review): rows with a null `location` or `parameter` are still dropped
    # by groupby's default dropna=True — confirm whether that is intended.
    df_gold = (
        df_silver
        .set_index("datetime")
        .groupby(["location", "taxi_zone_id", "parameter"])
        .resample("D")["value"]
        .mean()
        .reset_index()
    )

    # Persist the aggregate as a Delta table, replacing data and schema.
    spark_gold = spark.createDataFrame(df_gold)
    (
        spark_gold.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("gold_air_taxi_daily")
    )

    print(f"Агрегация завершена. Теперь строк должно быть 179. Итого: {len(df_gold)}")
    display(df_gold.head())

except Exception as e:
    # Keep the readable message, but re-raise: swallowing the error would let
    # the cell finish "successfully" and downstream cells would then read a
    # stale or missing gold_air_taxi_daily table.
    print(f"Ошибка: {e}")
    raise

StatementMeta(, df08abda-df0b-4120-a8f8-d30d744318d7, 3, Finished, Available, Finished)

Файл успешно прочитан!
Агрегация завершена. Теперь строк должно быть 179. Итого: 178


SynapseWidget(Synapse.DataFrame, 319e7341-2fb5-4387-8750-f26d711c9615)

In [2]:
from pyspark.sql.functions import col, coalesce, lit, to_date, mean
from pyspark.sql.types import IntegerType

# Inputs: the daily per-parameter means and the zone dimension table.
df_gold = spark.read.table("gold_air_taxi_daily")
df_dimzone = spark.read.table("dim_zone")

# Derive a calendar `date` from the daily timestamp and cast the zone id to
# an integer before it is compared with dim_zone.zone_id in the join.
df_gold_prepped = (
    df_gold
    .withColumn("date", to_date(col("datetime")))
    .withColumn("taxi_zone_id", col("taxi_zone_id").cast(IntegerType()))
)

# Only these three dimension columns are used below. Projecting them first
# prevents any other dim_zone column that shares a name with a gold column
# from making the col(...) references in the groupBy ambiguous.
df_zone_lookup = df_dimzone.select("zone_id", "borough", "zone_name")

# Left join keeps every air-quality row, including those without a zone match.
df_joined = df_gold_prepped.join(
    df_zone_lookup,
    df_gold_prepped.taxi_zone_id == df_zone_lookup.zone_id,
    "left",
)

# One row per day / location / zone, with one column per distinct `parameter`
# value holding the mean of `value`. Unmatched zones get placeholder labels.
final_pivoted = (
    df_joined
    .groupBy(
        col("date"),
        col("datetime"),
        col("location"),
        col("taxi_zone_id"),
        coalesce(col("borough"), lit("Unknown")).alias("Borough"),
        coalesce(col("zone_name"), lit("Private/Other")).alias("Taxi_Zone"),
    )
    .pivot("parameter")
    .agg(mean("value"))
)

# Persist as a Delta table, replacing both data and schema.
(
    final_pivoted.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gold_air_quality_final_pivoted")
)

print("Таблица создана! Колонки параметров развернуты, типы данных исправлены.")

# Show what was actually stored: reading the saved table back avoids
# re-executing the lazy join + aggregation a second time just for display.
saved_pivoted = spark.read.table("gold_air_quality_final_pivoted")
display(saved_pivoted.orderBy("date", "Borough"))

StatementMeta(, df08abda-df0b-4120-a8f8-d30d744318d7, 4, Finished, Available, Finished)

Таблица создана! Колонки параметров развернуты, типы данных исправлены.


SynapseWidget(Synapse.DataFrame, 001c2b8c-a2f1-43d0-913e-dff9cf55a3fd)

In [4]:
# Write the pivoted frame (`final_pivoted`, built in the pivot cell) a second
# time under the table name gold_air_quality_final, replacing data and schema.
(
    final_pivoted.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gold_air_quality_final")
)

StatementMeta(, df08abda-df0b-4120-a8f8-d30d744318d7, 6, Finished, Available, Finished)