In [1]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, DoubleType

# 1️⃣ читаем Bronze
df_raw = spark.read.json("Files/bronze/worldbank/gdp_usa")

# 2️⃣ описываем схему одного элемента массива
gdp_schema = ArrayType(
    StructType([
        StructField("date", StringType(), True),
        StructField("value", DoubleType(), True),
        StructField(
            "country",
            StructType([
                StructField("value", StringType(), True)
            ]),
            True
        )
    ])
)

# 3️⃣ парсим STRING → ARRAY
df_parsed = df_raw.withColumn(
    "raw_parsed",
    F.from_json(F.col("raw"), gdp_schema)
)

# 4️⃣ explode + normalisation
df_silver_gdp = (
    df_parsed
    .select(F.explode("raw_parsed").alias("row"))
    .select(
        F.col("row.date").cast("int").alias("year"),
        F.col("row.value").cast("double").alias("gdp_usd"),
        F.col("row.country.value").alias("country")
    )
    .where("gdp_usd is not null")
)

# 5️⃣ ОДИН лёгкий action
df_silver_gdp.limit(5).show()





StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 3, Finished, Available, Finished)

+----+-------+-------+
|year|gdp_usd|country|
+----+-------+-------+
+----+-------+-------+



In [2]:
df_raw = spark.read.json("Files/bronze/worldbank/gdp_usa")

df_raw.printSchema()



StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 4, Finished, Available, Finished)

root
 |-- raw: string (nullable = true)



In [3]:
from pyspark.sql import functions as F

df_data = df_raw.select(
    F.get_json_object("raw", "$[1]").alias("data_json")
)


StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 5, Finished, Available, Finished)

In [4]:
from pyspark.sql.types import *

data_schema = ArrayType(
    StructType([
        StructField("date", StringType(), True),
        StructField("value", DoubleType(), True),
        StructField(
            "country",
            StructType([
                StructField("id", StringType(), True),
                StructField("value", StringType(), True)
            ]),
            True
        )
    ])
)


StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 6, Finished, Available, Finished)

In [5]:
df_silver_gdp = (
    df_data
    .select(F.from_json("data_json", data_schema).alias("rows"))
    .select(F.explode("rows").alias("r"))
    .select(
        F.col("r.date").cast("int").alias("year"),
        F.col("r.value").alias("gdp_usd"),
        F.col("r.country.value").alias("country")
    )
    .where("gdp_usd is not null")
)


StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 7, Finished, Available, Finished)

In [6]:
df_silver_gdp.limit(5).show()


StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 8, Finished, Available, Finished)

+----+-------------------+-------------+
|year|            gdp_usd|      country|
+----+-------------------+-------------+
|2024|2.87509561307312E13|United States|
|2023|2.72921707932144E13|United States|
|2022| 2.5604848907611E13|United States|
|2021|     2.331508056E13|United States|
|2020|    2.1060473613E13|United States|
+----+-------------------+-------------+



In [8]:
df_dim_gdp = (
    df_silver_gdp
    .select(
        F.col("year"),
        F.col("gdp_usd"),
        F.col("country")
    )
)

df_dim_gdp.show(5)


StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 10, Finished, Available, Finished)

+----+-------------------+-------------+
|year|            gdp_usd|      country|
+----+-------------------+-------------+
|2024|2.87509561307312E13|United States|
|2023|2.72921707932144E13|United States|
|2022| 2.5604848907611E13|United States|
|2021|     2.331508056E13|United States|
|2020|    2.1060473613E13|United States|
+----+-------------------+-------------+
only showing top 5 rows



In [9]:
(
    df_silver_gdp
    .write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("dbo.dim_gdp")
)


StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 11, Finished, Available, Finished)

In [10]:
spark.sql("""
SELECT * FROM dbo.dim_gdp ORDER BY country, year;
""").show()

StatementMeta(, 4f554f95-3584-4e96-8ddb-ec71f9a3d854, 12, Finished, Available, Finished)

+----+-------------------+-------------+
|year|            gdp_usd|      country|
+----+-------------------+-------------+
|1960|5.41988586206897E11|United States|
|1961|5.61940310344828E11|United States|
|1962|6.03639413793103E11|United States|
|1963|6.37058551724138E11|United States|
|1964|6.84144620689655E11|United States|
|1965|7.41904862068965E11|United States|
|1966| 8.1303275862069E11|United States|
|1967|8.59620034482759E11|United States|
|1968|         9.40225E11|United States|
|1969|1.01743817241379E12|United States|
|1970|        1.073303E12|United States|
|1971|         1.16485E12|United States|
|1972|         1.27911E12|United States|
|1973|        1.425376E12|United States|
|1974|        1.545243E12|United States|
|1975|        1.684904E12|United States|
|1976|        1.873412E12|United States|
|1977|        2.081826E12|United States|
|1978|        2.351599E12|United States|
|1979|        2.627333E12|United States|
+----+-------------------+-------------+
only showing top