In [0]:
from pyspark.sql.functions import col, sum as spark_sum

# Load the columns used below from the country-level pivoted FAO FBS table.
fao_fbs_pivoted_country = spark.sql("""
    SELECT 
        area_code,
        area,
        item_code,
        item,
        year_code,
        year,
        production,
        import_quantity,
        export_quantity,
        losses,
        food,
        feed,
        seed,
        stock_variation
    FROM workspace.postharvestloss.fao_fbs_pivoted_country
""")

# Filter 1: keep only grain items
# (wheat, maize, soybeans, rice, barley, sorghum, oats, rye).
# Codes follow the FAOSTAT Food Balance Sheets item list. Gotcha: 2556 is
# groundnuts (not barley), 2807 is rice (not sorghum) and 2518 is sorghum
# (not rye), so barley must be selected with 2513 and rye with 2515.
# NOTE(review): confirm the code/label pairs against the `item` column of the
# table; rice is listed under both codes because FAOSTAT appears to use 2805
# in the historic series and 2807 in the current one.
grain_item_codes = [
    2511,  # Wheat and products
    2514,  # Maize and products
    2555,  # Soyabeans
    2805,  # Rice (milled equivalent)
    2807,  # Rice and products
    2513,  # Barley and products
    2518,  # Sorghum and products
    2516,  # Oats
    2515,  # Rye and products
]
cond1 = col("item_code").isin(grain_item_codes)

# Filter 2: keep rows whose year_code is 2000 or later.
cond2 = col("year_code") >= 2000

# Both filters must hold.
condf = cond1 & cond2

fao_fbs_filtered_grains_country = fao_fbs_pivoted_country.filter(condf)

# Balance-sheet measures that are summed across the selected grain items.
grain_balance_columns = [
    "production",
    "import_quantity",
    "export_quantity",
    "losses",
    "food",
    "feed",
    "seed",
    "stock_variation",
]

# Collapse the item dimension: one output row per (area_code, area, year_code,
# year), with every measure summed and renamed to `grain_<measure>`.
fao_fbs_aggregated_grains_country = (
    fao_fbs_filtered_grains_country
    .groupBy("area_code", "area", "year_code", "year")
    .agg(*[
        spark_sum(measure).alias(f"grain_{measure}")
        for measure in grain_balance_columns
    ])
)

# Display the aggregated data
#fao_fbs_aggregated_grains_country.display(10)

In [0]:
from pyspark.sql.functions import col, sum as spark_sum

# Load the columns used below from the region-level pivoted FAO FBS table.
fao_fbs_pivoted_region = spark.sql("""
    SELECT 
        area_code,
        area,
        item_code,
        item,
        year_code,
        year,
        production,
        import_quantity,
        export_quantity,
        losses,
        food,
        feed,
        seed,
        stock_variation
    FROM workspace.postharvestloss.fao_fbs_pivoted_region
""")

# cond1: keep only grain items
# (wheat, maize, soybeans, rice, barley, sorghum, oats, rye).
# Codes follow the FAOSTAT Food Balance Sheets item list. Gotcha: 2556 is
# groundnuts (not barley), 2807 is rice (not sorghum) and 2518 is sorghum
# (not rye), so barley must be selected with 2513 and rye with 2515.
# NOTE(review): confirm the code/label pairs against the `item` column of the
# table; rice is listed under both codes because FAOSTAT appears to use 2805
# in the historic series and 2807 in the current one.
grain_item_codes = [
    2511,  # Wheat and products
    2514,  # Maize and products
    2555,  # Soyabeans
    2805,  # Rice (milled equivalent)
    2807,  # Rice and products
    2513,  # Barley and products
    2518,  # Sorghum and products
    2516,  # Oats
    2515,  # Rye and products
]
cond1 = col("item_code").isin(grain_item_codes)

# cond2: keep rows whose year_code is 2000 or later.
cond2 = col("year_code") >= 2000

# condf: both filters must hold.
condf = cond1 & cond2

fao_fbs_filtered_grains_region = fao_fbs_pivoted_region.filter(condf)

# Balance-sheet measures that are summed across the selected grain items.
grain_balance_columns = [
    "production",
    "import_quantity",
    "export_quantity",
    "losses",
    "food",
    "feed",
    "seed",
    "stock_variation",
]

# Collapse the item dimension: one output row per (area_code, area, year_code,
# year), with every measure summed and renamed to `grain_<measure>`.
fao_fbs_aggregated_grains_region = (
    fao_fbs_filtered_grains_region
    .groupBy("area_code", "area", "year_code", "year")
    .agg(*[
        spark_sum(measure).alias(f"grain_{measure}")
        for measure in grain_balance_columns
    ])
)

# Display the aggregated data in Databricks
#display(fao_fbs_aggregated_grains_region)

In [0]:
# Show each distinct (area_code, area) pair present in the regional grain aggregate.
(
    fao_fbs_aggregated_grains_region
    .select("area_code", "area")
    .distinct()
    .display()
)