In [0]:
%python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the SparkSession. Databricks already provides one, so getOrCreate()
# hands back the existing session instead of building a new one.
spark = SparkSession.builder.getOrCreate()

# Load the FAO Food Balance Sheets dataset.
# The variable keeps its original spelling ("privoted") so any other cell that
# refers to it by this name keeps working.
fao_fbs_privoted_country = spark.table("workspace.postharvestloss.fao_fbs_pivoted_country")

# Item codes that this notebook treats as grains.
# NOTE(review): several of the labels below look inconsistent with the FAOSTAT
# Food Balance Sheets item list (there, Barley is usually 2513, Rye 2515,
# Oats 2516, Millet 2517, Sorghum 2518) — confirm the codes against FAOSTAT
# before relying on this filter. The values are left exactly as written.
grain_item_codes = [
    2511,  # Wheat
    2514,  # Maize
    2555,  # Soybeans
    2805,  # Rice
    2512,  # Barley
    2515,  # Oats
    2513,  # Rye
    2807,  # Millet
    2806,  # Sorghum
    2516,  # Triticale
    2808,  # Buckwheat
    2955,  # Quinoa
    2809,  # Fonio
    2810   # Teff
]

# Row predicates: keep rows whose item_code is in the grain list and whose
# year_code is 2000 or later.
cond1 = col("item_code").isin(grain_item_codes)
cond2 = col("year_code") >= 2000

# A row must satisfy both predicates.
combined_filter = cond1 & cond2

# Apply the filter (lazy; nothing is computed until an action runs).
fao_fbs_filtered_grains = fao_fbs_privoted_country.filter(combined_filter)

# Preview 10 rows. The row cap has to be applied with limit(); passing 10 to
# DataFrame.display() does not restrict the number of rows shown.
display(fao_fbs_filtered_grains.limit(10))

In [0]:
%python
# Run another notebook from this one: dbutils.notebook.run(path, timeout_seconds).
# NOTE(review): the path ends in "/" and looks like a repo folder rather than a
# notebook — confirm which notebook is meant to be run here.
# NOTE(review): the timeout is 0, which Databricks documents as "no timeout" —
# confirm that an unbounded run is intended.
dbutils.notebook.run('/Repos/agribusiness_projects/', 0)

In [0]:
%python
import sys
from pathlib import Path

# Make the project checkout importable so that `src.*` resolves.
# The path uses the /Workspace/Repos root (the filesystem location of Repos,
# and the same root used elsewhere in this notebook) instead of the bare
# /Repos/... workspace path. The membership check keeps re-runs of this cell
# from appending the same entry to sys.path again.
project_dir = Path("/Workspace/Repos/agribusiness_projects/PostHarvestLoss/")
if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))

# Project-local helper; importable only once project_dir is on sys.path.
from src.data.filter_fao_data import filter_fao_data

# Load FAO Food Balance Sheets dataset
fao_fbs_raw = spark.table("workspace.postharvestloss.fao_fbs_pivoted_country")

# Grain item codes (Wheat, Maize, Soybeans, Rice).
# NOTE: this rebinds `grain_item_codes`, replacing the longer list assigned in
# the first cell of the notebook for every cell that runs after this one.
grain_item_codes = [2511, 2514, 2555, 2805]

# Filter to the selected grains.
# - area_codes: every integer from 1 to 4999; presumably these are the codes of
#   individual countries (as opposed to aggregates) — TODO confirm.
# - year_range: (2000, None); presumably an open-ended range starting at 2000 —
#   verify against filter_fao_data.
fao_fbs_grains_2000_onwards = filter_fao_data(
    data=fao_fbs_raw,
    item_codes=grain_item_codes,
    area_codes=list(range(1, 5000)),
    year_range=(2000, None),
)

# Preview the first 10 rows of the result.
display(fao_fbs_grains_2000_onwards.limit(10))

In [0]:
%python
import sys
from pathlib import Path

# Add the project directory to the Python path.
# The membership check makes the cell idempotent: re-running it does not pile
# up duplicate sys.path entries.
project_dir = Path("/Workspace/Repos/agribusiness_projects/PostHarvestLoss/")
if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))



In [0]:
%python
# List the contents of the Repos root.
# Workspace files (Repos included) are on the driver's local filesystem, not in
# DBFS, so dbutils.fs needs the file:/ scheme here; dbfs:/Workspace/Repos points
# at a DBFS location instead.
display(dbutils.fs.ls("file:/Workspace/Repos"))