# Greenhouse Gas Emission Analysis - Interactive Notebook

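This notebook uses PySpark to interactively explore greenhouse gas emission data: it loads the CO2e and per-gas (by-GHG) datasets, runs a data quality check, and then walks through the analyses in the sections below.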

## 1. Setup and Imports

In [None]:
import sys
sys.path.insert(0, '..')

from src.data_loader import create_spark_session, load_csv_data, preprocess_co2e_data, preprocess_by_ghg_data
from src.analysis import (
    analyze_top_emitters, 
    analyze_by_sector, 
    compare_margin_impact,
    analyze_ghg_types,
    find_high_emission_industries,
    analyze_emission_distribution
)
from src.utils import show_data_quality_report
from config.config import data_path, app_name

## 2. Create Spark Session

In [None]:
# Build the shared Spark session; the application name comes from
# config.config (app_name). Later cells use `spark` to load data, and the
# final cell stops it.
spark = create_spark_session(app_name)

## 3. Load and Preprocess Data

In [None]:
# Load CO2e data: read the CSV at data_path['co2e'], then apply the
# CO2e-specific preprocessing. The unprocessed frame stays available as
# co2e_df_raw.
co2e_df_raw = load_csv_data(spark, data_path['co2e'])
co2e_df = preprocess_co2e_data(co2e_df_raw)

# Load individual GHG data: same two steps for the CSV at
# data_path['by_ghg'], using the by-GHG preprocessing function.
ghg_df_raw = load_csv_data(spark, data_path['by_ghg'])
ghg_df = preprocess_by_ghg_data(ghg_df_raw)

## 4. Data Quality Check

In [None]:
# Run the data quality report on the preprocessed CO2e frame only;
# ghg_df is not checked in this cell.
show_data_quality_report(co2e_df)

## 5. View Sample Data

In [None]:
# Print the first 10 rows; truncate=False keeps long string values intact
# instead of cutting them off in the output.
co2e_df.show(10, truncate=False)

## 6. Top Emitters Analysis

In [None]:
# Request the top emitters and display them in full; a single value drives
# both the size requested from the analysis and the number of rows shown.
top_n = 20
top_emitters = analyze_top_emitters(co2e_df, n=top_n)
top_emitters.show(top_n, truncate=False)

## 7. Sector-Level Analysis

In [None]:
# Run the sector-level analysis on the CO2e frame and print up to 20 rows
# of the result without truncating column values.
sector_analysis = analyze_by_sector(co2e_df)
sector_analysis.show(20, truncate=False)

## 8. Margin Impact Analysis

In [None]:
# Run the margin-impact comparison on the CO2e frame and print up to 15
# rows of the result without truncating column values.
margin_impact = compare_margin_impact(co2e_df)
margin_impact.show(15, truncate=False)

## 9. High Emission Industries

In [None]:
# Find high-emission industries using a threshold of 1.0 (adjust the
# threshold argument to widen or narrow the result), then print up to 20
# rows without truncating column values.
high_emitters = find_high_emission_industries(co2e_df, threshold=1.0)
high_emitters.show(20, truncate=False)

## 10. Emission Distribution

In [None]:
# Run the emission-distribution analysis on the CO2e frame. show() is
# called without a row count, so PySpark's default row limit applies.
distribution = analyze_emission_distribution(co2e_df)
distribution.show(truncate=False)

## 11. GHG Type Analysis

In [None]:
# This is the only analysis that uses the by-GHG frame (ghg_df) rather
# than co2e_df; print up to 25 rows of the result without truncation.
ghg_analysis = analyze_ghg_types(ghg_df)
ghg_analysis.show(25, truncate=False)

## 12. Custom Analysis

You can perform custom queries using PySpark SQL functions:

In [None]:
# Example: keep only industries whose with-margins emission factor is
# above 2.0, rename the columns for display, and sort highest first.
from pyspark.sql.functions import col

# One Column reference reused for both the filter and the projection.
emission_factor = col("Supply_Chain_Emission_Factors_with_Margins")

above_threshold = co2e_df.filter(emission_factor > 2.0)
renamed = above_threshold.select(
    "NAICS_Code",
    col("2017_NAICS_Title").alias("Industry"),
    emission_factor.alias("Emissions"),
)
high_emissions = renamed.orderBy(col("Emissions").desc())

high_emissions.show(truncate=False)

## 13. Stop Spark Session

In [None]:
# Stop the session created in section 2. After this, re-run the
# "Create Spark Session" cell before executing any other cell again.
spark.stop()