## Read CH4 Emissions from NEI2020 Inventory

Yunha Lee (Aug 12, 2024)

* I used the **"SMOKE FLAT"** data downloaded from the following link:  
  [SMOKE FLAT Data (POINT_20230330.zip)](https://gaftp.epa.gov/air/nei/2020/doc/flat_files/SmokeFlatFile_POINT_20230330.zip)

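* The output shapefile (`NEI2020_CH4_Point_source.shp`) includes the following variables:
    - **FIPS**  (from `region_cd`)
    - **SCC**   (source category code)
    - **EIS_ID**  (facility information, from `facility_id`)
    - **latitude**  (facility location)
    - **longitude**  (facility location)
    - **CH4**  (annual CH4 emissions, from `ann_value`)
    - **height**  (stack height, converted from ft to m)
    - **diam**   (stack diameter, converted from ft to m)
    - **temp**   (stack temperature, converted from °F to K)
    - **velocity**   (stack velocity, converted from ft/s to m/s)
    - **coords**   (facility location as point geometry)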


In [17]:

import os
import warnings
from collections import defaultdict

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point

# Suppress all warnings in jupyter notebook
warnings.filterwarnings('ignore')

def safe_float_conversion(value, scale=1.0, offset=0.0, default=np.nan, round_digits=None):
    """Convert ``value`` to float, apply a linear transform, and optionally round.

    The result is ``float(value) * scale + offset``.

    Parameters
    ----------
    value : object
        Value to convert (number, numeric string, or a missing marker).
    scale : float, default 1.0
        Multiplier applied after the float conversion.
    offset : float, default 0.0
        Constant added after scaling.
    default : object, default ``np.nan``
        Returned when ``value`` is missing according to ``pd.isna`` or when the
        conversion raises ``ValueError`` or ``TypeError``.
    round_digits : int or None, default None
        Number of decimal places passed to ``round``; ``None`` skips rounding.

    Returns
    -------
    float or ``default``
    """
    try:
        if not pd.isna(value):
            converted = (float(value) * scale) + offset
            if round_digits is None:
                return converted
            return round(converted, round_digits)
    except (ValueError, TypeError):
        # Input that cannot be converted falls through to the default below.
        pass
    return default

In [31]:

# NOTE: these are absolute, machine-specific locations; edit both to run elsewhere.
file_path = '/Users/yunhalee/Documents/methanDart/Gaussian_Puff_CH4/NEI_CH4/SMOKE_flat/2020NEI_point_full_20230330.csv'
output_dir = '/Users/yunhalee/Documents/methanDart/Gaussian_Puff_CH4/NEI_CH4/final/'

# Read the flat file, treating '#' as the comment character.
# The region code is read as text so it is not parsed as an integer (which
# would lose any leading zeros). dtype keys are matched against the header as
# written in the file, i.e. before the names are lower-cased below, so both
# spellings are listed to cover either header case.
emis_df = pd.read_csv(
    file_path,
    comment='#',
    dtype={'REGION_CD': str, 'region_cd': str},
)

# Convert the DataFrame's column names to lowercase
emis_df.columns = emis_df.columns.str.lower()

emis_df.columns


Index(['country_cd', 'region_cd', 'tribal_code', 'facility_id', 'unit_id',
       'rel_point_id', 'process_id', 'agy_facility_id', 'agy_unit_id',
       'agy_rel_point_id', 'agy_process_id', 'scc', 'poll', 'ann_value',
       'ann_pct_red', 'facility_name', 'erptype', 'stkhgt', 'stkdiam',
       'stktemp', 'stkflow', 'stkvel', 'naics', 'longitude', 'latitude',
       'll_datum', 'horiz_coll_mthd', 'design_capacity',
       'design_capacity_units', 'reg_codes', 'fac_source_type',
       'unit_type_code', 'control_ids', 'control_measures', 'current_cost',
       'cumulative_cost', 'projection_factor', 'submitter_id', 'calc_method',
       'data_set_id', 'facil_category_code', 'oris_facility_code',
       'oris_boiler_id', 'ipm_yn', 'calc_year', 'date_updated', 'fug_height',
       'fug_width_xdim', 'fug_length_ydim', 'fug_angle', 'zipcode',
       'annual_avg_hours_per_year', 'jan_value', 'feb_value', 'mar_value',
       'apr_value', 'may_value', 'jun_value', 'jul_value', 'aug_value',
  

In [32]:
# Keep only the columns used downstream: identifiers, pollutant and annual
# value, stack parameters, and facility location.
emis_df = emis_df.loc[:, [
    'region_cd', 'scc', 'poll', 'ann_value',
    'stkhgt', 'stkdiam', 'stktemp', 'stkvel',
    'facility_id',
    'latitude', 'longitude',
]]

# Prints the (rows, columns) shape of the reduced frame.
print("emis_df length", emis_df.shape)
emis_df.columns


emis_df length (9554377, 11)


Index(['region_cd', 'scc', 'poll', 'ann_value', 'stkhgt', 'stkdiam', 'stktemp',
       'stkvel', 'facility_id', 'latitude', 'longitude'],
      dtype='object')

In [33]:
# Subset the emis_df for CH4 only.
# .copy() makes ch4_df an independent frame, so columns added to it later in
# the notebook are not written onto a slice of emis_df (the pattern behind
# pandas' SettingWithCopyWarning).
ch4_df = emis_df.loc[emis_df['poll'] == "CH4"].copy()
print("ch4_df length", ch4_df.shape)

ch4_df.head()

ch4_df length (58255, 11)


Unnamed: 0,region_cd,scc,poll,ann_value,stkhgt,stkdiam,stktemp,stkvel,facility_id,latitude,longitude
474,34013,28500201.0,CH4,0.183507,,,,,17778311,40.684639,-74.16325
491,34023,28500201.0,CH4,0.01963,,,,,17777511,40.528006,-74.36985
501,34039,28500201.0,CH4,0.037487,,,,,17777711,40.661661,-74.192378
506,34017,28500201.0,CH4,0.023984,,,,,17777811,40.745156,-74.102222
518,34013,28500201.0,CH4,0.481307,,,,,17780211,40.695328,-74.156914


In [34]:
# Unit-conversion constants for the stack parameters.
FT_TO_M = 0.3048                            # feet -> metres (also ft/s -> m/s)
F_TO_K_SCALE = 5.0 / 9.0                    # K = F * 5/9 + (273.15 - 32 * 5/9)
F_TO_K_OFFSET = 273.15 - (32 * 5.0 / 9.0)
ROUND_DIGITS = 5                            # decimal places kept after conversion

# Build the output table as one chain that returns a new frame instead of
# assigning column-by-column onto a filtered slice.
# NOTE: this cell consumes the raw columns and rebinds ch4_df, so re-running it
# requires re-running the CH4 subset cell first.
ch4_df = (
    ch4_df
    .assign(
        CH4=lambda d: d['ann_value'],
        height=lambda d: d['stkhgt'].apply(
            safe_float_conversion, scale=FT_TO_M, round_digits=ROUND_DIGITS),
        diam=lambda d: d['stkdiam'].apply(
            safe_float_conversion, scale=FT_TO_M, round_digits=ROUND_DIGITS),
        temp=lambda d: d['stktemp'].apply(
            safe_float_conversion, scale=F_TO_K_SCALE, offset=F_TO_K_OFFSET,
            round_digits=ROUND_DIGITS),
        velocity=lambda d: d['stkvel'].apply(
            safe_float_conversion, scale=FT_TO_M, round_digits=ROUND_DIGITS),
        # Point takes (x, y) = (longitude, latitude). Zipping the two columns
        # avoids building a Series per row and also works for an empty frame.
        coords=lambda d: [Point(lon, lat)
                          for lon, lat in zip(d['longitude'], d['latitude'])],
    )
    # Remove the pollutant label and the raw (unconverted) emission and stack columns
    .drop(columns=['poll', 'ann_value', 'stkhgt', 'stkdiam', 'stktemp', 'stkvel'])
    .rename(columns={'region_cd': 'FIPS', 'facility_id': 'EIS_ID', 'scc': 'SCC'})
)

ch4_df.head()

Unnamed: 0,FIPS,SCC,EIS_ID,latitude,longitude,CH4,height,diam,temp,velocity,coords
474,34013,28500201.0,17778311,40.684639,-74.16325,0.183507,,,,,POINT (-74.16325 40.684639)
491,34023,28500201.0,17777511,40.528006,-74.36985,0.01963,,,,,POINT (-74.36985 40.528006)
501,34039,28500201.0,17777711,40.661661,-74.192378,0.037487,,,,,POINT (-74.192378 40.661661)
506,34017,28500201.0,17777811,40.745156,-74.102222,0.023984,,,,,POINT (-74.102222 40.745156)
518,34013,28500201.0,17780211,40.695328,-74.156914,0.481307,,,,,POINT (-74.156914 40.695328)


In [36]:
# Store SCC as an integer. The width is given explicitly because plain `int`
# can map to a 32-bit type on some platforms/numpy versions.
# astype raises if SCC contains missing values.
ch4_df = ch4_df.assign(SCC=ch4_df['SCC'].astype('int64'))

# EPSG:4269 is NAD83 geographic coordinates.
# NOTE(review): this assumes every record's coordinates are NAD83; the input's
# ll_datum column is not checked - confirm.
final_gdf = gpd.GeoDataFrame(ch4_df, geometry='coords', crs='epsg:4269')

# Make sure the destination folder exists before writing the shapefile.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'NEI2020_CH4_Point_source.shp')
final_gdf.to_file(output_file)
print(f"{file_path} is saved as shapefile here: {output_file}")

/Users/yunhalee/Documents/methanDart/NEI_CH4/SMOKE_flat/2020NEI_point_full_20230330.csv is saved as shapefile here: /Users/yunhalee/Documents/methanDart/NEI_CH4/final/NEI2020_CH4_Point_source.shp
