In [61]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [4]:
# Load the AmeriFlux public site table; the file is tab-separated.
df_sites = pd.read_csv(
    "/work/tadesse/beichen/ForDRI/Data/All_AmeriFlex_Data/AmeriFlux_Public_Site_Info.tsv",
    sep="\t",
)

In [34]:
# Show every column header so later cells can reference columns by their exact names.
print(df_sites.columns.to_numpy())

['Site ID' 'Name' 'Principal Investigator' 'Data Use Policy'
 'AmeriFlux BASE Data' 'AmeriFlux FLUXNET Data'
 'Vegetation Abbreviation (IGBP)' 'Vegetation Description (IGBP)'
 'Climate Class Abbreviation (Koeppen)'
 'Climate Class Description (Koeppen)' 'Mean Average Precipitation (mm)'
 'Mean Average Temperature (degrees C)' 'Country' 'Latitude (degrees)'
 'Longitude (degrees)' 'Elevation (m)' 'Years of AmeriFlux BASE Data'
 'AmeriFlux BASE DOI' 'Years of AmeriFlux FLUXNET Data'
 'AmeriFlux FLUXNET DOI' 'Site Start' 'Site End'
 'BASE variables available' 'FLUXNET variables available' 'State']


In [95]:
# Distinct IGBP vegetation codes that occur in the site table.
df_sites["Vegetation Abbreviation (IGBP)"].unique()

array(['CRO', 'GRA', 'WET', 'DNF', 'WSA', 'EBF', 'MF', 'ENF', 'DBF',
       'OSH', 'WAT', 'CSH', 'URB', 'BSV', 'CVM', 'SAV', 'SNO'],
      dtype=object)

In [148]:
# Vegetation classes we might want to keep (long name, IGBP code):
#   Deciduous Needleleaf Forests (DNF)
#   Woody Savannas (WSA)
#   Evergreen Broadleaf Forests (EBF)
#   Mixed Forests (MF)
#   Evergreen Needleleaf Forests (ENF)
#   Deciduous Broadleaf Forests (DBF)
#   Open Shrublands (OSH)
#   Closed Shrublands (CSH)
# Display the distinct long-form IGBP descriptions for reference.
df_sites["Vegetation Description (IGBP)"].unique()

array(['Croplands: Lands covered with temporary crops followed by harvest and a bare soil period (e.g., single and multiple cropping systems). Note that perennial woody crops will be classified as the appropriate forest or shrub land cover type.',
       'Grasslands: Lands with herbaceous types of cover. Tree and shrub cover is less than 10%. Permanent wetlands lands with a permanent mixture of water and herbaceous or woody vegetation. The vegetation can be present in either salt, brackish, or fresh water.',
       'Permanent Wetlands: Lands with a permanent mixture of water and herbaceous or woody vegetation that cover extensive areas. The vegetation can be present in either salt, brackish, or fresh water',
       'Deciduous Needleleaf Forests: Lands dominated by woody vegetation with a percent cover >60% and height exceeding 2 meters.  Consists of seasonal needleleaf tree communities with an annual cycle of leaf-on and leaf-off periods.',
       'Woody Savannas: Lands with herbaceous

In [30]:
# Read the US state polygons into a GeoDataFrame.
# NOTE(review): the file name suggests Census TIGER/Line 2020 state boundaries — confirm the
# source and that its coordinates are longitude/latitude degrees, since later cells test
# site lon/lat points against these geometries without reprojecting.
shp = gpd.read_file("/work/tadesse/beichen/ForDRI/Data/Shapefiles/tl_2020_us_state/tl_2020_us_state.shp")

In [31]:
# Array of state names, in the same row order as the polygons in `shp`.
state_name = shp["NAME"].values

In [32]:
# Tag every site with the name of the state polygon that contains it.
# Sites matched by no polygon (or by more than one) keep the "NotInUSA" placeholder.
df_sites["State"] = "NotInUSA"
for idx, r in df_sites.iterrows():
    # shapely points are (x, y), i.e. (longitude, latitude).
    p = Point(r.loc["Longitude (degrees)"], r.loc["Latitude (degrees)"])
    # Boolean mask over the polygons, aligned with `state_name`.
    # GeoSeries.contains(point) is the documented equivalent of point.within(polygon).
    state_arr = shp.geometry.contains(p).to_numpy()
    # Accept only an unambiguous hit: exactly one polygon contains the point.
    if state_arr.sum() == 1:
        # Take element [0] so a plain string is stored, not a one-element array.
        df_sites.loc[idx, "State"] = state_name[state_arr][0]

In [149]:
# Per-state filter: for every US state keep the single site that
#   * offers AmeriFlux BASE data under the CC-BY-4.0 data-use policy,
#   * has a forest / woody-savanna / shrubland IGBP class, and
#   * has the most years of BASE data after 2003 (at least one such year is required).
# States without a qualifying site keep an all-NaN row in `df_sites_filtered`.
forest_igbp = ["DNF", "WSA", "EBF", "MF", "ENF", "DBF", "OSH", "CSH"]
min_year = 2003  # only years strictly greater than this are counted


def _count_recent_years(years_field):
    """Count the years in a comma-separated year string that are later than ``min_year``.

    A non-string value (e.g. NaN for a site with no listed years) counts as zero
    instead of raising.
    """
    if not isinstance(years_field, str):
        return 0
    return sum(int(y) > min_year for y in years_field.split(",") if y.strip())


df_sites_filtered = pd.DataFrame(columns=df_sites.columns, index=state_name)

# State-independent part of the selection, computed once: BASE data available and open licence.
has_open_base_data = (
    (df_sites["AmeriFlux BASE Data"] == "Yes")
    & (df_sites["Data Use Policy"] == "CC-BY-4.0")
)

for s in state_name:
    df_sites_state = df_sites.loc[(df_sites["State"] == s) & has_open_base_data, :]

    # first rule
    # the site should be in a forest area
    # isin() yields one boolean per row (so the mask always has the right length)
    # and treats a missing abbreviation as "not forest" instead of raising.
    is_forest = df_sites_state["Vegetation Abbreviation (IGBP)"].isin(forest_igbp)
    df_sites_state_forest = df_sites_state.loc[is_forest, :]
    if df_sites_state_forest.shape[0] == 0:
        continue

    # second rule
    # count, per candidate site, the years of AmeriFlux BASE data after `min_year`
    year_count = [
        _count_recent_years(v)
        for v in df_sites_state_forest["Years of AmeriFlux BASE Data"]
    ]
    # Enforce "has records after 2003": skip the state if no candidate has any.
    if max(year_count) == 0:
        continue

    # A stable sort makes ties deterministic: the last tied site in table order wins.
    best = np.argsort(year_count, kind="stable")[-1]
    df_sites_filtered.loc[s, :] = df_sites_state_forest.iloc[best, :].values


In [150]:
# Drop the states for which no site was selected (their "Site ID" is still NaN),
# then order the remaining rows by state name (the index).
df_US_forest_AmeriFlux = (
    df_sites_filtered
    .dropna(axis="index", subset=["Site ID"])
    .sort_index()
)

In [151]:
# Save the selected sites to CSV; the index (state name) is written as the first column.
df_US_forest_AmeriFlux.to_csv("/work/tadesse/beichen/ForDRI/Data/New_Bowen_Ratio/Filtered_AmeriFlux_Sites.csv")