<a href="https://colab.research.google.com/github/sciencebyAJ/watrs_ec_processing/blob/main/WATRS_COMBINE_EC_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSUMB EC PROCESSING STEP 1
1.   Combine raw .dat files from towers

Files exported from the EddyPro software are saved in the AmeriFlux standard output format as '.dat' files.  These files are capped at a certain size and need to be combined to generate a record commensurate with the entire period of observation.

The script below reads in all 'raw' data for a given tower, *combines the data* into a large dataframe, *assigns a time index*, *removes duplicate rows* due to 'overlap' between '.dat' file records, *converts data values to numeric* from objects, and *saves the data to a csv* for evaluating quality, applying PI quality control flags, and gap-filling.


In [None]:
# Clone the processing repository into the current working directory.
!git clone https://github.com/sciencebyAJ/watrs_ec_processing.git

In [1]:
# Mount Google Drive at /content/drive (Colab only); the data paths used
# later in this notebook are rooted under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Dependencies

In [2]:
import glob
import pandas as pd
import datetime
import os
import numpy as np
import json
import requests
from tables import NaturalNameWarning
import warnings
warnings.filterwarnings('ignore', category=NaturalNameWarning)
verbose = False

In [None]:
# Change into the repository folder; presumably this is what makes the `src`
# package importable on the next line.
# NOTE(review): the clone cell above checks out `watrs_ec_processing`, not
# `oet_gf_ti` — confirm this folder name matches where the repo actually lives.
cd oet_gf_ti/
import src.combine_ec_data as combo


### Set File Paths

In [20]:
# Preserve for future use
# meta_url = 'https://docs.google.com/spreadsheets/d/1cUHT0Rb0n39I0qk-bYY194spSWr7MNqkFX15PWnxXlI/edit?usp=sharing'
# read_url = meta_url.replace('/edit?usp=sharing','/export?format=csv&gid=0')
# meta_df = pd.read_csv(read_url)
# convert to table above long term, use these for now
# 2024-present fields

In [14]:
# Table-format switch: True for tables pulled through Loggernet, False for
# tables pulled manually.
LOGGERNET = False

# Input locations on the mounted shared drive.
lab_folder_path = '/content/drive/Shareddrives/WATRS_Field_Data/Field_Data/'
raw_data_path = f'{lab_folder_path}CSUMB_WineGrape_Zabala_2023/'  # will draw from metadata table

# Output location for the combined record.
out_path = f'{raw_data_path}Combined_EC_Data/'

# Field name is derived from the raw-data folder by the project helper.
field_name = combo.get_field_name(raw_data_path)

# Check the raw 'Data' folder and the output folder via the project helper.
combo.check_folder(f'{raw_data_path}Data', out_path)


/content/drive/Shareddrives/WATRS_Field_Data/Field_Data/CSUMB_WineGrape_Zabala_2023/Data folder exists...
	can not locate /content/drive/Shareddrives/WATRS_Field_Data/Field_Data/CSUMB_WineGrape_Zabala_2023/Combined_EC_Data/ ...
	making directory: /content/drive/Shareddrives/WATRS_Field_Data/Field_Data/CSUMB_WineGrape_Zabala_2023/Combined_EC_Data/ ... 
	directory /content/drive/Shareddrives/WATRS_Field_Data/Field_Data/CSUMB_WineGrape_Zabala_2023/Combined_EC_Data/ created


### Get list of filenames for '.dat' Ameriflux formatted files

In [11]:
# Combine every AmeriFlux-format '.dat' export for this tower into one
# de-duplicated, time-indexed, numeric table and save it as a dated CSV.
#
# Reads:  raw_data_path, out_path, field_name, LOGGERNET, verbose, combo
# Writes: <out_path><field_name>_<YYYYMMDD>.csv

fnames = glob.glob(raw_data_path+'Data/*/1*AmeriFluxForma*.dat')

if len(fnames)==0:
  print('Check to see if data exists at:\n'+raw_data_path)
  print('Re-define raw-data path variable above')
  # Fail with a clear message instead of an unrelated NameError further down.
  raise FileNotFoundError('No AmeriFlux-format .dat files found under '+raw_data_path+'Data/')
print(f'There are {str(len(fnames))} files in the folder')

# Oldest-modified file first: drop_duplicates keeps the first occurrence, so
# rows from earlier files win wherever records overlap.
fnames.sort(key=os.path.getmtime)


def _append_records(df_all, df_i, col_list):
  """Append one file's rows to the running table and drop repeated timestamps.

  df_i is first passed through combo.check_for_missing_columns so it can be
  sliced by col_list, then concatenated onto df_all. Rows repeating an earlier
  'TIMESTAMP_END' value are dropped. Returns the new combined frame.
  """
  df_i = combo.check_for_missing_columns(df_i,col_list)
  df_all = pd.concat([df_all,df_i[col_list]],axis=0,ignore_index=True)
  df_all.drop_duplicates(subset=['TIMESTAMP_END'],inplace=True)
  return df_all


# The first file defines the reference column list for all later files.
# NOTE(review): this read does not apply the LOGGERNET header offset — confirm
# whether the first file should also be read with header=1 in that mode.
df_i = pd.read_csv(fnames[0])
col_list = list(df_i.columns)
df_all = df_i.copy()

for fname in fnames[1:]:
  try:
    if LOGGERNET == True:
      # Loggernet-pulled tables: column names are taken from the second line.
      df_i = pd.read_csv(fname,header=1)
    else:
      df_i = pd.read_csv(fname)
    df_all = _append_records(df_all,df_i,col_list)
    if verbose == True:
      print('x '+fname)
  except Exception:
    # Second attempt: tolerate odd byte sequences and skip malformed lines.
    try:
      df_i = pd.read_csv(fname, encoding= 'unicode_escape',on_bad_lines='skip')
      df_all = _append_records(df_all,df_i,col_list)
      if verbose == True:
        print('skipped bad lines\nx '+fname)
    except Exception as err:
      # Best effort: report the file and carry on with the rest.
      print('... not read '+fname)
      if verbose == True:
        print(repr(err))

# Blank out timestamps below 2000, then drop every row without a timestamp.
# NOTE(review): presumably this removes placeholder/garbage timestamps — confirm.
df_all.loc[df_all['TIMESTAMP_END']<2000, 'TIMESTAMP_END'] = np.nan
df_all = df_all[df_all['TIMESTAMP_END'].notna()]
if df_all.shape[0]==0:
  print('check folder path for files\ncheck datetime of df_all')

df_time= combo.set_time_index(df_all)
# Remove rows that are identical across every column.
df_time.drop_duplicates(inplace=True)
df_num = combo.to_numeric(df_time)

# Stamp the output file with today's date as YYYYMMDD.
todays_date_yyyymmdd = datetime.date.today().strftime('%Y%m%d')
out_fname = out_path+field_name+'_'+todays_date_yyyymmdd+'.csv'
df_num.to_csv(out_fname)

print('file saved to: \t'+out_fname)
# Record length in years, assuming 48 rows per day (30-minute records).
print('\nThe processed data for '+ field_name + ' has '+ str(df_num.shape[0]/48 / 365.25)+' years of data')

There are 163 files in the folder
