# Loading LiPD files for subannual filter implementation

In [46]:
import lipd
import os
import numpy as np
import pandas as pd

In [47]:
# Directory containing the LiPD (.lpd) files to load. The location can be
# overridden with the LIPD_DATA_DIR environment variable so the notebook is
# not tied to one machine; the original hard-coded path remains the default.
folder = os.environ.get(
    "LIPD_DATA_DIR",
    "/Users/tanaya/lmr_presto/data/Pages2kTemperature2_1_2",
)
# Read every LiPD file found in `folder`; the result is keyed by dataset name
# (see the key listing printed further down).
lipds = lipd.readLipd(folder)

Disclaimer: LiPD files may be updated and modified to adhere to standards

Found: 647 LiPD file(s)
reading: Ant-WDC05A.Steig.2013.lpd
reading: NAm-MtLemon.Briffa.2002.lpd
reading: Arc-Arjeplog.Bjorklund.2014.lpd
reading: Asi-CHIN019.Li.2010.lpd
reading: NAm-Landslide.Luckman.2006.lpd
reading: NAm-SmithersSkiArea.Schweingruber.1996.lpd
reading: Asi-GANGCD.PAGES2k.2013.lpd
reading: Ocn-Mayotte.Zinke.2008.lpd
reading: Ocn-LosRoques.Hetzinger.2008.lpd
reading: Arc-Agassiz.Vinther.2008.lpd
reading: SAm-LagunaChepical.deJong.2013.lpd
reading: NAm-CoppermineRiver.Jacoby.1989.lpd
reading: NAm-KobukNoatak.King.2003.lpd
reading: Asi-BT005.Cook.2010.lpd
reading: SAm-CentralAndes9.Mundo.2014.lpd
reading: Asi-JAPA016.Yasue.2013.lpd
reading: Asi-CHIN049.Brauning.2013.lpd
reading: NAm-MeadowMountain.Wilson.2005.lpd
reading: NAm-wa069.Peterson.1994.lpd
reading: NAm-PowderRiverPass.Briffa.1996.lpd
reading: Asi-YKCOM.Yasue.2013.lpd
reading: Arc-MackenzieDelta.Porter.2013.lpd
reading: Ocn-BunakenIsland.C

### Understanding format

In [16]:
# Show the first five keys of the loaded collection to see how records are named.
print(list(lipds.keys())[:5])

['Ant-WDC05A.Steig.2013', 'NAm-MtLemon.Briffa.2002', 'Arc-Arjeplog.Bjorklund.2014', 'Asi-CHIN019.Li.2010', 'NAm-Landslide.Luckman.2006']


In [29]:
# Pick a single record to inspect while exploring the LiPD structure.
rec = lipds['Ocn-Mayotte.Zinke.2008']

In [None]:
def extract_lipd_info(rec):
    """Print a short metadata summary for the first column of a LiPD record.

    The columns are read from
    ``rec['paleoData']['paleo0']['measurementTable']['paleo0measurement0']['columns']``
    and only the first entry of that mapping is summarised: its TSid, name,
    temporal resolution, archive type, units, mean value and (min, max) range.
    Metadata fields that are absent are printed as placeholder text (or
    ``None`` for the mean/min/max fields).

    Any exception raised during the lookup is caught and reported with a
    "Failed to extract from record:" message. The function returns nothing.
    """
    try:
        table = rec['paleoData']['paleo0']['measurementTable']['paleo0measurement0']
        columns = table['columns']

        # Whichever column comes first in the mapping is the one described.
        first_name = list(columns.keys())[0]
        meta = columns[first_name]

        report = [
            f"TSid: {meta.get('TSid', 'No TSid')}",
            f"Variable: {first_name}",
            f"Resolution: {meta.get('paleoData_TemporalResolution', 'Unknown')}",
            f"Archive Type: {meta.get('archiveType', 'Unknown archive')}",
            f"Units: {meta.get('paleoDataUnits', 'Unknown units')}",
            f"Mean Value: {meta.get('hasMeanValue')}",
            f"Value Range: ({meta.get('hasMinValue')}, {meta.get('hasMaxValue')})",
        ]
        for line in report:
            print(line)

    except Exception as err:
        print("Failed to extract from record:", err)

In [31]:
# Print the metadata summary for the record selected above.
extract_lipd_info(rec)


TSid: Ocean2kHR_002
Variable: d18O
Resolution: Unknown
Archive Type: coral
Units: Unknown units
Mean Value: -4.885234070221066
Value Range: (5.49, -4.35)


## Function for subannual filtering (compute_annual_means), replicated from LMR

In [66]:
def extract_years_and_values(lipd_record):
    """Pull a time axis and one matching data column out of a LiPD record.

    Columns are read from
    ``lipd_record['paleoData']['paleo0']['measurementTable']['paleo0measurement0']['columns']``.
    The first column named ``year``, ``time`` or ``age`` (case-insensitive)
    that carries a ``'values'`` entry supplies the time axis. The first
    remaining column whose non-empty ``'values'`` has the same length as the
    time axis supplies the data.

    Returns
    -------
    tuple
        ``(column_name, time, values)`` with ``time`` and ``values`` as float
        numpy arrays, or ``(None, None, None)`` when no time column or no
        matching data column is found, or when any exception occurs (a
        message is printed in each of those cases).
    """
    time_names = ('year', 'time', 'age')
    nothing = (None, None, None)

    try:
        table = lipd_record['paleoData']['paleo0']['measurementTable']['paleo0measurement0']
        columns = table['columns']

        # First time-like column that actually has a 'values' entry.
        time_values = next(
            (
                col['values']
                for name, col in columns.items()
                if name.lower() in time_names and 'values' in col
            ),
            None,
        )
        if time_values is None:
            print("No time column found.")
            return nothing

        # First non-time column whose data lines up with the time axis.
        for name, col in columns.items():
            if name.lower() not in time_names:
                candidate = col.get('values')
                if candidate and len(candidate) == len(time_values):
                    time_array = np.array(time_values, dtype=float)
                    data_array = np.array(candidate, dtype=float)
                    return name, time_array, data_array

        print("No value column matched time length.")
        return nothing

    except Exception as err:
        print("Error parsing LiPD:", err)
        return nothing


In [None]:
def compute_annual_means_lipd(lipd_record, pid=None, valid_frac=0.5):
    """Reduce a LiPD record to annual values and reject sparsely covered ones.

    The time axis and data column are obtained with
    ``extract_years_and_values``. Samples where either time or value is NaN
    are dropped. The record is treated as subannual when the median absolute
    spacing between consecutive time points is below 0.95; in that case all
    samples falling in the same calendar year are averaged. Otherwise the
    values are kept as they are and only the time axis is converted to
    integer years.

    Coverage is the number of annual values divided by the span
    ``max(year) - min(year) + 1``. Records whose coverage is below
    ``valid_frac`` are rejected.

    Parameters
    ----------
    lipd_record : dict
        A single LiPD record. It is modified in place: the keys
        ``'annual_time'``, ``'annual_value'`` and ``'is_subannual'`` are added
        when the record is kept.
    pid : optional
        Identifier of the record. Accepted so existing callers keep working;
        it does not influence the computation.
    valid_frac : float, default 0.5
        Minimum fraction of years in the record's span that must have a value.

    Returns
    -------
    dict or None
        The same ``lipd_record`` object with the annual fields added, or
        ``None`` if the data could not be extracted, fewer than two valid
        samples remain, or coverage is below ``valid_frac``.
    """
    _, time, value = extract_years_and_values(lipd_record)
    if time is None or value is None:
        return None  # Could not extract

    # Keep only samples where both the time stamp and the value are defined.
    mask = ~np.isnan(time) & ~np.isnan(value)
    time = time[mask]
    value = value[mask]

    if len(time) < 2:
        return None

    # Use the absolute step so a time axis stored newest-first (negative
    # differences) is not mistaken for subannual sampling.
    dt = np.median(np.abs(np.diff(time)))
    subannual = dt < 0.95

    # Floor rather than truncate so fractional negative years (e.g. -0.5)
    # are binned into year -1 instead of being merged into year 0.
    years = np.floor(time).astype(int)

    if subannual:
        df = pd.DataFrame({'year': years, 'value': value})
        annual = df.groupby('year').mean().dropna()
        year_range = annual.index.max() - annual.index.min() + 1
        coverage = len(annual) / year_range if year_range > 0 else 0
        if coverage < valid_frac:
            return None
        lipd_record['annual_time'] = annual.index.tolist()
        lipd_record['annual_value'] = annual['value'].tolist()
        lipd_record['is_subannual'] = True
        return lipd_record

    year_range = years.max() - years.min() + 1
    coverage = len(years) / year_range if year_range > 0 else 0
    if coverage < valid_frac:
        return None
    lipd_record['annual_time'] = years.tolist()
    lipd_record['annual_value'] = value.tolist()
    lipd_record['is_subannual'] = False
    return lipd_record


In [None]:
# Run every record through the annual-mean filter and collect the survivors.
filtered_lipds = []

for key, rec in lipd.readLipd(folder).items():
    filtered = compute_annual_means_lipd(rec, pid=key)
    if filtered is None:
        # Record was rejected by the filter (or could not be parsed).
        continue
    filtered_lipds.append(filtered)

print(f"Kept {len(filtered_lipds)} filtered records.")

Disclaimer: LiPD files may be updated and modified to adhere to standards

Found: 647 LiPD file(s)
reading: Ant-WDC05A.Steig.2013.lpd
reading: NAm-MtLemon.Briffa.2002.lpd
reading: Arc-Arjeplog.Bjorklund.2014.lpd
reading: Asi-CHIN019.Li.2010.lpd
reading: NAm-Landslide.Luckman.2006.lpd
reading: NAm-SmithersSkiArea.Schweingruber.1996.lpd
reading: Asi-GANGCD.PAGES2k.2013.lpd
reading: Ocn-Mayotte.Zinke.2008.lpd
reading: Ocn-LosRoques.Hetzinger.2008.lpd
reading: Arc-Agassiz.Vinther.2008.lpd
reading: SAm-LagunaChepical.deJong.2013.lpd
reading: NAm-CoppermineRiver.Jacoby.1989.lpd
reading: NAm-KobukNoatak.King.2003.lpd
reading: Asi-BT005.Cook.2010.lpd
reading: SAm-CentralAndes9.Mundo.2014.lpd
reading: Asi-JAPA016.Yasue.2013.lpd
reading: Asi-CHIN049.Brauning.2013.lpd
reading: NAm-MeadowMountain.Wilson.2005.lpd
reading: NAm-wa069.Peterson.1994.lpd
reading: NAm-PowderRiverPass.Briffa.1996.lpd
reading: Asi-YKCOM.Yasue.2013.lpd
reading: Arc-MackenzieDelta.Porter.2013.lpd
reading: Ocn-BunakenIsland.C

In [69]:
# Destination directory for the filtered LiPD files (relative to the working directory).
output_folder = './data/filtered_lipd'
# Create it, including parents, if needed; no error if it already exists.
os.makedirs(output_folder, exist_ok=True)

In [73]:
# Convert the list of kept records into a dictionary keyed by each record's
# "dataSetName"; records without a dataset name are left out, and a later
# record with the same name replaces an earlier one.
lipds_dict = {
    rec["dataSetName"]: rec
    for rec in filtered_lipds
    if rec.get("dataSetName", None) is not None
}

In [None]:
# Write every kept record to `output_folder`.
# NOTE(review): the captured output of this cell shows
# "Error: lipd_write: File size too large, try using force_zip64" after one
# of the records, so at least one file may not have been written — confirm
# against the contents of the output folder.
lipd.writeLipd(lipds_dict, output_folder)

writing: Ant-WDC05A.Steig.2013
writing: NAm-MtLemon.Briffa.2002
writing: Arc-Arjeplog.Bjorklund.2014
writing: Asi-CHIN019.Li.2010
writing: NAm-Landslide.Luckman.2006
writing: NAm-SmithersSkiArea.Schweingruber.1996
writing: Asi-GANGCD.PAGES2k.2013
writing: Ocn-Mayotte.Zinke.2008
writing: Ocn-LosRoques.Hetzinger.2008
writing: Arc-Agassiz.Vinther.2008
writing: SAm-LagunaChepical.deJong.2013
Error: lipd_write: File size too large, try using force_zip64
writing: NAm-CoppermineRiver.Jacoby.1989
writing: NAm-KobukNoatak.King.2003
writing: Asi-BT005.Cook.2010
writing: SAm-CentralAndes9.Mundo.2014
writing: Asi-JAPA016.Yasue.2013
writing: Asi-CHIN049.Brauning.2013
writing: NAm-MeadowMountain.Wilson.2005
writing: NAm-wa069.Peterson.1994
writing: NAm-PowderRiverPass.Briffa.1996
writing: Asi-YKCOM.Yasue.2013
writing: Arc-MackenzieDelta.Porter.2013
writing: Ocn-BunakenIsland.Charles.2003
writing: Asi-NEPA025.Krusic.2013
writing: Asi-BARELC.PAGES2k.2013
writing: Asi-PAKI029.Cook.2013
writing: Asi-NEP