In [1]:
import pandas as pd
import numpy as np

In [3]:
# --- Inclusive (min, max) bounds accepted for each field ---
# Deliberately left empty: per the original note, some codes were extended, so
# no range is enforced yet. A field without an entry here is never range-checked.
VALID_RANGES = dict(
    # yr=(51, 99),
)

# --- Column layout of one record ---
# Each value is a zero-based (start, end) pair used as a slice: start is
# inclusive, end is exclusive. The fields are contiguous and cover columns 0-80.
FIELD_POSITIONS = dict(
    yr=(0, 2),
    mo=(2, 4),
    dy=(4, 6),
    hr=(6, 8),
    IB=(8, 9),
    Lat=(9, 14),
    Lon=(14, 19),
    ID=(19, 24),
    LO=(24, 25),
    ww=(25, 27),
    N=(27, 28),
    Nh=(28, 30),
    h=(30, 32),
    CL=(32, 34),
    CM=(34, 36),
    CH=(36, 38),
    AM=(38, 41),
    AH=(41, 44),
    UM=(44, 45),
    UH=(45, 46),
    IC=(46, 48),
    SA=(48, 52),
    RI=(52, 56),
    SLP=(56, 61),
    WS=(61, 64),
    WD=(64, 67),
    AT=(67, 71),
    DD=(71, 74),
    EL_SST=(74, 78),
    IW=(78, 79),
    IP_IH=(79, 80),
)

# --- Per-field sentinel strings that mean "no value" ---
# Compared against the whitespace-stripped text of the field; a match is
# recorded as missing (NaN) rather than treated as an invalid record.
MISSING_VALUE_FLAGS = dict(
    ID=['9'],
    ww=['-1'],
    Nh=['-1'],
    h=['-1'],
    CL=['-1'],
    CM=['-1'],
    CH=['-1'],
    AM=['900'],
    AH=['900'],
    UM=['9'],
    UH=['9'],
    SLP=['-1'],
    WS=['-1'],
    WD=['-1'],
    AT=['900'],
    DD=['900'],
    EL_SST=['9000'],
    IW=['9'],
    IP_IH=['9'],
)

In [4]:
def clean_and_check(val, key):
    """Convert one raw fixed-width field to an integer and report its validity.

    Args:
        val (str): Slice of the record that belongs to field ``key``.
        key (str): Field name, used to look up ``MISSING_VALUE_FLAGS`` and
            ``VALID_RANGES``.

    Returns:
        tuple: ``(value, is_valid)``, one of

        * ``(np.nan, True)``  - the stripped text is a missing-value flag
          for ``key``;
        * ``(np.nan, False)`` - the text cannot be parsed as an integer;
        * ``(int, bool)``     - the parsed integer, with ``bool`` being the
          inclusive range check when ``key`` has an entry in
          ``VALID_RANGES`` and ``True`` otherwise.
    """
    # Fields are blank-padded, so compare and parse the trimmed text.
    raw = val.strip().lower()

    # A sentinel is a legitimate "no observation", not a malformed record.
    if raw in MISSING_VALUE_FLAGS.get(key, ()):
        return (np.nan, True)

    try:
        intval = int(raw)
    except ValueError:
        # Only a failed text-to-int conversion marks the field as bad.
        # Anything else (e.g. KeyboardInterrupt) must propagate instead of
        # being silently turned into an "invalid field" result.
        return (np.nan, False)

    # Range check applies only to fields that declare one.
    if key in VALID_RANGES:
        min_val, max_val = VALID_RANGES[key]
        return (intval, min_val <= intval <= max_val)

    return (intval, True)

In [5]:
def parse_sequence(seq):
    """Parse one fixed-width record into a dictionary of cleaned values.

    Args:
        seq (str): One line of data. After surrounding whitespace is
            stripped it must be exactly 80 characters long.

    Returns:
        dict or None: Field name -> cleaned value (int or NaN) for every
        entry of ``FIELD_POSITIONS``, except that the combined ``EL_SST``
        and ``IP_IH`` columns are replaced by ``EL``/``SST`` and
        ``IP``/``IH``. Which of each pair receives the value depends on
        ``LO`` (1 = land, 2 = ocean); the other is NaN.
        ``None`` is returned, after printing a diagnostic, when the line has
        the wrong length, any field fails ``clean_and_check``, or ``LO`` is
        neither 1 nor 2.
    """
    # NOTE(review): strip() also removes *leading* blanks, so a record whose
    # first column is blank-padded would fail the length check below.
    # Confirm the input files never start a record with a space.
    seq = seq.strip()

    if len(seq) != 80:
        # The check rejects both short and over-long lines, so report the
        # actual length rather than claiming the line is too short.
        print(f"Expected 80 characters, got {len(seq)}; skipping: {seq}")
        return None

    parsed = {}

    # Slice and clean each field in one pass; the first bad field rejects
    # the whole record.
    for key, (start, end) in FIELD_POSITIONS.items():
        val, valid = clean_and_check(seq[start:end], key)
        if not valid:
            print(f"Invalid field {key} in sequence: {seq}")
            return None
        parsed[key] = val

    # The last columns are shared: their meaning depends on the land/ocean
    # indicator, so split them into separate keys.
    if parsed['LO'] == 1:  # land
        parsed['EL'] = parsed['EL_SST']
        parsed['SST'] = np.nan
        parsed['IP'] = parsed['IP_IH']
        parsed['IH'] = np.nan

    elif parsed['LO'] == 2:  # ocean
        parsed['EL'] = np.nan
        parsed['SST'] = parsed['EL_SST']
        parsed['IP'] = np.nan
        parsed['IH'] = parsed['IP_IH']

    else:
        print(f"Invalid LO in sequence: {seq}")
        return None

    # Drop both combined raw fields now that they have been split.
    del parsed['EL_SST']
    del parsed['IP_IH']

    return parsed


In [6]:
def parse_file_to_df(path):
    """Parse every record of a fixed-width data file into a DataFrame.

    Args:
        path (str or path-like): File to read. Each line is passed to
            ``parse_sequence``.

    Returns:
        pandas.DataFrame: One row per line that ``parse_sequence`` accepted,
        in file order. Lines for which it returns ``None`` are dropped.
    """
    with open(path, 'r') as f:
        # Iterate the handle lazily instead of f.readlines(): only the parsed
        # rows are kept in memory, not a second copy of the raw file.
        parsed_data = [
            row for row in map(parse_sequence, f) if row is not None
        ]

    # Build the frame once from the list of dicts.
    return pd.DataFrame(parsed_data)


In [None]:
# Parse the JAN71L file from the land_197101_197704 directory and save the
# resulting table as CSV (without the index column).
df_JAN71L = parse_file_to_df(path="data/EECRA/land_197101_197704/JAN71L")
df_JAN71L.to_csv(path_or_buf="data/EECRA/land/df_JAN71L.csv", index=False)

In [32]:
# TODO (original note: "different for before x") - presumably the handling
# must differ for files before some cut-off; confirm what "x" is and fill in.
# Parse the DEC09L file from the land_199701_200912 directory and save it as
# CSV (without the index column).
df_DEC09L = parse_file_to_df(path="data/EECRA/land_199701_200912/DEC09L")
df_DEC09L.to_csv(path_or_buf="data/EECRA/land/df_DEC09L.csv", index=False)

In [9]:
# File name inside the ship_195112_199712 directory; swap in another one
# (e.g. "APR52O") to convert a different file.
name = "AUG52O"
df_MonO = parse_file_to_df(f"data/EECRA/ship_195112_199712/{name}")
df_MonO.to_csv(f"data/EECRA/ocean/df_{name}.csv", index=False)