In [1]:
import pandas as pd
import numpy as np
import os

In [None]:
# --- Column layout of one record: field name -> (start, end) ---
# start is inclusive and end is exclusive, so seq[start:end] extracts the field.
FIELD_POSITIONS_52 = dict(
    yr=(0, 2),
    mo=(2, 4),
    dy=(4, 6),
    hr=(6, 8),
    IB=(8, 9),
    Lat=(9, 14),
    Lon=(14, 19),
    ID=(19, 24),
    LO=(24, 25),
    ww=(25, 27),
    N=(27, 28),
    Nh=(28, 30),
    h=(30, 32),
    CL=(32, 34),
    CM=(34, 36),
    CH=(36, 38),
    AM=(38, 41),
    AH=(41, 44),
    UM=(44, 45),
    UH=(45, 46),
    IC=(46, 48),
    SA=(48, 52),
    RI=(52, 56),
    SLP=(56, 61),
    WS=(61, 64),
    WD=(64, 67),
    AT=(67, 71),
    DD=(71, 74),
    EL_SST=(74, 78),  # only sst used since only parsing ocean
    IW=(78, 79),
    IP_IH=(79, 80),
)

# Field name -> list of stripped strings that mean "no value reported".
# Fields that are not listed here have no missing-value flag.
MISSING_VALUE_FLAGS = dict(
    ID=['9'],
    ww=['-1'],
    Nh=['-1'],
    h=['-1'],
    CL=['-1'],
    CM=['-1'],
    CH=['-1'],
    AM=['900'],
    AH=['900'],
    UM=['9'],
    UH=['9'],
    SLP=['-1'],
    WS=['-1'],
    WD=['-1'],
    AT=['900'],
    DD=['900'],
    EL_SST=['9000'],
    IW=['9'],
    IP_IH=['9'],
)

In [None]:
def clean_and_check(val, key):
    """Strip one fixed-width field, convert it to an int and validate it.

    Args:
        val (str): section of sequence corresponding to var
        key (str): var name, used to look up MISSING_VALUE_FLAGS and
            VALID_RANGES

    Returns:
        int or NaN: the parsed value; NaN when the field holds a
            missing-value flag or cannot be parsed
        boolean: True if valid value (a missing-value flag counts as valid);
            False when the text is not an integer or is out of range
    """
    raw = val.strip().lower()

    # A missing-value flag is an expected placeholder, not a parse error.
    if raw in MISSING_VALUE_FLAGS.get(key, ()):
        return (np.nan, True)

    try:
        intval = int(raw)
    except ValueError:
        # Only a failed conversion is "cannot parse"; a bare except here would
        # also swallow KeyboardInterrupt and hide unrelated bugs.
        return (np.nan, False)

    # NOTE(review): VALID_RANGES is not defined in the cells visible around
    # this function; it must exist in the kernel before the first call.
    if key in VALID_RANGES:
        min_val, max_val = VALID_RANGES[key]
        return (intval, min_val <= intval <= max_val)

    return (intval, True)  # If no range given, accept

In [None]:
def parse_sequence(seq, fieldPositions):
    """generates clean dictionary of var names and the value from one sequence

    Args:
        seq (string): one 80-char line of data
        fieldPositions (dictionary): the Fields and their positions, there are
        two different dictionaries for 1952-97 and 1997-2008

    Returns:
        dictionary: var names and cleaned values, with the combined EL_SST and
        IP_IH fields replaced by EL/SST and IP/IH. None when the line is not
        80 characters long, a field is invalid, or LO is not 2 (ocean).
    """
    # NOTE(review): strip() removes leading/trailing blanks as well as the
    # newline, so a record that starts or ends with a space fails the length
    # check below - confirm that such records cannot occur.
    seq = seq.strip()

    if len(seq) != 80:
        # The check rejects lines that are too long as well as too short.
        print(f"Expected 80 chars but got {len(seq)}, skipping: {seq}")
        return None

    parsed = {}

    # Slice, clean and validate each field; give up on the first invalid one.
    for key, (start, end) in fieldPositions.items():
        value, valid = clean_and_check(seq[start:end], key)
        if not valid:
            print(f"Invalid field {key} in sequence: {seq}")
            return None
        parsed[key] = value

    if parsed['LO'] != 2:  # should only be parsing ocean
        print(f"Invalid LO in sequence: {seq}")
        return None

    # Create separate keys for el_sst/ip_ih: the combined fields are read as
    # SST and IH, while EL and IP are left empty. Popping first keeps the new
    # keys at the end of the dict.
    sst = parsed.pop('EL_SST')
    ih = parsed.pop('IP_IH')
    parsed['EL'] = np.nan
    parsed['SST'] = sst
    parsed['IP'] = np.nan
    parsed['IH'] = ih

    return parsed


In [None]:
def parse_file_to_df(path, fieldPositions=None):
    """Parse every record of a raw data file into a DataFrame.

    Args:
        path (string): path to file
        fieldPositions (dictionary, optional): field layout handed to
            parse_sequence for each line. Defaults to FIELD_POSITIONS_52.

    Returns:
        df: one row per line that parse_sequence accepted; lines for which it
        returned None are dropped
    """
    # parse_sequence requires the layout; resolve the default at call time so
    # the function can be defined before the constants cell has been run.
    if fieldPositions is None:
        fieldPositions = FIELD_POSITIONS_52

    parsed_data = []

    # Iterate over the file directly instead of loading every line up front.
    with open(path, 'r') as f:
        for line in f:
            result = parse_sequence(line, fieldPositions)
            if result is not None:
                parsed_data.append(result)

    return pd.DataFrame(parsed_data)


In [7]:
# Three-letter month codes grouped by season, in the order the seasons are
# processed.
seasons = [
    ["DEC", "JAN", "FEB"],  # remember this bleeds into another yr
    ["MAR", "APR", "MAY"],
    ["JUN", "JUL", "AUG"],
    ["SEP", "OCT", "NOV"],
]

# Individual names for each season; they refer to the same lists as above.
djf, mam, jja, son = seasons

In [11]:
# Process data/EECRA/ship_195112_199712: merge the three monthly ocean files of
# each season into one CSV per season and year.

# Disabled per-file export (one CSV per raw file):
#for fn in os.listdir("data/EECRA/ship_195112_199712"):
#    df_MonO = parse_file_to_df("data/EECRA/ship_195112_199712/"+fn)
#    df_MonO.to_csv("data/EECRA/ocean/df_"+fn+".csv", index=False)

# Disabled example of concatenating three monthly CSVs:
#df2 = pd.concat(map(pd.read_csv, [path + 'df_JUN52O.csv', path + 'df_JUL52O.csv', path + 'df_AUG52O.csv']))

RAW_DIR = "data/EECRA/ship_195112_199712"
SEASONAL_DIR = "data/EECRA/ocean_seasonal"

# to_csv does not create missing directories; without this every save below
# would fail (and only be printed) when the output directory is absent.
os.makedirs(SEASONAL_DIR, exist_ok=True)

files = os.listdir(RAW_DIR)

for yr in range(52, 97 + 1):
    for season in seasons:

        # A season that starts in December takes that month from the previous
        # year: djf99 is dec 98, jan 99, and feb 99.
        first_yr = yr - 1 if season[0] == "DEC" else yr
        fn1 = season[0] + str(first_yr) + "O"
        fn2 = season[1] + str(yr) + "O"
        fn3 = season[2] + str(yr) + "O"

        # Only export complete seasons.
        if fn1 not in files or fn2 not in files or fn3 not in files:
            print ("no", str(season), "in yr", str(yr))
            continue

        try:
            df1 = parse_file_to_df(os.path.join(RAW_DIR, fn1))
            df2 = parse_file_to_df(os.path.join(RAW_DIR, fn2))
            df3 = parse_file_to_df(os.path.join(RAW_DIR, fn3))
        except Exception as e:
            print(fn2, "season files corrupted?")
            print(e)
            continue

        try:
            # File name is df_<season initials><yr>.csv, e.g. df_DJF52.csv
            season_label = "".join(month[0] for month in season)
            seasondf = pd.concat([df1, df2, df3])
            seasondf.to_csv(os.path.join(SEASONAL_DIR, "df_" + season_label + str(yr) + ".csv"), index=False)
        except Exception as e:
            print("failed to save", str(season), "in yr", str(yr))
            print(e)

In [14]:
# Lat/Lon were written to the seasonal CSVs without being divided by 100;
# rescale them in place.
#
# The files are overwritten, so dividing unconditionally would shrink the
# coordinates again every time this cell is re-run. Only rescale a file whose
# values still fall outside the range of plain degrees.
# NOTE(review): assumes no seasonal file lies entirely within |Lat| <= 90 and
# |Lon| <= 360 in the raw (x100) units - confirm.
path = "data/EECRA/ocean_seasonal/"

for file in os.listdir(path):
    print(file)
    try:
        df = pd.read_csv(path+file)
        needs_scaling = df['Lat'].abs().max() > 90 or df['Lon'].abs().max() > 360
        if not needs_scaling:
            print("already scaled, skipping:"+file)
            continue
        df['Lat'] = df['Lat'] / 100
        df['Lon'] = df['Lon'] / 100
        df.to_csv(path+file, index=False)
    except Exception as e:
        print(e)
        print("failed:"+file)

df_JJA57.csv
df_JJA80.csv
df_JJA94.csv
df_DJF66.csv
df_DJF72.csv
df_DJF73.csv
df_DJF67.csv
df_JJA95.csv
df_JJA81.csv
df_JJA56.csv
df_JJA68.csv
df_JJA54.csv
df_JJA97.csv
df_JJA83.csv
df_DJF59.csv
df_DJF71.csv
df_DJF65.csv
df_DJF64.csv
df_DJF70.csv
df_DJF58.csv
df_JJA82.csv
df_JJA96.csv
df_JJA55.csv
df_JJA69.csv
df_JJA79.csv
df_JJA92.csv
df_JJA86.csv
df_DJF74.csv
df_DJF60.csv
df_DJF61.csv
df_DJF75.csv
df_JJA87.csv
df_JJA93.csv
df_JJA78.csv
df_JJA52.csv
df_JJA85.csv
df_JJA91.csv
df_DJF88.csv
df_DJF63.csv
df_DJF77.csv
df_DJF76.csv
df_DJF62.csv
df_DJF89.csv
df_JJA90.csv
df_JJA84.csv
df_JJA53.csv
df_SON71.csv
df_MAM64.csv
df_MAM70.csv
df_SON65.csv
df_MAM58.csv
df_SON59.csv
df_SON58.csv
df_MAM59.csv
df_SON64.csv
df_MAM71.csv
df_MAM65.csv
df_SON70.csv
df_MAM73.csv
df_SON66.csv
df_SON72.csv
df_MAM67.csv
df_MAM66.csv
df_SON73.csv
df_SON67.csv
df_MAM72.csv
df_MAM76.csv
df_SON63.csv
df_SON77.csv
df_MAM62.csv
df_MAM89.csv
df_SON88.csv
df_SON89.csv
df_MAM88.csv
df_MAM63.csv
df_SON76.csv
df_SON62.csv