In [1]:
!pip install openpyxl



In [2]:
import pandas as pd
import io
import os

# --- CONFIGURATION ---
# The notebook runs locally inside SageMaker, so the AWS flag stays off.
# NOTE(review): no code visible in this notebook reads IS_AWS.
IS_AWS = False
input_file = "ILIDataV2.xlsx"
years = ["2007", "2015", "2022"]

# Per-survey translation table: raw spreadsheet header -> standardized column
# name. Header spelling differs from sheet to sheet (case, wording, embedded
# newlines), so every survey year carries its own dictionary.
mappings = {
    "2007": {
        "log dist. [ft]": "distance",
        "o'clock": "clock",
        "event": "feature_type",
        "depth [%]": "depth",
        "length [in]": "length",
        "width [in]": "width",
    },
    "2015": {
        "Log Dist. [ft]": "distance",
        "O'clock": "clock",
        "Event Description": "feature_type",
        "Depth [%]": "depth",
        "Length [in]": "length",
        "Width [in]": "width",
    },
    "2022": {
        "ILI Wheel Count \n[ft.]": "distance",
        "O'clock\n[hh:mm]": "clock",
        "Event Description": "feature_type",
        "Metal Loss Depth \n[%]": "depth",
        "Length [in]": "length",
        "Width [in]": "width",
    },
}

def run_standardization():
    """Split the ILI workbook into one standardized CSV per survey year.

    For every entry in ``years`` the sheet of that name is read from
    ``input_file``, its headers are renamed through ``mappings[year]``, a
    ``survey_year`` column is added, and whichever standardized columns are
    present are written to ``standardized/ILI_<year>_cleaned.csv``.

    Returns:
        str: ``"Standardization complete."`` on success; otherwise the text of
        the exception that stopped processing (the same text is printed).
        Files written for earlier years before a failure are left in place.
    """
    # Columns needed for alignment, in output order.
    target_cols = ["distance", "clock", "feature_type", "depth", "length", "width", "survey_year"]
    output_folder = "standardized"

    try:
        # 1. LOAD DATA Local mode
        print(f"üêß Opening {input_file}...")

        # Open the workbook once and pull each sheet from the same handle,
        # rather than re-parsing the whole file for every survey year.
        with pd.ExcelFile(input_file) as workbook:
            for year in years:
                # Read sheet
                df = pd.read_excel(workbook, sheet_name=year)

                # Apply Gia's mappings
                df = df.rename(columns=mappings[year])
                df['survey_year'] = int(year)

                # Keep only the columns needed for alignment. A header that did
                # not match its mapping key would otherwise vanish silently and
                # only surface later as a KeyError, so report it here.
                existing_cols = [c for c in target_cols if c in df.columns]
                missing_cols = [c for c in target_cols if c not in df.columns]
                if missing_cols:
                    print(f"Warning: sheet {year} has no column(s) for {missing_cols}; check mappings[{year!r}].")
                df_final = df[existing_cols]

                # 2. SAVE DATA locally
                output_filename = f"ILI_{year}_cleaned.csv"

                # exist_ok makes this a no-op once the folder is there.
                os.makedirs(output_folder, exist_ok=True)

                df_final.to_csv(os.path.join(output_folder, output_filename), index=False)
                print(f"‚úÖ Processed {year} successfully. Saved to {output_folder}/{output_filename}")

        return "Standardization complete."

    except Exception as e:
        # Deliberately broad: the function reports failure through its return
        # value instead of raising, so the notebook cell always completes.
        print(f"‚ùå Error: {str(e)}")
        return str(e)

# Run the process; the returned status string is shown as the cell's output.
run_standardization()

üêß Opening ILIDataV2.xlsx...
‚úÖ Processed 2007 successfully. Saved to standardized/ILI_2007_cleaned.csv
‚úÖ Processed 2015 successfully. Saved to standardized/ILI_2015_cleaned.csv
‚úÖ Processed 2022 successfully. Saved to standardized/ILI_2022_cleaned.csv


'Standardization complete.'

In [5]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

# Read back the per-survey CSVs written by the standardization step,
# oldest survey first.
cleaned_csv_paths = (
    'standardized/ILI_2007_cleaned.csv',
    'standardized/ILI_2015_cleaned.csv',
    'standardized/ILI_2022_cleaned.csv',
)
df_07, df_15, df_22 = (pd.read_csv(csv_path) for csv_path in cleaned_csv_paths)

def align_with_master_anchor(df_source, df_ref):
    """Map a survey's distances onto a reference survey's distance axis.

    The first row whose ``feature_type`` contains ``'Valve'`` is the common
    starting anchor in each frame. Every row whose ``feature_type`` contains
    ``'Girth'`` and whose distance lies beyond that valve is a further anchor.
    Anchors are paired by position (valve with valve, first weld with first
    weld, ...) up to the shorter of the two lists, and a piecewise-linear map
    through those pairs (extrapolated outside them) is applied to every
    ``distance`` in ``df_source``.

    NOTE(review): positional pairing assumes both surveys report the same
    welds in the same order; a weld missing from one survey would shift all
    later pairs. TODO confirm against the data.

    Args:
        df_source: DataFrame with ``feature_type`` and ``distance`` columns;
            the survey to be aligned. It is not modified.
        df_ref: DataFrame with the same two columns; the reference survey.

    Returns:
        A copy of ``df_source`` with an added ``aligned_dist`` column.

    Raises:
        ValueError: if either frame has no row whose ``feature_type``
            contains ``'Valve'``.
    """
    def _anchor_distances(df, label):
        # Valve distance followed by the distances of all girth welds past it.
        feature = df['feature_type']
        valves = df.loc[feature.str.contains('Valve', na=False), 'distance']
        if valves.empty:
            raise ValueError(
                f"No 'Valve' feature found in the {label} survey; cannot anchor the alignment."
            )
        valve_dist = valves.iloc[0]
        past_valve_welds = df.loc[
            feature.str.contains('Girth', na=False) & (df['distance'] > valve_dist),
            'distance',
        ]
        return [valve_dist] + past_valve_welds.tolist()

    # Staple points: where each anchor sits in the source vs. the reference.
    old_x = _anchor_distances(df_source, "source")
    new_x = _anchor_distances(df_ref, "reference")

    # Match indices
    n = min(len(old_x), len(new_x))

    # Work on a copy so re-running the cell (or reusing the input frame)
    # never sees a half-updated DataFrame.
    aligned = df_source.copy()

    if n < 2:
        # Only the valve is shared: interp1d needs at least two points, so
        # fall back to a constant shift that puts valve on valve.
        aligned['aligned_dist'] = aligned['distance'] + (new_x[0] - old_x[0])
    else:
        transform = interp1d(old_x[:n], new_x[:n], fill_value="extrapolate")
        aligned['aligned_dist'] = transform(aligned['distance'])

    return aligned

# Align both older surveys onto the 2022 distance axis (2007 first, then 2015).
print("üêß Re-aligning with Valve-First Strategy...")
df_07_final, df_15_final = (
    align_with_master_anchor(older_survey, df_22) for older_survey in (df_07, df_15)
)

# --- NEW SANITY CHECK ---
# Compare the first Valve's position in 2022 with its aligned 2007 position.
# NOTE(review): that valve is itself the first staple point given to the
# interpolator, so it should land on the 2022 valve by construction; a zero
# error confirms the anchor was applied, not that other features line up.
is_valve_22 = df_22['feature_type'].str.contains('Valve', na=False)
is_valve_07 = df_07_final['feature_type'].str.contains('Valve', na=False)
v_22 = df_22.loc[is_valve_22, 'distance'].iloc[0]
v_07_aligned = df_07_final.loc[is_valve_07, 'aligned_dist'].iloc[0]
valve_error = abs(v_22 - v_07_aligned)

print("\n--- Improved Valve Sanity Check ---")
print(f"2022 Distance: {v_22:.4f} ft")
print(f"2007 Aligned:  {v_07_aligned:.4f} ft")
print(f"New Error:     {valve_error:.4f} ft (Target: < 0.1)")

# Save the final mapped files
for aligned_df, out_path in (
    (df_07_final, 'standardized/ILI_2007_aligned.csv'),
    (df_15_final, 'standardized/ILI_2015_aligned.csv'),
):
    aligned_df.to_csv(out_path, index=False)

üêß Re-aligning with Valve-First Strategy...

--- Improved Valve Sanity Check ---
2022 Distance: -0.0030 ft
2007 Aligned:  -0.0030 ft
New Error:     0.0000 ft (Target: < 0.1)


In [6]:
# Valve check for the aligned 2015 survey: distance of the first Valve in the
# 2022 reference vs. the aligned distance of the first Valve in 2015.
is_valve_22 = df_22['feature_type'].str.contains('Valve', na=False)
is_valve_15 = df_15_final['feature_type'].str.contains('Valve', na=False)
v_22 = df_22.loc[is_valve_22, 'distance'].iloc[0]
v_15_aligned = df_15_final.loc[is_valve_15, 'aligned_dist'].iloc[0]

print("--- 2015 Sanity Check ---")
print(f"Accuracy: {abs(v_22 - v_15_aligned):.4f} ft error")

--- 2015 Sanity Check ---
Accuracy: 0.0000 ft error
