In [2]:
# Imports and logger setup
import awswrangler as wr
import pandas as pd
import numpy as np
import logging
import sweetviz as sv
from IPython.display import display

# Logging setup: timestamped INFO-level records on the root logger.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
# Root logger, shared by every helper in this cell.
logger = logging.getLogger()


# Read data from S3
def read_csv_from_s3(s3_path: str) -> pd.DataFrame:
    """Load a CSV from S3 into a DataFrame.

    Logs the path being read, then delegates to ``wr.s3.read_csv``.

    Args:
        s3_path: Path handed unchanged to ``wr.s3.read_csv``.

    Returns:
        Whatever ``wr.s3.read_csv`` returns for ``s3_path``.
    """
    logger.info(f"Reading data from S3 path: {s3_path}")
    frame = wr.s3.read_csv(s3_path)
    return frame


# Handle missing values
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values imputed column by column.

    Columns selected by ``select_dtypes(include=[np.number])`` are filled
    with their median; all remaining columns are filled with their first
    mode (most frequent value). A column that has no median or mode (for
    example, one containing only missing values) cannot be imputed, so it is
    left as is and reported in a warning.

    The input frame is not modified; callers must use the returned frame.

    Args:
        df: Frame whose missing values should be imputed.

    Returns:
        A new DataFrame with the same columns as ``df`` and missing values
        replaced wherever a fill value could be computed.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    result = df.copy()
    numeric_cols = result.select_dtypes(include=[np.number]).columns
    categorical_cols = result.select_dtypes(exclude=[np.number]).columns

    for col in numeric_cols:
        median = result[col].median()
        # The median is NaN/NA when the column has no observed values;
        # filling with it would be a no-op, so skip the column.
        if pd.isna(median):
            continue
        result[col] = result[col].fillna(median)

    for col in categorical_cols:
        # Compute the mode once and reuse it for both the check and the fill.
        modes = result[col].mode()
        if modes.empty:
            continue
        result[col] = result[col].fillna(modes.iloc[0])

    logger.info("Missing values handled: numeric columns filled with median and categorical columns filled with mode.")
    remaining = result.isnull().sum()
    total_missing = remaining.sum()
    logger.info(f"Total missing values after imputation: {total_missing}")
    if total_missing:
        # Make partial imputation visible instead of silently passing it on.
        still_missing = remaining[remaining > 0].index.tolist()
        logger.warning(f"Columns still containing missing values (no median/mode available): {still_missing}")
    return result


# Summary stats
def display_summary_stats(df: pd.DataFrame):
    """Log the frame's dtypes and display its transposed ``describe()`` table.

    The displayed table has one row per described column and the statistics
    ``count, mean, std, min, 25%, 50%, 75%, max`` as columns, in that order.

    Args:
        df: Frame to summarise.
    """
    stat_columns = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

    logger.info("Data types after imputation:")
    logger.info(df.dtypes)

    logger.info("Summary statistics for numeric columns:")
    described = df.describe()
    # Transpose so each described column becomes a row, then fix the
    # statistic order explicitly.
    summary = described.T[stat_columns]
    display(summary)


# Analyze column group
def analyze_column_group(df: pd.DataFrame, columns, group_name, max_missing_ratio=0.5):
    """Log and display basic diagnostics for one named group of columns.

    For the requested columns that exist in ``df`` this logs the per-column
    missing-value counts, displays ``describe()`` output, and logs the
    dtypes. Requested columns that are not in ``df`` are reported with a
    warning and skipped instead of raising ``KeyError``.

    Args:
        df: Frame holding the data.
        columns: Column names belonging to the group.
        group_name: Label used in the log header.
        max_missing_ratio: A column is considered valid when its number of
            missing values is strictly less than
            ``len(df) * max_missing_ratio``. Defaults to ``0.5``.

    Returns:
        list: Names of the valid columns, in the order given by ``columns``;
        empty when none of the requested columns exist in ``df``.
    """
    logger.info(f"--- {group_name} Analysis ---")

    absent = [col for col in columns if col not in df.columns]
    if absent:
        logger.warning(f"Columns not found in DataFrame, skipped: {absent}")
    present = [col for col in columns if col in df.columns]
    if not present:
        # describe() raises on a frame without columns, so stop early.
        logger.warning(f"No columns available for {group_name}; nothing to analyze.")
        return []

    # Slice once and reuse; each df[...] selection builds a new frame.
    subset = df[present]

    missing_values = subset.isnull().sum()
    logger.info("Missing values per column:")
    logger.info(missing_values)

    logger.info("Basic statistics:")
    display(subset.describe())

    logger.info("Data types:")
    logger.info(subset.dtypes)

    valid_cols = missing_values[missing_values < len(df) * max_missing_ratio].index.tolist()
    return valid_cols


# Correlation analysis
def correlation_analysis(df, energy_cols, methane_cols, temp_cols, pressure_cols):
    """Display the correlation matrix of one representative column per group.

    The first entry of each non-empty group (energy, methane, temperature,
    pressure, in that order) is taken as its representative. The matrix is
    displayed only when at least two representatives exist; otherwise a
    warning is logged.

    Args:
        df: Frame holding the data.
        energy_cols: Candidate energy column names (may be empty).
        methane_cols: Candidate methane column names (may be empty).
        temp_cols: Candidate temperature column names (may be empty).
        pressure_cols: Candidate pressure column names (may be empty).
    """
    groups = (energy_cols, methane_cols, temp_cols, pressure_cols)
    picks = [group[0] for group in groups if group]

    logger.info("--- Correlation Analysis ---")
    if len(picks) <= 1:
        logger.warning("Not enough valid columns for correlation analysis.")
        return

    logger.info("Correlation between representative columns:")
    display(df[picks].corr())


# Run all steps: read -> impute -> export -> summarise -> per-group analysis -> correlation.
# Variables are intentionally left at module level so they remain available
# after this block has run (e.g. to later notebook cells).
if __name__ == "__main__":
    s3_path = 's3://sagemaker-us-east-2-426179662034/canvas/processed/facility_merge_processed.csv'
    s3_output_path = 's3://sagemaker-us-east-2-426179662034/canvas/processed/facilities_cleaned.csv'

    # Step 1: Read and clean
    df = read_csv_from_s3(s3_path)
    df = handle_missing_values(df)

    # Step 2: Write cleaned data back to S3
    wr.s3.to_csv(df=df, path=s3_output_path, index=False)
    logger.info(f"Cleaned data written to {s3_output_path}")

    # Step 3: Display summary stats
    display_summary_stats(df)

    # Step 4: Define the column groups to analyze
    energy_cols = ['inr_fc_yestenergy_real', 'inr_fc_todayenergy_real', 'energy_output_btu']
    methane_cols = ['methane_percent', 'methane_smooth']
    temp_cols = [
        'bge_h2soutlet_temp', 'vl_comp_oilinjection_temp', 'vl_comp_suction_temp',
        'vl_comp_oilcooleroutlet_temp', 'vl_comp_aftercooloutlet_temp',
        'vl_comp_discharge_temp', 'abb_gc_outletstream_temp', 'vl_comp_oilseperator_temp',
        'bge_reheateroutlet_temp', 'bge_blowersuction_temp'
    ]
    pressure_cols = [
        'abb_gc_outletstream_pressure', 'vl_comp_oilfilterdiff_pressure', 'bge_airpressure_pressure',
        'vl_comp_netoildiff_pressure', 'bge_blowerdischarge_pressure', 'bge_pressurerun_target_sp',
        'vl_comp_suction_pressure', 'vl_comp_filterinlet_pressure', 'bge_blowersuction_pressure',
        'vl_comp_discharge_pressure', 'bge_skiddischarge_pressure', 'vl_comp_filteroutlet_pressure',
        'vl_comp_suctionpressure_sp'
    ]

    # Step 5: Analyze each group; each call returns the columns that pass
    # the missing-value filter.
    valid_energy_cols = analyze_column_group(df, energy_cols, "Energy Output")
    valid_methane_cols = analyze_column_group(df, methane_cols, "Methane Measurements")
    valid_temp_cols = analyze_column_group(df, temp_cols, "Temperature Readings")
    valid_pressure_cols = analyze_column_group(df, pressure_cols, "Pressure Readings")

    # Step 6: Correlation analysis across one representative column per group
    correlation_analysis(df, valid_energy_cols, valid_methane_cols, valid_temp_cols, valid_pressure_cols)

    # Optional step (currently disabled): Sweetviz auto-profiling
    # logger.info("Generating Sweetviz report...")
    # report = sv.analyze(df)
    # report.show_html('sweetviz_report.html')
    # logger.info("Sweetviz report saved as sweetviz_report.html")

    # Single completion message; the profiling step above is disabled, so it
    # must not be reported as done.
    logger.info("Missing data handling and summary statistics analysis complete.")

2025-04-09 21:09:24,438 - INFO - Reading data from S3 path: s3://sagemaker-us-east-2-426179662034/canvas/processed/facility_merge_processed.csv
  df: pd.DataFrame = parser_func(f, **pandas_kwargs)
  df[col] = df[col].fillna(df[col].mode()[0])
2025-04-09 21:09:51,454 - INFO - Missing values handled: numeric columns filled with median and categorical columns filled with mode.
2025-04-09 21:09:51,538 - INFO - Total missing values after imputation: 0
2025-04-09 21:10:21,621 - INFO - Cleaned data written to s3://sagemaker-us-east-2-426179662034/canvas/processed/facilities_cleaned.csv
2025-04-09 21:10:21,622 - INFO - Data types after imputation:
2025-04-09 21:10:21,623 - INFO - bge_accumruntime                  float64
bge_inletvalve_opencmd               bool
bge_blowersuction_temp            float64
ch800_chiller_runtimer            float64
vl_comp_capacityslide_position    float64
                                   ...   
dig_level1_scaled                 float64
duration_min             

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bge_accumruntime,333316.0,10329.865275,4.338124e+03,2.046000e+03,6971.000000,10034.000000,14307.000000,1.792700e+04
bge_blowersuction_temp,333316.0,68.600553,2.025531e+01,2.730000e+01,52.500000,65.099998,80.199997,1.370000e+02
ch800_chiller_runtimer,333316.0,0.652054,1.660497e+00,0.000000e+00,0.000000,0.000000,0.000000,5.000000e+00
vl_comp_capacityslide_position,333316.0,83.734207,3.089826e+01,-4.347826e+00,82.673912,100.000000,100.000000,1.000000e+02
vl_comp_suctionpressure_sp,333316.0,0.789488,2.356497e-01,5.000000e-01,0.610000,0.740000,1.008100,1.500001e+00
...,...,...,...,...,...,...,...,...
dig_level1_scaled,333316.0,26.282423,6.308694e+00,2.352941e+00,26.823528,26.823528,26.823528,6.734118e+01
duration_min,333316.0,3.183664,4.252597e+01,1.500000e-07,1.650013,3.250242,4.983259,2.441342e+04
methane_smooth,333316.0,72.664823,3.840550e+00,0.000000e+00,70.010002,72.580002,75.779999,9.763000e+01
flow_smooth,333316.0,68.859817,3.255134e+01,0.000000e+00,56.689999,70.449997,90.639999,2.068600e+02


2025-04-09 21:10:22,737 - INFO - --- Energy Output Analysis ---
2025-04-09 21:10:22,744 - INFO - Missing values per column:
2025-04-09 21:10:22,744 - INFO - inr_fc_yestenergy_real     0
inr_fc_todayenergy_real    0
energy_output_btu          0
dtype: int64
2025-04-09 21:10:22,745 - INFO - Basic statistics:


Unnamed: 0,inr_fc_yestenergy_real,inr_fc_todayenergy_real,energy_output_btu
count,333316.0,333316.0,333316.0
mean,73699.55982,36465.025207,163233.2
std,27628.246591,25887.176201,3534114.0
min,0.0,0.0,0.0
25%,60719.34375,16452.899902,46217.3
50%,74569.617188,33115.017578,155465.5
75%,92764.34375,52298.476562,238748.0
max,148371.234375,147996.828125,2036573000.0


2025-04-09 21:10:22,796 - INFO - Data types:
2025-04-09 21:10:22,803 - INFO - inr_fc_yestenergy_real     float64
inr_fc_todayenergy_real    float64
energy_output_btu          float64
dtype: object
2025-04-09 21:10:22,804 - INFO - --- Methane Measurements Analysis ---
2025-04-09 21:10:22,811 - INFO - Missing values per column:
2025-04-09 21:10:22,812 - INFO - methane_percent    0
methane_smooth     0
dtype: int64
2025-04-09 21:10:22,813 - INFO - Basic statistics:


Unnamed: 0,methane_percent,methane_smooth
count,333316.0,333316.0
mean,72.663592,72.664823
std,3.888241,3.84055
min,0.0,0.0
25%,70.0,70.010002
50%,72.580002,72.580002
75%,75.75,75.779999
max,97.629997,97.629997


2025-04-09 21:10:22,847 - INFO - Data types:
2025-04-09 21:10:22,852 - INFO - methane_percent    float64
methane_smooth     float64
dtype: object
2025-04-09 21:10:22,853 - INFO - --- Temperature Readings Analysis ---
2025-04-09 21:10:22,875 - INFO - Missing values per column:
2025-04-09 21:10:22,876 - INFO - bge_h2soutlet_temp              0
vl_comp_oilinjection_temp       0
vl_comp_suction_temp            0
vl_comp_oilcooleroutlet_temp    0
vl_comp_aftercooloutlet_temp    0
vl_comp_discharge_temp          0
abb_gc_outletstream_temp        0
vl_comp_oilseperator_temp       0
bge_reheateroutlet_temp         0
bge_blowersuction_temp          0
dtype: int64
2025-04-09 21:10:22,877 - INFO - Basic statistics:


Unnamed: 0,bge_h2soutlet_temp,vl_comp_oilinjection_temp,vl_comp_suction_temp,vl_comp_oilcooleroutlet_temp,vl_comp_aftercooloutlet_temp,vl_comp_discharge_temp,abb_gc_outletstream_temp,vl_comp_oilseperator_temp,bge_reheateroutlet_temp,bge_blowersuction_temp
count,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0
mean,66.429568,166.270981,89.430261,180.488154,76.687501,201.929137,71.249801,188.523122,44.706933,68.600553
std,16.77833,44.186977,23.074651,44.454277,18.888383,43.576752,16.448567,24.137441,7.598279,20.25531
min,7.8,-76.900002,-76.900002,-76.900002,26.719246,34.728127,27.4,43.941864,28.889999,27.299999
25%,53.099998,140.473633,69.85849,193.629883,63.372753,210.616241,59.560001,189.401001,40.299999,52.5
50%,64.5,193.795242,88.687622,195.307251,76.922379,216.546143,70.940002,195.189133,42.700001,65.099998
75%,78.889999,196.464874,106.62001,198.118622,88.545883,220.963989,81.419998,200.174011,47.099998,80.199997
max,120.5,229.705231,161.419998,332.521271,151.477753,254.298859,137.639999,223.184753,103.800003,137.0


2025-04-09 21:10:23,066 - INFO - Data types:
2025-04-09 21:10:23,084 - INFO - bge_h2soutlet_temp              float64
vl_comp_oilinjection_temp       float64
vl_comp_suction_temp            float64
vl_comp_oilcooleroutlet_temp    float64
vl_comp_aftercooloutlet_temp    float64
vl_comp_discharge_temp          float64
abb_gc_outletstream_temp        float64
vl_comp_oilseperator_temp       float64
bge_reheateroutlet_temp         float64
bge_blowersuction_temp          float64
dtype: object
2025-04-09 21:10:23,086 - INFO - --- Pressure Readings Analysis ---
2025-04-09 21:10:23,119 - INFO - Missing values per column:
2025-04-09 21:10:23,119 - INFO - abb_gc_outletstream_pressure      0
vl_comp_oilfilterdiff_pressure    0
bge_airpressure_pressure          0
vl_comp_netoildiff_pressure       0
bge_blowerdischarge_pressure      0
bge_pressurerun_target_sp         0
vl_comp_suction_pressure          0
vl_comp_filterinlet_pressure      0
bge_blowersuction_pressure        0
vl_comp_discharge_press

Unnamed: 0,abb_gc_outletstream_pressure,vl_comp_oilfilterdiff_pressure,bge_airpressure_pressure,vl_comp_netoildiff_pressure,bge_blowerdischarge_pressure,bge_pressurerun_target_sp,vl_comp_suction_pressure,vl_comp_filterinlet_pressure,bge_blowersuction_pressure,vl_comp_discharge_pressure,bge_skiddischarge_pressure,vl_comp_filteroutlet_pressure,vl_comp_suctionpressure_sp
count,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0,333316.0
mean,82.337131,3.700343,135.218214,75.574711,22.455851,25.904596,0.778752,80.054111,0.452167,86.649752,62.72442,76.353574,0.789488
std,16.117675,4.05693,14.754235,10.381441,12.021241,10.160272,0.388732,11.472993,3.96889,13.542667,8.391642,10.529734,0.23565
min,-0.74,-75.366455,0.08,-35.853165,-1.25,1.0,-35.420799,-0.447521,-17.24,-35.424995,-4.35,-35.445961,0.5
25%,79.510002,0.790154,125.160004,74.589722,14.9,18.799999,0.666255,80.017296,-0.07,85.142685,55.189999,75.410912,0.61
50%,87.090004,2.976128,135.880005,77.675751,22.25,26.0,0.769884,81.675285,0.06,90.017097,61.16,78.41967,0.74
75%,90.440002,4.168003,147.210007,80.373253,29.58,31.299999,0.951225,85.405785,0.16,92.966377,69.889999,81.194977,1.0081
max,247.62001,85.679909,162.149994,135.932587,72.360001,70.5,3.179141,143.928009,68.379997,145.581955,89.849998,137.738708,1.500001


2025-04-09 21:10:23,348 - INFO - Data types:
2025-04-09 21:10:23,372 - INFO - abb_gc_outletstream_pressure      float64
vl_comp_oilfilterdiff_pressure    float64
bge_airpressure_pressure          float64
vl_comp_netoildiff_pressure       float64
bge_blowerdischarge_pressure      float64
bge_pressurerun_target_sp         float64
vl_comp_suction_pressure          float64
vl_comp_filterinlet_pressure      float64
bge_blowersuction_pressure        float64
vl_comp_discharge_pressure        float64
bge_skiddischarge_pressure        float64
vl_comp_filteroutlet_pressure     float64
vl_comp_suctionpressure_sp        float64
dtype: object
2025-04-09 21:10:23,373 - INFO - --- Correlation Analysis ---
2025-04-09 21:10:23,374 - INFO - Correlation between representative columns:


Unnamed: 0,inr_fc_yestenergy_real,methane_percent,bge_h2soutlet_temp,abb_gc_outletstream_pressure
inr_fc_yestenergy_real,1.0,-0.354512,0.232757,0.347154
methane_percent,-0.354512,1.0,-0.479008,-0.010825
bge_h2soutlet_temp,0.232757,-0.479008,1.0,0.082352
abb_gc_outletstream_pressure,0.347154,-0.010825,0.082352,1.0


2025-04-09 21:10:23,403 - INFO - Missing data handling, profiling, and export complete.
2025-04-09 21:10:23,404 - INFO - Missing data handling and summary statistics analysis complete.
