In [81]:
import os
import glob
import pandas as pd
import numpy as np

input_folder = '/Users/geunchansong/Documents/TU:d/Year 2/Graduation Thesis/Weather Sensor/Processed'
output_folder = '/Users/geunchansong/Documents/TU:d/Year 2/Graduation Thesis/Weather Sensor/Cleaned Weather Sensor'

In [71]:
# Clean temperature data
def clean_temp_data(file_path, output_path, sensor_id_col='Serial Number', temp_col='Air Temperature'):
    # Load the data
    print(f"Working on file: {os.path.basename(file_path)}")
    df = pd.read_csv(file_path)
    
    # Get initial row count
    starting_rows = len(df)
    
    # Remove missing temps
    df = df.dropna(subset=[temp_col])
    print(f"Dropped {starting_rows - len(df)} rows with missing temperatures")
    
    # Set up outlier detection with IQR method
    q1 = df[temp_col].quantile(0.25)
    q3 = df[temp_col].quantile(0.75)
    iqr = q3 - q1
    
    # Calculate bounds - using standard 1.5 * IQR
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    
    # Filter out the temperature outliers
    good_data = df[(df[temp_col] >= lower) & (df[temp_col] <= upper)]
    
    # Save cleaned data
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    good_data.to_csv(output_path, index=False)
    
    return good_data

In [75]:
# Process all sensor files
def process_sensor_files(in_folder, out_folder):
    # Check input folder exists
    if not os.path.exists(in_folder):
        print(f"Input folder not found: {in_folder}")
        return
    
    # Get all csv files
    all_files = glob.glob(os.path.join(in_folder, "*.csv"))
    
    if len(all_files) == 0:
        print(f"No CSV files found in {in_folder}")
        return
    
    # Sort them in order
    all_files.sort()
    print(f"Found {len(all_files)} files to process")
    
    # Keep track of results
    results = []
    
    # Process each file
    for f in all_files:
        filename = os.path.basename(f)
        out_file = os.path.join(out_folder, f"clean_{filename}")
        
        try:
            # Do the cleaning
            df_clean = clean_temp_data(f, out_file)
            
            # Record stats
            results.append({
                'file': filename,
                'original_rows': len(pd.read_csv(f)),
                'clean_rows': len(df_clean),
                'kept_pct': round(len(df_clean) / len(pd.read_csv(f)) * 100, 1)
            })
            
        except Exception as e:
            print(f"Error processing {filename}: {e}")
        
        print("---")

In [79]:
# Part 4: Main code to run - this should be the last part
if __name__ == "__main__":
    # Use the global variables defined at the top
    process_sensor_files(input_folder, output_folder)

Working on file: 2020_12.csv
Dropped 7 rows with missing temperatures
---
Working on file: 2020_4.csv
Dropped 0 rows with missing temperatures
---
Working on file: 2020_5.csv
Dropped 0 rows with missing temperatures
---
Working on file: 2021_10.csv
Dropped 1 rows with missing temperatures
---
Working on file: 2021_2.csv
Dropped 5 rows with missing temperatures
---
Working on file: 2021_3.csv
Dropped 5 rows with missing temperatures
---
Working on file: 2021_4.csv
Dropped 3 rows with missing temperatures
---
Working on file: 2021_6.csv
Dropped 0 rows with missing temperatures
---
Working on file: 2021_8.csv
Dropped 0 rows with missing temperatures
---
Working on file: 2021_9.csv
Dropped 0 rows with missing temperatures
---
Working on file: 2022_1.csv
Dropped 0 rows with missing temperatures
---
Working on file: 2022_10.csv
Dropped 25 rows with missing temperatures
---
Working on file: 2022_11.csv
Dropped 26 rows with missing temperatures
---
Working on file: 2022_12.csv
Dropped 25 rows 