Copy this notebook into the same folder as the raw data. The filename prefix (`integrated_data`) is currently hard-coded, and so are the filtering rules, which are written as pandas boolean logic. Check both before using it.

What it does:

1. Find the CSV files whose names start with the designated prefix (`integrated_data`).

2. Keep the columns of interest and drop all others.

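3. Drop rows that have no values in the columns of interest: a row is kept only if both `sensor_2` and `sensor_4` have values, or if all of `mv_1`, `mv_2`, `mv_3` and `mv_4` have values.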

4. Remove duplicate entries by timestamp (optional).

In [None]:
import pandas as pd
import os
import logging


# Route INFO-and-above records from the root logger to a log file,
# one "<time> - <level> - <message>" line per record.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='data_cleaning.log',
)

def clean_data(df, columns_to_keep=None):
    """
    Cleans the DataFrame based on specified conditions.

    A row is kept when both sensor_2 and sensor_4 have values, or when all of
    mv_1, mv_2, mv_3 and mv_4 have values. Only the requested columns are
    returned; every other column is dropped.

    Parameters:
        df (pd.DataFrame): The DataFrame to clean. It must contain the sensor
            and mv columns used by the row filter, plus every column listed
            in columns_to_keep.
        columns_to_keep (list of str, optional): Columns to retain, in output
            order. When omitted, the standard set of timestamp, sensor, mv,
            prediction and note columns is used.

    Returns:
        pd.DataFrame: The cleaned DataFrame, as an independent copy.

    Raises:
        KeyError: If a column needed by the filter or listed in
            columns_to_keep is missing from df.
    """
    if columns_to_keep is None:
        columns_to_keep = [
            'timestamp', 'sensor_2', 'sensor_4',
            'mv_1', 'mv_2', 'mv_3', 'mv_4',
            'fhp', 'prediction', 'notes',
            'bad_posture_command', 'model_threshold', 'model_notes',
        ]

    # Build the row mask on the input frame so the filter does not depend on
    # which columns the caller chose to keep.
    has_both_sensors = df['sensor_2'].notna() & df['sensor_4'].notna()
    has_all_mv = df[['mv_1', 'mv_2', 'mv_3', 'mv_4']].notna().all(axis=1)
    keep_row = has_both_sensors | has_all_mv

    # Select rows and columns in one step and return an explicit copy, so that
    # callers can modify the result without it being treated as a slice of df.
    return df.loc[keep_row, list(columns_to_keep)].copy()

def remove_duplicates(df):
    """
    Drops rows that repeat an earlier 'timestamp' value.

    Every row involved in a timestamp collision is printed and logged first;
    then only the first row for each timestamp is kept. The number of rows
    dropped is logged as well.

    Parameters:
        df (pd.DataFrame): The DataFrame to de-duplicate; it must have a
            'timestamp' column.

    Returns:
        pd.DataFrame: A DataFrame with one row per timestamp value.
    """
    # keep=False flags every member of a duplicate group, not just the extras,
    # so the report shows the row that is kept alongside the ones dropped.
    collision_mask = df.duplicated(subset='timestamp', keep=False)

    if collision_mask.any():
        colliding_rows = df[collision_mask]
        print("\nDuplicate rows:")
        print(colliding_rows)
        logging.info(f"Duplicate rows found:\n{colliding_rows}")

    deduped = df.drop_duplicates(subset='timestamp', keep='first')
    dropped = len(df) - len(deduped)
    logging.info(f"Removed duplicates: {dropped} rows dropped.")
    return deduped

def _dedupe_by_timestamp(df):
    """
    Applies the module-level remove_duplicates() function to df.

    process_files() has a parameter that is also named remove_duplicates, so
    inside that function the name refers to the boolean flag, not the
    function. Calling through this helper avoids the name clash.
    """
    return remove_duplicates(df)


def process_files(directory, remove_duplicates=True):
    """
    Processes all CSV files in the specified directory. Remove duplicates as default option.

    Each file whose name starts with 'integrated_data' and ends with '.csv'
    is read, cleaned with clean_data(), optionally de-duplicated by timestamp,
    and written back to the same directory as 'cleaned_<original name>'.
    Row counts before and after cleaning are logged.

    Parameters:
        directory (str): The directory containing the CSV files.
        remove_duplicates (bool): When true (the default), rows that repeat an
            earlier timestamp are dropped after cleaning; when false, that
            step is skipped.
    """
    # Sort the listing so files are processed (and logged) in a stable order.
    for filename in sorted(os.listdir(directory)):
        if not (filename.startswith('integrated_data') and filename.endswith('.csv')):
            continue

        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)

        original_row_count = df.shape[0]
        logging.info(f"Processing {filename}: Original row count: {original_row_count}")

        cleaned_df = clean_data(df)
        # Here `remove_duplicates` is the boolean flag; the function of the
        # same name is reached through the helper above.
        if remove_duplicates:
            cleaned_df = _dedupe_by_timestamp(cleaned_df)

        # Log the cleaned number of rows
        cleaned_row_count = cleaned_df.shape[0]
        logging.info(f"After cleaning {filename}: Cleaned row count: {cleaned_row_count}")

        cleaned_filename = f"cleaned_{filename}"
        cleaned_file_path = os.path.join(directory, cleaned_filename)
        cleaned_df.to_csv(cleaned_file_path, index=False)

        logging.info(f"Saved cleaned data to {cleaned_file_path}\n")





In [None]:
# Clean every matching CSV found in the current working directory.
directory = './'  # Currently set to the same location
process_files(directory=directory)