In [21]:
import os
import pandas as pd
import miceforest as mf

In [None]:
def count_non_nan_values(directory, total_rows=35040, threshold=70, column_index=3):
    """Count the CSV files in ``directory`` whose target column is sufficiently filled.

    For every file directly inside ``directory`` whose name ends in ``.csv``,
    the number of non-NaN entries in the column at ``column_index`` is
    expressed as a percentage of ``total_rows``. A file is counted when that
    percentage is strictly greater than ``threshold``.

    Parameters
    ----------
    directory : str
        Folder to scan. Sub-folders are not visited.
    total_rows : int, default 35040
        Denominator of the fill percentage, i.e. the number of rows each file
        is expected to contain. The actual length of the file is not checked.
    threshold : float, default 70
        Percentage (0-100) that the fill ratio must exceed for a file to be
        counted. The comparison is strict, so a file at exactly ``threshold``
        percent is not counted.
    column_index : int, default 3
        Zero-based position of the column to inspect (3 is the fourth column).

    Returns
    -------
    int
        Number of CSV files whose fill percentage exceeds ``threshold``.

    Raises
    ------
    ValueError
        If ``total_rows`` is not positive.
    IndexError
        If a CSV file has no column at ``column_index``.
    """
    if total_rows <= 0:
        raise ValueError(f"total_rows must be positive, got {total_rows}")

    count = 0

    for filename in os.listdir(directory):
        if not filename.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(directory, filename))

        # Series.count() ignores NaN, so this is the number of populated cells.
        non_nan_count = df.iloc[:, column_index].count()
        percentage = non_nan_count / total_rows * 100

        if percentage > threshold:
            count += 1

    return count

# Directory containing the CSV files to inspect (2019 folder)
directory = 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2019'

# Count the files whose column 4 is more than 70% non-NaN (a percentage, not a raw count of 70)
result = count_non_nan_values(directory)
print(f"Number of files with more than 70% non-NaN values in column 4: {result}")


In [37]:
import os
import pandas as pd
import numpy as np
import miceforest as mf

def impute_missing_values(input_directory, output_directory, total_rows=35040,
                          min_filled_percentage=70):
    """Impute missing values with MICE for every sufficiently filled CSV file.

    Each ``*.csv`` file directly inside ``input_directory`` is read, and the
    share of non-NaN entries in its fourth column (relative to ``total_rows``)
    is computed. Files below ``min_filled_percentage`` are skipped with a
    message. For the others, every column except ``DateTimeStamp`` is passed
    to a miceforest ``ImputationKernel`` (15 MICE iterations, fixed random
    state), and the completed data is written to ``output_directory`` with
    ``_emptyrows`` replaced by ``_impute`` in the file name.

    Parameters
    ----------
    input_directory : str
        Folder containing the source CSV files. Sub-folders are not visited.
    output_directory : str
        Folder receiving the imputed CSV files. It is created if missing.
    total_rows : int, default 35040
        Denominator of the fill percentage, i.e. the number of rows each file
        is expected to contain.
    min_filled_percentage : float, default 70
        Files whose fill percentage is strictly below this value are skipped.

    Returns
    -------
    None
        Results are written to disk and progress is printed.

    Raises
    ------
    ValueError
        If ``total_rows`` is not positive.
    KeyError
        If a file that qualifies for imputation has no ``DateTimeStamp`` column.
    """
    if total_rows <= 0:
        raise ValueError(f"total_rows must be positive, got {total_rows}")

    # Create the destination up front so a missing folder fails (or is fixed)
    # before any expensive imputation has been run.
    os.makedirs(output_directory, exist_ok=True)

    # os.listdir order is unspecified; sort for a reproducible processing order.
    csv_names = sorted(
        filename for filename in os.listdir(input_directory) if filename.endswith('.csv')
    )

    for filename in csv_names:
        file_path = os.path.join(input_directory, filename)
        df = pd.read_csv(file_path)

        # Series.count() ignores NaN; column index 3 is the fourth column.
        rows_with_data = df.iloc[:, 3].count()
        filled_rows_percentage = (rows_with_data / total_rows) * 100.0

        if filled_rows_percentage < min_filled_percentage:
            print(f"Skipping imputation for '{file_path}'")
            continue

        # Take the labels from the same selection that feeds the kernel, so
        # the completed array is relabelled correctly wherever the
        # 'DateTimeStamp' column sits in the file.
        feature_columns = df.columns.drop('DateTimeStamp')
        data_array = df[feature_columns].values

        # NOTE(review): a NumPy array is passed to the kernel and
        # `save_all_iterations` is used; confirm both are still supported by
        # the installed miceforest version before upgrading it.
        kds = mf.ImputationKernel(data_array, save_all_iterations=True, random_state=1991)

        # Run the MICE algorithm for 15 iterations
        kds.mice(15)

        completed_data = kds.complete_data()

        # Rebuild a DataFrame and put the timestamp back as the first column.
        completed_df = pd.DataFrame(completed_data, columns=feature_columns)
        completed_df.insert(0, 'DateTimeStamp', df['DateTimeStamp'])

        output_file_name = filename.replace('_emptyrows', '_impute')
        output_file_path = os.path.join(output_directory, output_file_name)
        completed_df.to_csv(output_file_path, index=False)
        print(f"Imputed values saved to '{output_file_path}'.")

# Input folder (2018 source CSVs) and output folder (receives the imputed CSVs)
input_directory = 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018'
output_directory = 'E:/xie/Sensor Files/4. impute/full MICE imputated/2018'

# Run the imputation; one "Skipping ..." or "Imputed values saved ..." line is printed per CSV file
impute_missing_values(input_directory, output_directory)


Skipping imputation for 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018\10.1.267_emptyrows.csv'
Imputed values saved to 'E:/xie/Sensor Files/4. impute/full MICE imputated/2018\10.2.268_impute.csv'.
Skipping imputation for 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018\101.1.35_emptyrows.csv'
Skipping imputation for 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018\102.1.40_emptyrows.csv'
Imputed values saved to 'E:/xie/Sensor Files/4. impute/full MICE imputated/2018\102.2.37_impute.csv'.
Skipping imputation for 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018\103.1.40_emptyrows.csv'
Skipping imputation for 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018\103.2.40_emptyrows.csv'
Imputed values saved to 'E:/xie/Sensor Files/4. impute/full MICE imputated/2018\109.1.37_impute.csv'.
Skipping imputation for 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018\11.1