In [6]:
import os
import pandas as pd
import numpy as np
import miceforest as mf
import shutil

In [None]:
# Count the number of files with more than 70% non-NaN values in column 4
def count_non_nan_values(directory, total_rows=35040, threshold_pct=70):
    """Count the CSV files in ``directory`` whose 4th column is sufficiently filled.

    For every ``*.csv`` file directly inside ``directory`` the number of
    non-NaN entries in the 4th column (positional index 3) is expressed as a
    percentage of ``total_rows``. A file is counted when that percentage is
    strictly greater than ``threshold_pct``.

    Parameters
    ----------
    directory : str
        Folder that is scanned (non-recursively) for ``.csv`` files.
    total_rows : int, default 35040
        Row count used as the denominator of the percentage. It is a fixed
        reference value, not the actual length of each file.
    threshold_pct : float, default 70
        Percentage that must be exceeded for a file to be counted.

    Returns
    -------
    int
        Number of files whose fill percentage exceeds ``threshold_pct``.

    Raises
    ------
    ValueError
        If ``total_rows`` is not positive.
    """
    if total_rows <= 0:
        raise ValueError("total_rows must be a positive number")

    count = 0

    # Sorted so files are visited in a deterministic order on every platform.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(directory, filename))

        # Series.count() ignores NaN, so this is the number of filled cells
        # in the 4th column (0-based position 3).
        non_nan_count = df.iloc[:, 3].count()
        percentage = non_nan_count / total_rows * 100

        # Strict comparison: a file at exactly the threshold is not counted.
        if percentage > threshold_pct:
            count += 1

    return count

# Directory containing the CSV files to inspect (the 2019 folder)
directory = 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2019'

# Count the files in which more than 70% of column 4 is non-NaN and report the total
result = count_non_nan_values(directory)
print(f"Number of files with more than 70% non-NaN values in column 4: {result}")


In [None]:
# Impute missing values using MICE algorithm
# Scan CSV files in input directory
# For each file:
#   If at least 70% of the expected rows have a value in column 4:
#     Impute missing values with MICE
#     Save imputed dataset to output directory
# Print status messages


def impute_missing_values(input_directory, output_directory, total_rows=35040,
                          threshold_pct=70, iterations=15, random_state=1991):
    """Impute missing values in every sufficiently filled CSV using MICE.

    Each ``*.csv`` file directly inside ``input_directory`` is read. The
    number of non-NaN entries in its 4th column (positional index 3) is
    expressed as a percentage of ``total_rows``; files below
    ``threshold_pct`` are skipped with a message. For the remaining files all
    columns except ``'DateTimeStamp'`` are imputed with a miceforest
    ``ImputationKernel``, and the result is written to ``output_directory``
    with ``'_emptyrows'`` in the file name replaced by ``'_impute'``.

    Parameters
    ----------
    input_directory : str
        Folder scanned (non-recursively) for ``.csv`` files.
    output_directory : str
        Folder the imputed files are written to. Created if missing.
    total_rows : int, default 35040
        Fixed row count used as the denominator of the fill percentage.
    threshold_pct : float, default 70
        Minimum fill percentage (inclusive) required to impute a file.
    iterations : int, default 15
        Number of MICE iterations to run per file.
    random_state : int, default 1991
        Seed passed to the imputation kernel.

    Raises
    ------
    ValueError
        If ``total_rows`` is not positive.
    """
    if total_rows <= 0:
        raise ValueError("total_rows must be a positive number")

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so files are processed in a deterministic order.
    file_paths = [
        os.path.join(input_directory, filename)
        for filename in sorted(os.listdir(input_directory))
        if filename.endswith('.csv')
    ]

    for file_path in file_paths:
        df = pd.read_csv(file_path)

        # Series.count() ignores NaN: number of filled cells in the 4th column.
        rows_with_data = df.iloc[:, 3].count()
        filled_rows_percentage = (rows_with_data / total_rows) * 100.0

        if filled_rows_percentage < threshold_pct:
            print(f"Skipping imputation for '{file_path}'")
            continue

        # Everything except the timestamp is handed to the imputer.
        feature_df = df.drop(columns=['DateTimeStamp'])

        # NOTE(review): passing a numpy array and `save_all_iterations` follow
        # the miceforest API this notebook was written against -- confirm
        # against the installed miceforest version.
        kds = mf.ImputationKernel(
            feature_df.values,
            save_all_iterations=True,
            random_state=random_state,
        )
        kds.mice(iterations)
        completed_data = kds.complete_data()

        # Label the imputed columns with the names of the columns that were
        # actually imputed, rather than assuming the timestamp was column 0.
        completed_df = pd.DataFrame(completed_data, columns=feature_df.columns)
        completed_df.insert(0, 'DateTimeStamp', df['DateTimeStamp'])

        output_file_name = os.path.basename(file_path).replace('_emptyrows', '_impute')
        output_file_path = os.path.join(output_directory, output_file_name)
        completed_df.to_csv(output_file_path, index=False)
        print(f"Imputed values saved to '{output_file_path}'.")

# Input: the 2018 folder of fixed-length CSVs; output: folder for the MICE-imputed copies
input_directory = 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018'
output_directory = 'E:/xie/Sensor Files/4. impute/full MICE imputated/2018'

# Run the MICE imputation over every CSV in the input folder
impute_missing_values(input_directory, output_directory)


In [None]:
"""
Description:
This script fills missing values in CSV files with either 0 or -1 based on user preference. It processes each CSV file in the input directory, checks the percentage of filled rows, and if it meets the specified threshold, it fills missing values with the chosen value (0 or -1) and saves the completed datasets to the output directory.

Parameters:
- input_directory: The directory containing CSV files with missing values.
- output_directory: The directory where completed datasets will be saved after filling missing values.
- fill_value: The value used to fill missing values. Can be either 0 or -1.
"""

import os
import pandas as pd

def fill_missing_values(input_directory, output_directory, fill_value,
                        total_rows=35040, threshold_pct=70):
    """Fill missing values with a constant in every sufficiently filled CSV.

    Each ``*.csv`` file directly inside ``input_directory`` is read. The
    number of non-NaN entries in its 4th column (positional index 3) is
    expressed as a percentage of ``total_rows``; files below
    ``threshold_pct`` are skipped with a message. For the remaining files
    every NaN in the frame is replaced by ``fill_value`` and the result is
    written to ``output_directory`` with ``'_emptyrows'`` in the file name
    replaced by ``'_filled'``.

    Parameters
    ----------
    input_directory : str
        Folder scanned (non-recursively) for ``.csv`` files.
    output_directory : str
        Folder the filled files are written to. Created if missing.
    fill_value : scalar
        Value substituted for every missing entry (e.g. ``0`` or ``-1``).
    total_rows : int, default 35040
        Fixed row count used as the denominator of the fill percentage.
    threshold_pct : float, default 70
        Minimum fill percentage (inclusive) required to process a file.

    Raises
    ------
    ValueError
        If ``total_rows`` is not positive.
    """
    if total_rows <= 0:
        raise ValueError("total_rows must be a positive number")

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so files are processed in a deterministic order.
    file_paths = [
        os.path.join(input_directory, filename)
        for filename in sorted(os.listdir(input_directory))
        if filename.endswith('.csv')
    ]

    for file_path in file_paths:
        df = pd.read_csv(file_path)

        # Series.count() ignores NaN: number of filled cells in the 4th column.
        rows_with_data = df.iloc[:, 3].count()
        filled_rows_percentage = (rows_with_data / total_rows) * 100.0

        if filled_rows_percentage < threshold_pct:
            print(f"Skipping filling for '{file_path}'")
            continue

        # fillna applies to every column of the frame, not only column 4.
        filled_df = df.fillna(fill_value)

        output_file_name = os.path.basename(file_path).replace('_emptyrows', '_filled')
        output_file_path = os.path.join(output_directory, output_file_name)
        filled_df.to_csv(output_file_path, index=False)
        print(f"Filled values saved to '{output_file_path}'.")

# Input: the 2018 folder of fixed-length CSVs; output: folder for the constant-filled copies
input_directory = 'E:/xie/Sensor Files/3. filled w empty rows - fixed to 35040 rows/2018'
output_directory = 'E:/xie/Sensor Files/4. impute/full -1 imputated/2018'

# Specify the fill value (0 or -1)
fill_value = -1  # Change this to 0 if desired

# Fill every missing value in the qualifying files with fill_value
fill_missing_values(input_directory, output_directory, fill_value)


In [13]:
# get common data files from both years and copy them to the destination folders

def get_common_csv_files(folder1, folder2):
    """Return the set of ``.csv`` file names that exist in both folders.

    Only the entries directly inside each folder are considered, and they
    are matched by file name.
    """
    def _csv_names(folder):
        # Names (not full paths) of the entries ending in '.csv'.
        return {entry for entry in os.listdir(folder) if entry.endswith('.csv')}

    return _csv_names(folder1) & _csv_names(folder2)

def copy_files(source_folder, destination_folder, files_to_copy):
    """Copy the named files from ``source_folder`` into ``destination_folder``.

    Parameters
    ----------
    source_folder : str
        Folder the files are read from.
    destination_folder : str
        Folder the files are copied into. Created (including parents) if it
        does not exist yet.
    files_to_copy : iterable of str
        File names, relative to ``source_folder``, to copy. Each file keeps
        its name in the destination.
    """
    # shutil.copy2 fails if the target folder is missing, so create it first.
    os.makedirs(destination_folder, exist_ok=True)

    for file in files_to_copy:
        source_path = os.path.join(source_folder, file)
        destination_path = os.path.join(destination_folder, file)
        # copy2 also attempts to preserve file metadata such as timestamps.
        shutil.copy2(source_path, destination_path)

# Replace these paths with your actual paths.
# folder1/folder2 hold the filled CSVs of the two years being compared;
# the destination folders receive only the files present in both years.
folder1_path = 'E:/xie/Sensor Files/4. impute/full -1 imputated/2018'
folder2_path = 'E:/xie/Sensor Files/4. impute/full -1 imputated/2019'
destination_folder1_path = 'E:/xie/Sensor Files/4. impute/full -1 imputated Common Files/2018'
destination_folder2_path = 'E:/xie/Sensor Files/4. impute/full -1 imputated Common Files/2019'

# File names (matched by name only) that exist in both source folders
common_files = get_common_csv_files(folder1_path, folder2_path)

# Copy each year's version of the common files to its own destination folder
copy_files(folder1_path, destination_folder1_path, common_files)
copy_files(folder2_path, destination_folder2_path, common_files)

print("Common CSV files copied successfully.")


Common CSV files copied successfully.
