# In [1]:
import os
import pandas as pd
import dask.bag as db
from dask.distributed import Client
import shutil

# Paths used by the scan:
#   directory             - folder whose *.csv files are inspected
#   output_path           - CSV report listing the names of files found to contain NaN
#   destination_directory - folder that files containing NaN are moved into
directory = r'D:\SamsungSTF\Processed_Data\TripByTrip'
output_path = r'C:\Users\BSL\Desktop\result_files_nan.csv'
destination_directory = r'D:\SamsungSTF\Processed_Data\MissingData_NAN'

def check_file(filepath):
    """Return the file's base name if a monitored column contains NaN.

    The monitored columns are 'speed', 'acceleration', 'Power_IV' and
    'Power'. Only the monitored columns that actually exist in the file are
    inspected, so a file that carries just a subset of them is still checked
    instead of failing on the missing ones.

    Args:
        filepath: Path of the CSV file to inspect.

    Returns:
        The base name of ``filepath`` when at least one present monitored
        column holds a missing value; ``None`` when no monitored column is
        present, when none of them holds a missing value, or when the file
        could not be read (the error is printed).
    """
    monitored_columns = ['speed', 'acceleration', 'Power_IV', 'Power']

    # Best-effort read: one unreadable file must not abort the whole scan.
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        print(f"Error processing file {filepath}: {e}")
        return None

    # Selecting all four columns unconditionally raises KeyError when some
    # are absent, so restrict the check to the columns that are present.
    present_columns = [col for col in monitored_columns if col in df.columns]
    if present_columns and df[present_columns].isnull().any().any():
        return os.path.basename(filepath)
    return None

def main():
    """Scan the CSV files in ``directory`` for NaN values and move the hits.

    Every ``*.csv`` file directly inside ``directory`` is passed to
    ``check_file`` through a Dask bag. The names of the files it reports are
    sorted, written to ``output_path`` as a one-column CSV ('filename'), and
    the files themselves are moved into ``destination_directory``.

    Returns:
        Sorted list of the base names of the files that were reported.
    """
    # Get list of all CSV files in the directory
    csv_files = [
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith(".csv")
    ]

    # Using the client as a context manager guarantees it is closed even if
    # the computation raises; the dashboard port can be any available port.
    with Client(dashboard_address=':7869'):
        file_bag = db.from_sequence(csv_files, npartitions=32)
        results = file_bag.map(check_file).compute()

    # check_file returns None for files without NaN (or unreadable files)
    files_with_nan_values = sorted(result for result in results if result)

    # Save results to a CSV file
    result_df = pd.DataFrame(files_with_nan_values, columns=['filename'])
    result_df.to_csv(output_path, index=False)

    # Create the destination once, and only when there is something to move
    if files_with_nan_values:
        os.makedirs(destination_directory, exist_ok=True)

    # Move files with NaN values to the destination directory
    for filename in files_with_nan_values:
        source_path = os.path.join(directory, filename)
        destination_path = os.path.join(destination_directory, filename)
        shutil.move(source_path, destination_path)

    return files_with_nan_values

# Run the scan only when executed as a script, not on import.
# NOTE(review): the guard presumably also matters because the Dask client
# may start worker processes that re-import this module — confirm.
if __name__ == "__main__":
    result_files = main()
    # Show the names of the files that were reported and moved
    print(result_files)


# Output: []
