In [1]:
import os
import pandas as pd
import dask.bag as db
from dask.distributed import Client
import shutil

# Define the directories and output path.
# NOTE: these are absolute, machine-specific Windows paths; adjust them before
# running on another machine.
# Folder whose top-level *.csv files are scanned (os.listdir, not recursive).
directory = r'D:\SamsungSTF\Processed_Data\TripByTrip'
# CSV report that receives the names of the flagged files (one 'filename' column).
output_path = r'C:\Users\BSL\Desktop\result_files_acc.csv'
# Folder that the flagged files are moved into.
destination_directory = r'D:\SamsungSTF\Processed_Data\MissingData'

# Function to check if a file contains acceleration values whose magnitude
# reaches a threshold (default 9, the cut-off the code has always applied)
def check_file(filepath, threshold=9):
    """Return the file's base name if any |acceleration| value is >= threshold.

    Args:
        filepath: Path to a CSV file readable by ``pandas.read_csv``.
        threshold: Inclusive cut-off compared against the absolute value of
            the ``acceleration`` column. Defaults to 9, the value that was
            previously hard-coded here.
            NOTE(review): the old comment mentioned 9.8 while the code used
            9 -- confirm which is intended; pass ``threshold=9.8`` to use it.

    Returns:
        ``os.path.basename(filepath)`` when the file is flagged, otherwise
        ``None`` (no ``acceleration`` column, no value at or above the
        threshold, or the file could not be read/processed).
    """
    try:
        df = pd.read_csv(filepath)
        if 'acceleration' in df.columns and (df['acceleration'].abs() >= threshold).any():
            return os.path.basename(filepath)
    except Exception as e:
        # Best-effort scan: one unreadable or malformed file must not abort
        # the whole batch, so report it and treat the file as "not flagged".
        print(f"Error processing file {filepath}: {e}")
    return None

def main():
    """Scan the input folder, report flagged CSV files and move them away.

    Steps:
      1. List the ``*.csv`` files directly inside ``directory``.
      2. Run ``check_file`` on each of them in parallel through a Dask bag.
      3. Write the sorted names of the flagged files to ``output_path``.
      4. Move every flagged file into ``destination_directory``.

    Returns:
        Sorted list of the base names of the flagged (and moved) files.
    """
    # Get list of all CSV files in the directory
    csv_files = [
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith(".csv")
    ]

    # Start a Dask client with a specified port (any available port works).
    # The context manager shuts the client and the local cluster it started
    # down again, so workers and the dashboard port are not leaked -- this
    # matters when the cell/script is run repeatedly.
    with Client(dashboard_address=':7869'):
        # Create a Dask bag of the file paths and map check_file over it
        file_bag = db.from_sequence(csv_files, npartitions=32)
        results = file_bag.map(check_file).compute()

    # Drop the None results (files that were not flagged) and sort the rest
    files_with_high_acceleration = sorted(result for result in results if result)

    # Save results to a CSV file
    result_df = pd.DataFrame(files_with_high_acceleration, columns=['filename'])
    result_df.to_csv(output_path, index=False)

    # Move files with high acceleration to the destination directory
    if files_with_high_acceleration:
        # shutil.move fails when the target folder is missing, so create it
        # up front instead of crashing after the report was already written.
        os.makedirs(destination_directory, exist_ok=True)
    for filename in files_with_high_acceleration:
        source_path = os.path.join(directory, filename)
        destination_path = os.path.join(destination_directory, filename)
        shutil.move(source_path, destination_path)

    return files_with_high_acceleration

# Run the scan only when this file is executed directly (not when imported).
if __name__ == "__main__":
    # Keep the returned list in a module-level name so it can be inspected afterwards.
    result_files = main()
    print(result_files)


['bms_01241124056-2024-04-trip-2.csv', 'bms_01241124056-2024-04-trip-5.csv', 'bms_01241228122-2023-02-trip-10.csv', 'bms_01241228122-2023-02-trip-16.csv', 'bms_01241228122-2023-02-trip-17.csv', 'bms_01241228122-2023-02-trip-23.csv', 'bms_01241228122-2023-02-trip-24.csv', 'bms_01241228122-2023-02-trip-26.csv', 'bms_01241228122-2023-02-trip-3.csv', 'bms_01241228122-2023-02-trip-32.csv', 'bms_01241228122-2023-02-trip-33.csv', 'bms_01241228122-2023-02-trip-37.csv', 'bms_01241228122-2023-02-trip-38.csv', 'bms_01241228122-2023-02-trip-44.csv', 'bms_01241228122-2023-02-trip-48.csv', 'bms_01241228122-2023-02-trip-49.csv', 'bms_01241228122-2023-02-trip-58.csv', 'bms_01241228122-2023-03-trip-23.csv', 'bms_01241228122-2023-03-trip-37.csv', 'bms_01241228122-2023-03-trip-38.csv', 'bms_01241228122-2023-03-trip-53.csv', 'bms_01241228122-2023-03-trip-8.csv', 'bms_01241228122-2023-04-trip-1.csv', 'bms_01241228122-2023-04-trip-16.csv', 'bms_01241228122-2023-04-trip-17.csv', 'bms_01241228122-2023-04-trip