In [2]:
import os
import pandas as pd
import dask.bag as db
from dask.distributed import Client

# Directory that main() lists (non-recursively, via os.listdir) for files
# whose names end in ".csv". Written as a raw string so the Windows
# backslashes are taken literally rather than as escape sequences.
directory = r'D:\SamsungSTF\Processed_Data\TripByTrip'

# Function to check if a file contains an acceleration magnitude at or above
# a threshold (default 9)
def check_file(filepath, threshold=9):
    """Return the file's base name if it holds a high-magnitude acceleration.

    Args:
        filepath: Path of the CSV file to inspect.
        threshold: Smallest absolute value of the ``acceleration`` column
            that flags the file. Defaults to 9, the cut-off this check has
            always applied. NOTE(review): the earlier comment mentioned 9.8;
            pass ``threshold=9.8`` if that is the intended limit.

    Returns:
        ``os.path.basename(filepath)`` when at least one row satisfies
        ``abs(acceleration) >= threshold``. ``None`` when no row does, when
        the file has no ``acceleration`` column, or when reading/comparing
        the file raises (the error is printed and swallowed so one bad file
        does not abort a batch run).
    """
    try:
        # Only the acceleration column is examined, so skip parsing the
        # others. A callable ``usecols`` simply selects nothing when the
        # column is absent instead of raising like a list of names would.
        df = pd.read_csv(
            filepath,
            usecols=lambda column: column == 'acceleration',
        )
        if 'acceleration' in df.columns:
            # NaN compares False, so missing readings never flag a file.
            if (df['acceleration'].abs() >= threshold).any():
                return os.path.basename(filepath)
    except Exception as e:
        # Deliberately best-effort: report the problem and treat the file
        # as "not flagged".
        print(f"Error processing file {filepath}: {e}")
    return None

def main():
    """Scan ``directory`` for CSV files flagged by ``check_file``.

    Lists every entry of ``directory`` whose name ends in ".csv", runs
    ``check_file`` on each path through a Dask bag, and collects the
    non-empty results.

    Returns:
        A list of base names of the files for which ``check_file`` returned
        a value; an empty list when the directory has no ".csv" files.

    Raises:
        OSError: If ``directory`` cannot be listed.
    """
    # Get list of all CSV files in the directory. Done before any workers
    # are started so a missing directory fails without leaving a cluster up.
    csv_files = [
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith(".csv")
    ]

    # Nothing to scan: skip spinning up a cluster altogether.
    if not csv_files:
        return []

    # Start a Dask client with a specified dashboard port (any available
    # port works). The context manager closes the client once the results
    # are in, instead of leaving it running after main() returns.
    with Client(dashboard_address=':7868'):
        # Create a Dask bag of the file paths
        file_bag = db.from_sequence(csv_files, npartitions=10)

        # Map the check_file function to the file paths
        results = file_bag.map(check_file).compute()

    # Filter out None results
    return [result for result in results if result]

# Run the scan only when this file is executed directly (not when imported)
# and print the list that main() returns. The list is kept in the
# module-level name result_files.
if __name__ == "__main__":
    result_files = main()
    print(result_files)


['bms_01241228051-2023-05-trip-73.csv', 'bms_01241228087-2023-07-trip-31.csv', 'bms_01241228094-2023-01-trip-49.csv', 'bms_01241228094-2023-06-trip-31.csv', 'bms_01241228094-2023-06-trip-76.csv', 'bms_01241228094-2023-10-trip-19.csv', 'bms_01241228094-2023-10-trip-22.csv', 'bms_01241228094-2023-11-trip-100.csv', 'bms_01241228107-2023-01-trip-65.csv', 'bms_01241228122-2023-02-trip-10.csv', 'bms_01241228122-2023-02-trip-3.csv', 'bms_01241228122-2023-02-trip-30.csv', 'bms_01241228122-2023-02-trip-31.csv', 'bms_01241228122-2023-02-trip-36.csv', 'bms_01241228122-2023-03-trip-39.csv', 'bms_01241228122-2023-03-trip-40.csv', 'bms_01241228122-2023-03-trip-55.csv', 'bms_01241228122-2023-04-trip-1.csv', 'bms_01241228122-2023-04-trip-16.csv', 'bms_01241228122-2023-04-trip-17.csv', 'bms_01241228122-2023-04-trip-38.csv', 'bms_01241228122-2023-04-trip-39.csv', 'bms_01241228122-2023-04-trip-74.csv', 'bms_01241228122-2023-04-trip-75.csv', 'bms_01241228122-2023-05-trip-43.csv', 'bms_01241228122-2023-05-