In [2]:
import os
import glob
import pandas as pd
import numpy as np

# Path to the main directory containing CSV files
main_directory = '../data'

# Recursively find all CSV files. glob returns matches in arbitrary order, so
# the list is sorted to keep the processing order stable between runs.
csv_files = sorted(glob.glob(os.path.join(main_directory, '**/*.csv'), recursive=True))

# Total final sample size
total_sample_size = 100000

# Warn early when nothing matched; otherwise the only symptom would be an
# unrelated-looking arithmetic error on the next statement.
if not csv_files:
    print(f"No CSV files found under {main_directory}")

# Calculate sample size per file. The divisor is clamped to 1 so that an empty
# file list does not raise ZeroDivisionError; the outer max() keeps at least
# one row per file when there are more files than requested rows.
sample_size_per_file = max(1, total_sample_size // max(1, len(csv_files)))

# Function to get random sample from a file
def get_random_sample_from_file(file, sample_size, chunksize=10000, random_state=None):
    """Return a uniform random sample of rows from a CSV file.

    The file is scanned once to count its lines, then exactly
    ``min(row_count, sample_size)`` row positions are drawn without
    replacement and only those rows are parsed. The first line is treated
    as the header.

    NOTE(review): the row count is a count of physical lines, so this
    assumes one record per line (no quoted fields containing newlines and
    no blank lines) -- confirm for the data set in use.

    Args:
        file: Path of the CSV file to sample from.
        sample_size: Maximum number of rows to return.
        chunksize: Number of rows pandas parses per chunk while reading.
        random_state: Optional seed for a private ``np.random.RandomState``.
            When ``None`` the global ``np.random`` state is used, so
            ``np.random.seed`` still controls the draw.

    Returns:
        A DataFrame with at most ``sample_size`` rows, in file order, with
        a fresh 0-based index. A header-only file, or a non-positive
        ``sample_size``, yields an empty DataFrame that keeps the columns.
    """
    # Count in binary mode: no text decoding is needed just to count lines,
    # and the handle is closed as soon as the count is done.
    with open(file, 'rb') as handle:
        n = sum(1 for _ in handle) - 1  # Excluding header

    if n <= 0 or sample_size <= 0:
        return pd.read_csv(file, nrows=0)

    wanted = min(n, sample_size)

    if wanted == n:
        # The file is no larger than the request: keep every row.
        skip_row = None
    else:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # Data rows are numbered 1..n because row 0 is the header.
        keep = set((rng.choice(n, size=wanted, replace=False) + 1).tolist())

        def skip_row(row_number):
            # Never skip the header; skip every data row that was not drawn.
            return row_number > 0 and row_number not in keep

    reader = pd.read_csv(file, chunksize=chunksize, skiprows=skip_row)
    try:
        chunks = list(reader)
    finally:
        reader.close()

    if not chunks:
        return pd.read_csv(file, nrows=0)

    return pd.concat(chunks, ignore_index=True)

# List to store sampled DataFrames
sampled_dataframes = []

print("Beginning sampling process")

# Sample from each file. A file that cannot be read is reported and skipped so
# that one bad file does not abort the whole run.
for file in csv_files:
    try:
        sample = get_random_sample_from_file(file, sample_size_per_file)
    except Exception as e:
        print(f"Error processing file {file}: {e}")
    else:
        sampled_dataframes.append(sample)

# Output file for the sample
output_file = 'combined_sampled_data.csv'

if sampled_dataframes:
    # Combine the samples
    combined_sample = pd.concat(sampled_dataframes, ignore_index=True)

    # Final sampling if the combined sample is larger than the desired total
    # sample size. No random_state is passed, so the global np.random state
    # is used and np.random.seed() can make the run reproducible.
    if len(combined_sample) > total_sample_size:
        combined_sample = combined_sample.sample(n=total_sample_size)

    # Write the sample to a new CSV file
    combined_sample.to_csv(output_file, index=False)

    print(f"Sampled data written to {output_file}")
else:
    # pd.concat raises ValueError on an empty list, which happens when no
    # files were found or every file failed; report it instead of crashing.
    combined_sample = pd.DataFrame()
    print("No data was sampled; nothing written")


Beginning sampling process
Error processing file ../data\20221108-20221205\20221108-20221205\20221111.csv: Error tokenizing data. C error: no error message set
Error processing file ../data\20230904-20231002\20230904-20231002\20230921.csv: Error tokenizing data. C error: no error message set
