In [None]:
import zipfile
from google.colab import drive

# Attach Google Drive to the Colab runtime; the CSV paths used further down
# live under this mount point.
mount_point = '/content/drive/'
drive.mount(mount_point)



Mounted at /content/drive/


In [None]:
import csv
import os

# Specify the file path
csv_file = '/content/drive/MyDrive/0/xnas-itch-20220801.trades.csv'

# Number of data rows to show in the preview
preview_rows = 5

# Open and read the CSV file. newline='' is what the csv module expects so
# that quoted fields containing line breaks are parsed correctly.
with open(csv_file, mode='r', newline='') as file:
    reader = csv.reader(file)

    # Get the headers (field names); an empty file yields no fields instead
    # of raising StopIteration.
    headers = next(reader, [])
    print("Fields in the CSV file:")
    for field in headers:
        print(field)

    # Print the first few rows for debugging. The name shown is derived from
    # csv_file itself, so this cell no longer depends on variables
    # (filename, fields_to_keep, writer) left over from other cells.
    print(f"Preview of {os.path.basename(csv_file)}:")
    for i, row in enumerate(reader):
        if i >= preview_rows:
            break  # no need to scan the rest of the file for a preview
        print(row)


Fields in the CSV file:
ts_recv
ts_event
rtype
publisher_id
instrument_id
action
side
depth
price
size
flags
ts_in_delta
sequence
symbol
Preview of xnas-itch-20220810.trades.csv:
['2022-08-01T08:00:13.027169890Z', '2022-08-01T08:00:13.027158303Z', '0', '2', '4837', 'T', 'B', '0', '116.430000000', '48', '130', '11587', '343931', 'GOOGL']


ValueError: I/O operation on closed file.

In [None]:
import csv
import os
from collections import defaultdict

# Specify the folder containing the CSV files
input_folder = "/content/drive/MyDrive/0"  # Replace with your actual path
output_file = "combined_output_transformed.csv"

# Validate input folder. Raise instead of calling exit(): inside a notebook
# exit() acts on the kernel rather than aborting this cell, and a regular
# file at this path would make os.listdir() fail below anyway.
if not os.path.isdir(input_folder):
    raise FileNotFoundError(f"Error: Input folder '{input_folder}' does not exist.")

# List of fields every row must provide to be aggregated
fields_to_keep = ["size", "price", "ts_event", "side", "symbol"]

# Price bands written per group: each band is centred on the group's mean
# price (or the mean shifted by +/- range_offset) and extends
# range_half_width on either side of its centre.
range_half_width = 0.5
range_offset = 1.0

# Aggregate per (date, side, symbol). Each individual trade is kept as a
# (price, size) pair so a band's size can be summed over exactly the trades
# whose price falls inside that band.
aggregated_data = defaultdict(lambda: {
    "price_sum": 0.0,
    "price_count": 0,
    "trades": []
})
skipped_rows = 0

# Process each CSV file in the folder. os.listdir() returns names in
# arbitrary order; sorting makes the output order reproducible.
for filename in sorted(os.listdir(input_folder)):
    file_path = os.path.join(input_folder, filename)

    # Skip non-CSV files
    if not filename.endswith(".csv") or not os.path.isfile(file_path):
        continue

    with open(file_path, mode='r', newline='') as input_csv:
        reader = csv.DictReader(input_csv)

        for row in reader:
            # DictReader fills short rows with None, and a blank cell is "";
            # neither can be parsed, so skip (and count) such rows.
            if any(not row.get(field) for field in fields_to_keep):
                skipped_rows += 1
                continue

            # Keep only the date part of an ISO timestamp ("2022-08-01T...");
            # a value without 'T' is used unchanged.
            ts_event = row["ts_event"].split('T', 1)[0]
            key = (ts_event, row["side"], row["symbol"])

            price = float(row["price"])
            size = float(row["size"])

            group = aggregated_data[key]
            group["price_sum"] += price
            group["price_count"] += 1
            group["trades"].append((price, size))

if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) with missing or empty required fields.")

# Write the transformed data to the output file
with open(output_file, mode='w', newline='') as output_csv:
    writer = csv.DictWriter(output_csv, fieldnames=["ts_event", "side", "size", "price", "symbol", "range"])
    writer.writeheader()

    for (ts_event, side, symbol), data in aggregated_data.items():
        # A group only exists once a trade was added, so price_count >= 1.
        mean_price = data["price_sum"] / data["price_count"]

        for range_label, center in [
            ("mean_range", mean_price),
            ("below_mean_range", mean_price - range_offset),
            ("above_mean_range", mean_price + range_offset),
        ]:
            in_range = [
                (price, size)
                for price, size in data["trades"]
                if abs(price - center) <= range_half_width
            ]
            if not in_range:
                continue

            writer.writerow({
                "ts_event": ts_event,
                "side": side,
                # Total size of the trades inside this band only.
                "size": sum(size for _, size in in_range),
                # Unweighted mean of the prices inside this band.
                "price": sum(price for price, _ in in_range) / len(in_range),
                "symbol": symbol,
                "range": range_label
            })

print(f"Transformed CSV file saved as: {output_file}")

Transformed CSV file saved as: combined_output_transformed.csv
