In [None]:
# Import necessary libraries
import os
import random
import numpy as np
import pandas as pd

In [None]:
def _transform_quotes(raw, volatility_window):
    """
    Build the cleaned frame for one ticker from its concatenated raw rows.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        raw (pd.DataFrame): Raw rows containing an 'index' timestamp column and
            the columns 'X.Open', 'X.High', 'X.Low', 'X.Close'.
        volatility_window (int): Rolling window size for the volatility column.

    Returns:
        pd.DataFrame: 'date' and 'time_utc' first, then the remaining (renamed)
        input columns in their original order, then mid_price, order_density,
        spread, log_return and volatility.

    Raises:
        KeyError: If 'index' or one of the price/quantity columns is missing.
    """
    # errors='coerce' turns unparseable timestamps into NaT instead of raising.
    timestamps = pd.to_datetime(raw['index'], utc=True, errors='coerce')

    # NOTE(review): the X.* names look like OHLC bar fields; the mapping to
    # bid/ask price and quantity is kept exactly as the original defined it —
    # confirm that this is the intended interpretation of the data.
    quotes = raw.drop(columns=['index']).rename(
        columns={
            'X.Open': 'bid_price',
            'X.High': 'bid_quantity',
            'X.Low': 'ask_price',
            'X.Close': 'ask_quantity',
        }
    )

    quotes['date'] = timestamps.dt.date
    # Strip trailing zeros of the fractional seconds and then a dangling '.',
    # so whole seconds render as HH:MM:SS.
    quotes['time_utc'] = (
        timestamps.dt.strftime('%H:%M:%S.%f').str.rstrip('0').str.rstrip('.')
    )

    # Move the two timestamp columns to the front, keep everything else in order.
    leading = ['date', 'time_utc']
    quotes = quotes[leading + [col for col in quotes.columns if col not in leading]]

    mid_price = (quotes['bid_price'] + quotes['ask_price']) / 2
    # Zero or negative mid prices cannot be log-transformed; blank them out.
    mid_price = mid_price.mask(mid_price <= 0)

    # Returns are taken between consecutive rows of the whole frame, so the
    # first row of each file is compared with the last row of the previous one.
    log_return = np.log(mid_price / mid_price.shift(1)).replace(
        [np.inf, -np.inf], np.nan
    )

    volatility = log_return.rolling(window=volatility_window, min_periods=1).std()

    return quotes.assign(
        mid_price=mid_price,
        order_density=quotes['bid_quantity'] + quotes['ask_quantity'],
        spread=quotes['ask_price'] - quotes['bid_price'],
        log_return=log_return,
        volatility=volatility,
    )


def create_and_transform_csv(input_folder, output_folder, volatility_window=10, year=2010):
    """
    Create and transform CSVs for S&P500 companies from Parquet files, including:
    - Renaming columns: X.Open -> bid_price, X.High -> bid_quantity, X.Low -> ask_price, X.Close -> ask_quantity.
    - Splitting 'index' into 'date' and 'time_utc', then removing 'index'.
    - Adding calculated columns: mid_price, order_density, spread, log_return, volatility.

    Every sub-directory of ``input_folder`` is treated as a ticker. For each
    ticker, the files whose name starts with ``str(year)`` and ends with
    '.parquet' are read in sorted name order, concatenated, transformed and
    written to ``<output_folder>/<ticker>_<year>_cleaned.csv``. Tickers without
    matching files, or whose matching files contain no rows, are reported and
    skipped (no CSV is written for them).

    Args:
        input_folder (str): Path to the folder containing company folders with Parquet files.
        output_folder (str): Path to save the generated and transformed CSV files.
        volatility_window (int): Rolling window size for volatility calculation.
        year (int | str): Filename prefix selecting which files to process.
            Defaults to 2010, which reproduces the previous hard-coded behavior.

    Returns:
        None: Creates and saves transformed CSV files in the output folder.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    year_prefix = str(year)

    # Sorted so that tickers are processed in a deterministic order.
    tickers = sorted(
        entry for entry in os.listdir(input_folder)
        if os.path.isdir(os.path.join(input_folder, entry))
    )

    for ticker in tickers:
        ticker_folder = os.path.join(input_folder, ticker)
        output_csv = os.path.join(output_folder, f"{ticker}_{year_prefix}_cleaned.csv")

        print(f"Processing ticker: {ticker}")

        # Sorting the names keeps the files in date order when the names start
        # with a sortable date.
        parquet_files = sorted(
            os.path.join(ticker_folder, name) for name in os.listdir(ticker_folder)
            if name.endswith('.parquet') and name.startswith(year_prefix)
        )

        if not parquet_files:
            print(f"No Parquet files for {year_prefix} found for ticker: {ticker}")
            continue

        raw = pd.concat(
            [pd.read_parquet(path) for path in parquet_files],
            ignore_index=False,
        )

        # The files exist but hold no rows: nothing worth writing.
        if raw.empty:
            print(f"No data for {year_prefix} for ticker: {ticker}")
            continue

        cleaned = _transform_quotes(raw, volatility_window)

        # Save the transformed DataFrame to CSV
        cleaned.to_csv(output_csv, index=False)
        print(f"Transformed CSV created for {ticker}: {output_csv}")

In [None]:
if __name__ == "__main__":
    # Source tree: one subfolder per ticker, each holding Parquet files.
    input_folder = "S&P500"
    # Destination for the per-ticker cleaned CSV files.
    output_folder = "S&P500_cleaned"

    # Run the conversion for every ticker folder found under input_folder.
    create_and_transform_csv(
        input_folder=input_folder,
        output_folder=output_folder,
    )

In [58]:
def display_random_cleaned_csv(folder_path):
    """
    Pick one CSV file at random from a folder and render its contents.

    Only names ending in '.csv' directly inside ``folder_path`` are considered.
    When there are none, a message is printed and nothing is displayed.

    Args:
        folder_path (str): Path to the folder containing cleaned CSV files.

    Returns:
        None: The chosen file is loaded with pandas and shown via ``display``.
    """
    # Collect the candidate file names, in the order the OS lists them.
    candidates = []
    for entry in os.listdir(folder_path):
        if entry.endswith('.csv'):
            candidates.append(entry)

    # Nothing to show: report it and stop.
    if len(candidates) == 0:
        print("No CSV files found in the specified folder.")
        return

    chosen = random.choice(candidates)

    print(f"Displaying CSV for {chosen}:")
    frame = pd.read_csv(os.path.join(folder_path, chosen))
    # `display` is the notebook's rich-output function.
    display(frame)

# Folder that holds the cleaned per-ticker CSV files
cleaned_folder = "S&P500_cleaned"

# Preview one of those files, chosen at random
display_random_cleaned_csv(folder_path=cleaned_folder)

Displaying CSV for AMT_2010_cleaned.csv:


Unnamed: 0,date,time_utc,bid_price,bid_quantity,ask_price,ask_quantity,mid_price,order_density,spread,log_return,volatility
0,2010-01-04,14:31:00,43.27,7.0,43.50,4.0,43.385,11.0,0.23,,
1,2010-01-04,14:32:00,43.51,28.0,43.40,8.0,43.455,36.0,-0.11,0.001612,
2,2010-01-04,14:33:00,43.45,27.0,43.46,3.0,43.455,30.0,0.01,0.000000,0.001140
3,2010-01-04,14:34:00,43.48,21.0,43.47,3.0,43.475,24.0,-0.01,0.000460,0.000830
4,2010-01-04,14:35:00,43.50,16.0,43.52,4.0,43.510,20.0,0.02,0.000805,0.000680
...,...,...,...,...,...,...,...,...,...,...,...
97467,2010-12-31,20:56:00,51.63,28.0,51.60,50.0,51.615,78.0,-0.03,-0.000484,0.000434
97468,2010-12-31,20:57:00,51.59,30.0,51.60,25.0,51.595,55.0,0.01,-0.000388,0.000448
97469,2010-12-31,20:58:00,51.60,34.0,51.61,36.0,51.605,70.0,0.01,0.000194,0.000431
97470,2010-12-31,20:59:00,51.62,36.0,51.63,39.0,51.625,75.0,0.01,0.000387,0.000429
