In [1]:
import pandas as pd
import os

# Directory holding the raw price CSV files
directory_path = './raw_data'

# List all files in the directory
all_files = os.listdir(directory_path)

# Keep only the .csv files. os.listdir returns entries in arbitrary order, so
# sort them to make "the first CSV" the same file on every run and machine.
csv_files = sorted(file for file in all_files if file.endswith('.csv'))
if not csv_files:
    # Fail with an actionable message instead of a bare IndexError below.
    raise FileNotFoundError(f"No .csv files found in {directory_path!r}")

# Read the first CSV file and parse its 'datetime' column into real timestamps
# (later cells rely on the .dt accessor of this column).
df = pd.read_csv(os.path.join(directory_path, csv_files[0]))
df['datetime'] = pd.to_datetime(df['datetime'])
df.head()


Unnamed: 0,index,close,open,high,low,volume,datetime,time
0,57,143.53,143.53,143.53,143.5,275300.0,2000-01-04 09:35:00,0.128205
1,58,143.25,143.56,143.78,143.22,124000.0,2000-01-04 09:40:00,0.25641
2,59,143.56,143.31,143.66,143.22,89200.0,2000-01-04 09:45:00,0.384615
3,60,143.94,143.5,144.06,143.5,67600.0,2000-01-04 09:50:00,0.512821
4,61,143.97,144.0,144.06,143.69,145100.0,2000-01-04 09:55:00,0.641026


In [2]:
from utils.augment_price_data import augment_price_data

In [3]:
#!pip install pandas_market_calendars


In [4]:
# Get market opening days
import pandas as pd 
from pandas_market_calendars import get_calendar
# NYSE exchange calendar, looked up by its "XNYS" identifier
nyse = get_calendar("XNYS")
# Sessions the calendar reports as valid inside the window, then reduced to
# their `.date` values so they can be compared against per-row dates later.
valid_sessions = nyse.valid_days(start_date='2000-01-04', end_date='2023-12-22')
trading_days = valid_sessions.date
len(trading_days)

6032

In [6]:
from tqdm import tqdm

# Generate a synthesized data set from the original price data
def obtain_syn_data(original_df:pd.DataFrame, lam:float) -> pd.DataFrame:
    """Build a synthesized copy of ``original_df``, one trading day at a time.

    For every date in the notebook-level ``trading_days`` the rows of
    ``original_df`` whose ``datetime`` falls on that date are passed to
    ``augment_price_data``; the per-day results are concatenated in order.
    The ``datetime`` and ``volume`` columns of the result are then overwritten
    with the values of the original rows that were fed to the augmenter.

    Args:
        original_df: Price data with a datetime-typed ``datetime`` column and
            a ``volume`` column.
        lam: Forwarded unchanged to ``augment_price_data`` as ``lam``
            (presumably the augmentation strength — confirm in
            ``utils.augment_price_data``).

    Returns:
        The concatenated synthesized frame with a fresh 0..n-1 index.

    Note:
        Prints the row counts of the augmented and original frames so a
        mismatch (e.g. rows of ``original_df`` outside ``trading_days``) is
        visible.
    """
    # `.dt.date` builds one Python date object per row, so compute it once
    # here instead of once per trading day inside the loop.
    row_dates = original_df['datetime'].dt.date

    synth_frames = []
    source_frames = []
    for timestamp in tqdm(trading_days):
        original_df_day = original_df[row_dates == timestamp]
        synth_frames.append(augment_price_data(original_df_day, lam=lam))
        # Remember exactly which original rows produced this day's output.
        source_frames.append(original_df_day[['datetime', 'volume']])

    # Concatenate the per-day frames and renumber the rows
    df_synth = pd.concat(synth_frames, axis=0).reset_index(drop=True)

    # Copy datetime/volume from the rows that were actually augmented, in the
    # same order. Taking them from the full `original_df` by index label would
    # shift the values whenever it contains rows outside `trading_days` or
    # does not carry a default 0..n-1 index.
    df_source = pd.concat(source_frames, axis=0).reset_index(drop=True)
    df_synth[['datetime', 'volume']] = df_source

    print(f"Augmented: {len(df_synth)}")
    print(f"Original: {len(original_df)}")

    return df_synth

#spy_df_synth = obtain_syn_data(df, lam=0.5)
#spy_df_synth

In [7]:
# Number of augmented data sets to generate and save
num_of_aug_sets = 2

# Generate each augmented set and write it to its own aug_<i>.csv file.
# NOTE(review): this assumes augment_price_data yields a different result on
# each call; otherwise every file will be identical — confirm.
for i in tqdm(range(num_of_aug_sets)):
    spy_df_synth = obtain_syn_data(df, lam=0.5)
    # Save the synthesized frame (the original `df` was being written before,
    # which made every output file a copy of the input data).
    spy_df_synth.to_csv(f'aug_{i}.csv', index=False)
    

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/6032 [00:00<?, ?it/s][A
  0%|          | 1/6032 [00:00<44:13,  2.27it/s][A
  0%|          | 2/6032 [00:00<43:51,  2.29it/s][A
  0%|          | 3/6032 [00:01<43:06,  2.33it/s][A
  0%|          | 4/6032 [00:01<42:35,  2.36it/s][A
  0%|          | 5/6032 [00:02<42:07,  2.38it/s][A
  0%|          | 6/6032 [00:02<39:41,  2.53it/s][A
  0%|          | 7/6032 [00:02<33:53,  2.96it/s][A
  0%|          | 8/6032 [00:02<30:11,  3.33it/s][A
  0%|          | 9/6032 [00:03<27:07,  3.70it/s][A
  0%|          | 10/6032 [00:03<25:03,  4.01it/s][A
  0%|          | 11/6032 [00:03<23:36,  4.25it/s][A
  0%|          | 12/6032 [00:03<22:37,  4.43it/s][A
  0%|          | 13/6032 [00:03<21:59,  4.56it/s][A
  0%|          | 14/6032 [00:04<21:35,  4.64it/s][A
  0%|          | 15/6032 [00:04<21:19,  4.70it/s][A
  0%|          | 16/6032 [00:04<21:05,  4.76it/s][A
  0%|          | 17/6032 [00:04<20:53,  4.80it/s][A
  0%|          | 18/6032 [

KeyboardInterrupt: 