# This snippet reads the `crypto_data` folder and combines the checkpoint files plus the final 2-hour dataset into a single dataset

In [1]:
import os
import pandas as pd


In [5]:
folder_path = 'crypto_data'


def _checkpoint_order(filename):
    """Sort key: checkpoint files in numeric order, every other file afterwards.

    The checkpoint number is the run of ASCII digits that immediately follows
    '_checkpoint_' in the file name. Names without that marker (or without a
    number after it) sort last. The file name itself is the tie-breaker so the
    order does not depend on the arbitrary order returned by os.listdir.
    """
    _, marker, tail = filename.partition('_checkpoint_')
    digits = ''
    for ch in tail:
        if ch not in '0123456789':
            break
        digits += ch
    if marker and digits:
        return (int(digits), filename)
    return (float('inf'), filename)


if not os.path.exists(folder_path):
    raise FileNotFoundError(f"The folder '{folder_path}' does not exist.")
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
if not csv_files:
    raise ValueError("No CSV files found in the specified folder.")
csv_files_sorted = sorted(csv_files, key=_checkpoint_order)

# Read every file first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all previously loaded rows each time.
loaded_frames = []
skipped_files = []
for file in csv_files_sorted:
    file_path = os.path.join(folder_path, file)
    try:
        loaded_frames.append(pd.read_csv(file_path))
    except Exception as e:
        # Best-effort load: report the failure and continue with the other files.
        print(f"Error reading file {file}: {e}")
        skipped_files.append(file)

# pd.concat raises on an empty list, so fall back to an empty frame when
# every file failed to load.
combined_df = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()
if skipped_files:
    # Make an incomplete dataset obvious instead of leaving it buried in the log.
    print(f"Warning: {len(skipped_files)} of {len(csv_files_sorted)} files were skipped: {skipped_files}")
print("Combined DataFrame:")
print(combined_df.head(10))

Combined DataFrame:
                 timestamp  bid_price  ask_price  trade_price       volume  \
0  2025-04-15 18:22:57.214   85675.98   85675.99     85675.98  4717.319460   
1  2025-04-15 18:22:57.225   85675.98   85675.99     85675.98  4717.319460   
2  2025-04-15 18:22:57.229   85675.98   85675.99     85675.98  4717.319460   
3  2025-04-15 18:22:57.231   85675.98   85675.99     85675.98  4717.319460   
4  2025-04-15 18:22:57.232   85675.98   85675.99     85675.98  4717.319460   
5  2025-04-15 18:22:57.237   85675.98   85675.99     85675.98  4717.319460   
6  2025-04-15 18:22:57.250   85675.98   85675.99     85675.99  4717.319460   
7  2025-04-15 18:22:57.314   85675.98   85675.99     85675.99     0.000000   
8  2025-04-15 18:22:57.414   85675.98   85851.77     85675.98    37.697433   
9  2025-04-15 18:22:57.444   85675.98   85851.77     85675.98    37.697433   

   mid_price  
0  85675.985  
1  85675.985  
2  85675.985  
3  85675.985  
4  85675.985  
5  85675.985  
6  85675.985  
7

# Resample to 1-second bars for a quick check of the trend over a given duration (i.e., 2 hours)


In [None]:
# Build a timestamp-indexed view for resampling WITHOUT overwriting combined_df.
# Reassigning combined_df here would make this cell fail on a second run
# ('timestamp' would no longer be a column) and would make a later
# combined_df.to_csv(..., index=False) silently drop the timestamps.
ticks_by_time = combined_df.assign(
    timestamp=pd.to_datetime(combined_df['timestamp'])
).set_index('timestamp')

# Collapse the ticks into one row per second.
# NOTE(review): the tick preview shows the same 'volume' value repeated on
# consecutive rows; if that column is not a per-tick quantity, 'sum' may
# overstate the per-second volume. Confirm against the data collector.
resampled_df = ticks_by_time.resample('1s').agg({
    'bid_price': 'last',      # Last bid price in the second
    'ask_price': 'last',      # Last ask price in the second
    'trade_price': 'last',    # Last trade price in the second
    'volume': 'sum',          # Sum of volume within the second
    'mid_price': 'last'       # Last mid price in the second
})

# Move the per-second timestamps back into a regular column.
resampled_df = resampled_df.reset_index()

# Show the result
print(resampled_df.head())

# Check the reduction in rows
print(f"Original rows: {len(combined_df)}")
print(f"Resampled rows: {len(resampled_df)}")



            timestamp  bid_price  ask_price  trade_price        volume  \
0 2025-04-15 18:22:57   85675.98   85675.99     85675.99  33134.328517   
1 2025-04-15 18:22:58   85675.98   85672.76     85672.75  15856.344824   
2 2025-04-15 18:22:59   85672.75   85672.76     85672.76  30159.381701   
3 2025-04-15 18:23:00   85590.49   85672.76     85672.76  29541.681057   
4 2025-04-15 18:23:01   85670.00   85672.76     85675.23  44950.784471   

   mid_price  
0  85675.985  
1  85674.370  
2  85672.755  
3  85631.625  
4  85671.380  
Original rows: 120677
Resampled rows: 7200


In [9]:
# Write the 1-second resampled data to disk (row index omitted from the file).
resampled_csv_path = 'HFT_2_hr_combined_crypto_data_1s.csv'
resampled_df.to_csv(resampled_csv_path, index=False)

In [8]:
# Save the combined tick-level DataFrame to a new CSV file.
# The resampling cell may have moved 'timestamp' into the index; writing with
# index=False would then drop the timestamps from the file, so restore the
# column first when that is the case.
if combined_df.index.name == 'timestamp':
    combined_to_save = combined_df.reset_index()
else:
    combined_to_save = combined_df
combined_to_save.to_csv('HFT_2_hr_combined_crypto_data.csv', index=False)