## Configs

In [1]:
pip install scikit-learn pandas yfinance ta tqdm tables

Note: you may need to restart the kernel to use updated packages.




In [2]:
import warnings
from sklearn.model_selection import train_test_split
import pandas as pd
import yfinance as yf
import os, contextlib
import pandas as pd
from ta import add_all_ta_features
from tqdm import tqdm
import shutil
from os.path import isfile, join

warnings.filterwarnings('ignore')
offset = 0
limit = None
period = 'max' # valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max

## Download all NASDAQ traded symbols

In [3]:
data = pd.read_csv("http://www.nasdaqtrader.com/dynamic/SymDir/nasdaqtraded.txt", sep='|')
data_clean = data[data['Test Issue'] == 'N']
symbols = data_clean['NASDAQ Symbol'].tolist()
print('total number of symbols traded = {}'.format(len(symbols)))

total number of symbols traded = 11267


## Download Historic data and add_all_ta

In [4]:
! mkdir hist

A subdirectory or file hist already exists.


In [5]:
! mkdir crypto

A subdirectory or file crypto already exists.


### Stocks

In [6]:
def optimize_dtypes(df):
    # This function will optimize the data types in a pandas DataFrame
    for col in df.columns:
        col_data = df[col]
        dn = col_data.dtype.name
        if dn == "object":
            df[col] = pd.to_numeric(col_data.astype("category").cat.codes, downcast="integer")
        elif dn == "bool":
            df[col] = col_data.astype("int8")
        elif dn.startswith("int") or (col_data.round() == col_data).all():
            df[col] = pd.to_numeric(col_data, downcast="integer")
        else:
            df[col] = pd.to_numeric(col_data, downcast='float')
    return df

def add_ticker_to_columns(df, ticker):
    # This function will add the ticker name to the column names
    df.columns = [f"{ticker}_{col}" for col in df.columns]
    return df

limit = limit if limit else len(symbols)
end = min(offset + limit, len(symbols))
is_valid = [False] * len(symbols)
window = 10  # replace this with the window size you're using for ATR

def add_all_ta_features_with_exceptions_and_cleanup(data):
    try:
        # Add all technical indicators
        data_with_ta = add_all_ta_features(data, open="Open", high="High", low="Low", close="Close", volume="Volume")
        
        # Identify columns to remove
        columns_to_remove = []
        for col in data_with_ta.columns:
            # Check for NaN values between the first 30 and last 30 rows
            if data_with_ta.iloc[30:-30][col].isna().any():
                columns_to_remove.append(col)
                
        # Remove identified columns
        data_with_ta.drop(columns=columns_to_remove, inplace=True)
        
        return data_with_ta
    
    except Exception as e:
        print(f"Error adding technical indicators. Error: {e}")
        return data

with open(os.devnull, 'w') as devnull:
    with contextlib.redirect_stdout(devnull):
        for i in tqdm(range(offset, end)):
            s = symbols[i]
            if not isinstance(s, str):  # Skip non-string symbols
                print(f"Invalid symbol: {s}. Skipping.")
                continue
            data = yf.download(s, period=period)
            if len(data) < 61:  # Skip if less than 61 lines of data
                print(f"Insufficient data for symbol: {s}. Skipping.")
                continue

            data = optimize_dtypes(data)  # Optimize the data types
            data = add_all_ta_features_with_exceptions_and_cleanup(data)  # Add technical indicators
            data = add_ticker_to_columns(data, s)  # Add the ticker to column names

            is_valid[i] = True
            data.to_hdf('hist/{}.h5'.format(s), key='df', mode='w')  # Save as HDF5

print('Total number of valid symbols downloaded = {}'.format(sum(is_valid)))

  0%|                                                                              | 5/11267 [00:15<6:57:56,  2.23s/it]
1 Failed download:
['AAC=']: Exception('%ticker%: No timezone found, symbol may be delisted')
  0%|                                                                              | 6/11267 [00:16<4:58:36,  1.59s/it]
1 Failed download:
['AAC+']: Exception('%ticker%: No timezone found, symbol may be delisted')
  0%|                                                                             | 12/11267 [00:21<2:30:26,  1.25it/s]
1 Failed download:
['AACT=']: Exception('%ticker%: No timezone found, symbol may be delisted')
  0%|                                                                             | 13/11267 [00:22<2:58:24,  1.05it/s]
1 Failed download:
['AACT+']: Exception('%ticker%: No timezone found, symbol may be delisted')
  0%|                                                                             | 17/11267 [00:29<6:06:11,  1.95s/it]
1 Failed download:
['A

  3%|██                                                                          | 309/11267 [08:43<4:07:01,  1.35s/it]
1 Failed download:
['AHT-F']: Exception('%ticker%: No timezone found, symbol may be delisted')
  3%|██                                                                          | 310/11267 [08:43<3:09:17,  1.04s/it]
1 Failed download:
['AHT-G']: Exception('%ticker%: No timezone found, symbol may be delisted')
  3%|██                                                                          | 311/11267 [08:43<2:29:45,  1.22it/s]
1 Failed download:
['AHT-H']: Exception('%ticker%: No timezone found, symbol may be delisted')
  3%|██                                                                          | 312/11267 [08:44<2:00:19,  1.52it/s]
1 Failed download:
['AHT-I']: Exception('%ticker%: No timezone found, symbol may be delisted')
  3%|██▏                                                                         | 324/11267 [09:00<7:56:37,  2.61s/it]
1 Failed download:
[

  7%|█████▎                                                                      | 791/11267 [22:57<4:30:55,  1.55s/it]
1 Failed download:
['ATH-B']: Exception('%ticker%: No timezone found, symbol may be delisted')
  7%|█████▎                                                                      | 792/11267 [22:57<3:26:48,  1.18s/it]
1 Failed download:
['ATH-C']: Exception('%ticker%: No timezone found, symbol may be delisted')
  7%|█████▎                                                                      | 793/11267 [22:58<3:02:40,  1.05s/it]
1 Failed download:
['ATH-D']: Exception('%ticker%: No timezone found, symbol may be delisted')
  7%|█████▎                                                                      | 794/11267 [22:58<2:24:12,  1.21it/s]
1 Failed download:
['ATH-E']: Exception('%ticker%: No timezone found, symbol may be delisted')
  7%|█████▋                                                                      | 838/11267 [24:28<9:06:02,  3.14s/it]
1 Failed download:
[

 11%|████████▏                                                                  | 1238/11267 [36:21<2:48:34,  1.01s/it]
1 Failed download:
['BITE+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 11%|████████▍                                                                  | 1274/11267 [37:22<7:28:57,  2.70s/it]
1 Failed download:
['BKKT+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 11%|████████▌                                                                  | 1285/11267 [37:45<5:40:25,  2.05s/it]
1 Failed download:
['BKSY+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 12%|████████▊                                                                  | 1331/11267 [39:02<3:24:51,  1.24s/it]
1 Failed download:
['BLUA=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 12%|████████▊                                                                  | 1332/11267 [39:03<2:37:48,  1.05it/s]
1 Failed download:
[

 17%|████████████▍                                                              | 1860/11267 [54:36<6:28:23,  2.48s/it]
1 Failed download:
['CEQP-']: Exception('%ticker%: No timezone found, symbol may be delisted')
 17%|████████████▌                                                              | 1890/11267 [55:25<3:40:06,  1.41s/it]
1 Failed download:
['CFG-D']: Exception('%ticker%: No timezone found, symbol may be delisted')
 17%|████████████▌                                                              | 1891/11267 [55:26<3:04:13,  1.18s/it]
1 Failed download:
['CFG-E']: Exception('%ticker%: No timezone found, symbol may be delisted')
 17%|████████████▋                                                              | 1899/11267 [55:38<6:15:35,  2.41s/it]
1 Failed download:
['CFR-B']: Exception('%ticker%: No timezone found, symbol may be delisted')
 17%|█████████████                                                              | 1963/11267 [57:31<6:09:24,  2.38s/it]
1 Failed download:
[

 21%|███████████████▍                                                         | 2378/11267 [1:11:03<2:46:46,  1.13s/it]
1 Failed download:
['CSTA=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 21%|███████████████▍                                                         | 2379/11267 [1:11:05<2:55:45,  1.19s/it]
1 Failed download:
['CSTA+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 21%|███████████████▍                                                         | 2390/11267 [1:11:30<6:05:43,  2.47s/it]
1 Failed download:
['CTA-A']: Exception('%ticker%: No timezone found, symbol may be delisted')
 21%|███████████████▍                                                         | 2391/11267 [1:11:30<4:30:24,  1.83s/it]
1 Failed download:
['CTA-B']: Exception('%ticker%: No timezone found, symbol may be delisted')
 21%|███████████████▌                                                         | 2411/11267 [1:12:14<6:23:08,  2.60s/it]
1 Failed download:
[

 27%|████████████████████                                                     | 3097/11267 [1:33:01<2:43:02,  1.20s/it]
1 Failed download:
['EFC-C']: Exception('%ticker%: No timezone found, symbol may be delisted')
 28%|████████████████████▏                                                    | 3125/11267 [1:34:03<5:39:51,  2.50s/it]
1 Failed download:
['EGGF=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 28%|████████████████████▎                                                    | 3126/11267 [1:34:03<4:11:28,  1.85s/it]
1 Failed download:
['EGGF+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 28%|████████████████████▌                                                    | 3169/11267 [1:35:42<3:30:25,  1.56s/it]
1 Failed download:
['ELIQ+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 29%|█████████████████████▏                                                   | 3278/11267 [1:39:42<5:44:37,  2.59s/it]
1 Failed download:
[

 35%|█████████████████████████▋                                               | 3973/11267 [1:57:52<2:11:11,  1.08s/it]
1 Failed download:
['FREY+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 35%|█████████████████████████▉                                               | 3994/11267 [1:58:23<4:34:01,  2.26s/it]
1 Failed download:
['FRT-C']: Exception('%ticker%: No timezone found, symbol may be delisted')
 35%|█████████████████████████▉                                               | 3998/11267 [1:58:27<2:49:20,  1.40s/it]
1 Failed download:
['FRXB=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 35%|█████████████████████████▉                                               | 3999/11267 [1:58:28<2:08:55,  1.06s/it]
1 Failed download:
['FRXB+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 36%|██████████████████████████                                               | 4021/11267 [1:58:54<2:12:29,  1.10s/it]
1 Failed download:
[

 40%|█████████████████████████████▏                                           | 4506/11267 [2:10:29<3:24:40,  1.82s/it]
1 Failed download:
['GRBK-A']: Exception('%ticker%: No timezone found, symbol may be delisted')
 40%|█████████████████████████████▎                                           | 4524/11267 [2:10:55<2:20:42,  1.25s/it]
1 Failed download:
['GRND+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 40%|█████████████████████████████▎                                           | 4533/11267 [2:11:05<2:39:18,  1.42s/it]
1 Failed download:
['GROY+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 40%|█████████████████████████████▍                                           | 4534/11267 [2:11:06<2:04:23,  1.11s/it]
1 Failed download:
['GRP=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 40%|█████████████████████████████▍                                           | 4546/11267 [2:11:21<3:16:29,  1.75s/it]
1 Failed download:
[

 47%|██████████████████████████████████▍                                      | 5310/11267 [2:30:12<2:15:57,  1.37s/it]
1 Failed download:
['INN-E']: Exception('%ticker%: No timezone found, symbol may be delisted')
 47%|██████████████████████████████████▍                                      | 5311/11267 [2:30:12<1:43:47,  1.05s/it]
1 Failed download:
['INN-F']: Exception('%ticker%: No timezone found, symbol may be delisted')
 48%|██████████████████████████████████▋                                      | 5354/11267 [2:31:14<1:28:20,  1.12it/s]
1 Failed download:
['IONQ+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 48%|██████████████████████████████████▊                                      | 5380/11267 [2:32:02<1:45:39,  1.08s/it]
1 Failed download:
['IPVF=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 48%|██████████████████████████████████▊                                      | 5381/11267 [2:32:03<1:23:25,  1.18it/s]
1 Failed download:
[

 54%|███████████████████████████████████████                                  | 6037/11267 [2:47:44<1:51:08,  1.28s/it]
1 Failed download:
['LCW+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 54%|███████████████████████████████████████▎                                 | 6058/11267 [2:48:21<3:09:46,  2.19s/it]
1 Failed download:
['LEN.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 1924-09-22 -> 2023-08-30)')
 54%|███████████████████████████████████████▎                                 | 6064/11267 [2:48:30<2:14:53,  1.56s/it]
1 Failed download:
['LEV+A']: Exception('%ticker%: No timezone found, symbol may be delisted')
 54%|███████████████████████████████████████▎                                 | 6065/11267 [2:48:30<1:41:50,  1.17s/it]
1 Failed download:
['LEV+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 54%|███████████████████████████████████████▍                                 | 6081/11267 [2:48:47<1:42:52,  

 59%|███████████████████████████████████████████▎                             | 6681/11267 [3:04:01<2:10:32,  1.71s/it]
1 Failed download:
['MOG.A']: Exception('%ticker%: No timezone found, symbol may be delisted')
 59%|███████████████████████████████████████████▎                             | 6682/11267 [3:04:01<1:37:44,  1.28s/it]
1 Failed download:
['MOG.B']: Exception('%ticker%: No timezone found, symbol may be delisted')
 60%|███████████████████████████████████████████▌                             | 6714/11267 [3:04:46<2:01:40,  1.60s/it]
1 Failed download:
['MPLN+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 60%|███████████████████████████████████████████▋                             | 6737/11267 [3:05:21<1:51:00,  1.47s/it]
1 Failed download:
['MRDB+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 60%|███████████████████████████████████████████▋                             | 6751/11267 [3:05:44<1:49:59,  1.46s/it]
1 Failed download:
[

 63%|██████████████████████████████████████████████                           | 7118/11267 [3:15:43<1:51:51,  1.62s/it]
1 Failed download:
['NOTE+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 63%|██████████████████████████████████████████████▎                          | 7139/11267 [3:16:12<2:02:57,  1.79s/it]
1 Failed download:
['NPWR+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 63%|██████████████████████████████████████████████▎                          | 7149/11267 [3:16:25<1:17:03,  1.12s/it]
1 Failed download:
['NRDY+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 63%|██████████████████████████████████████████████▎                          | 7151/11267 [3:16:33<3:04:02,  2.68s/it]
1 Failed download:
['NREF-A']: Exception('%ticker%: No timezone found, symbol may be delisted')
 64%|██████████████████████████████████████████████▍                          | 7170/11267 [3:17:04<2:00:45,  1.77s/it]
1 Failed download:


 68%|█████████████████████████████████████████████████▉                       | 7706/11267 [3:38:22<1:15:23,  1.27s/it]
1 Failed download:
['PCG-E']: Exception('%ticker%: No timezone found, symbol may be delisted')
 68%|█████████████████████████████████████████████████▉                       | 7707/11267 [3:38:22<1:04:17,  1.08s/it]
1 Failed download:
['PCG-G']: Exception('%ticker%: No timezone found, symbol may be delisted')
 68%|███████████████████████████████████████████████████▎                       | 7708/11267 [3:38:23<50:51,  1.17it/s]
1 Failed download:
['PCG-H']: Exception('%ticker%: No timezone found, symbol may be delisted')
 68%|███████████████████████████████████████████████████▎                       | 7709/11267 [3:38:23<41:12,  1.44it/s]
1 Failed download:
['PCG-I']: Exception('%ticker%: No timezone found, symbol may be delisted')
 69%|██████████████████████████████████████████████████▏                      | 7748/11267 [3:40:18<3:19:37,  3.40s/it]
1 Failed download:
[

 72%|██████████████████████████████████████████████████████                     | 8118/11267 [3:55:02<39:36,  1.32it/s]
1 Failed download:
['PSA-J']: Exception('%ticker%: No timezone found, symbol may be delisted')
 72%|██████████████████████████████████████████████████████                     | 8119/11267 [3:55:03<32:42,  1.60it/s]
1 Failed download:
['PSA-K']: Exception('%ticker%: No timezone found, symbol may be delisted')
 72%|██████████████████████████████████████████████████████                     | 8120/11267 [3:55:03<28:19,  1.85it/s]
1 Failed download:
['PSA-L']: Exception('%ticker%: No timezone found, symbol may be delisted')
 72%|██████████████████████████████████████████████████████                     | 8121/11267 [3:55:04<31:24,  1.67it/s]
1 Failed download:
['PSA-M']: Exception('%ticker%: No timezone found, symbol may be delisted')
 72%|██████████████████████████████████████████████████████                     | 8122/11267 [3:55:05<42:36,  1.23it/s]
1 Failed download:
[

 77%|████████████████████████████████████████████████████████                 | 8650/11267 [4:14:12<2:24:15,  3.31s/it]
1 Failed download:
['RNR-G']: Exception('%ticker%: No timezone found, symbol may be delisted')
 77%|█████████████████████████████████████████████████████████▊                 | 8689/11267 [4:15:53<59:20,  1.38s/it]
1 Failed download:
['ROSS=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 77%|█████████████████████████████████████████████████████████▊                 | 8690/11267 [4:15:53<45:43,  1.06s/it]
1 Failed download:
['ROSS+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 77%|████████████████████████████████████████████████████████▍                | 8705/11267 [4:16:25<1:56:40,  2.73s/it]
1 Failed download:
['RPT-D']: Exception('%ticker%: No timezone found, symbol may be delisted')
 77%|████████████████████████████████████████████████████████▍                | 8710/11267 [4:16:37<1:59:17,  2.80s/it]
1 Failed download:
[

 82%|█████████████████████████████████████████████████████████████▍             | 9225/11267 [4:32:29<40:42,  1.20s/it]
1 Failed download:
['SLND+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 82%|████████████████████████████████████████████████████████████▏            | 9287/11267 [4:34:36<1:13:34,  2.23s/it]
1 Failed download:
['SMR+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 83%|████████████████████████████████████████████████████████████▍            | 9334/11267 [4:35:59<1:00:15,  1.87s/it]
1 Failed download:
['SNV-D']: Exception('%ticker%: No timezone found, symbol may be delisted')
 83%|██████████████████████████████████████████████████████████████▏            | 9335/11267 [4:35:59<45:12,  1.40s/it]
1 Failed download:
['SNV-E']: Exception('%ticker%: No timezone found, symbol may be delisted')
 83%|████████████████████████████████████████████████████████████▊            | 9393/11267 [4:37:46<1:10:16,  2.25s/it]
1 Failed download:
['

 90%|██████████████████████████████████████████████████████████████████▎       | 10088/11267 [4:56:48<39:06,  1.99s/it]
1 Failed download:
['TRTL=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 90%|██████████████████████████████████████████████████████████████████▎       | 10089/11267 [4:56:48<29:04,  1.48s/it]
1 Failed download:
['TRTL+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 90%|██████████████████████████████████████████████████████████████████▎       | 10091/11267 [4:56:51<27:11,  1.39s/it]
1 Failed download:
['TRTN-A']: Exception('%ticker%: No timezone found, symbol may be delisted')
 90%|██████████████████████████████████████████████████████████████████▎       | 10092/11267 [4:56:51<20:45,  1.06s/it]
1 Failed download:
['TRTN-B']: Exception('%ticker%: No timezone found, symbol may be delisted')
 90%|██████████████████████████████████████████████████████████████████▎       | 10093/11267 [4:56:51<16:11,  1.21it/s]
1 Failed download:

 96%|██████████████████████████████████████████████████████████████████████▉   | 10798/11267 [5:15:28<11:01,  1.41s/it]
1 Failed download:
['WBS-G']: Exception('%ticker%: No timezone found, symbol may be delisted')
 96%|██████████████████████████████████████████████████████████████████████▉   | 10800/11267 [5:15:29<07:55,  1.02s/it]
1 Failed download:
['WBX+']: Exception('%ticker%: No timezone found, symbol may be delisted')
 96%|██████████████████████████████████████████████████████████████████████▉   | 10803/11267 [5:15:32<09:49,  1.27s/it]
1 Failed download:
['WCC-A']: Exception('%ticker%: No timezone found, symbol may be delisted')
 96%|███████████████████████████████████████████████████████████████████████   | 10826/11267 [5:16:09<08:42,  1.18s/it]
1 Failed download:
['WEL=']: Exception('%ticker%: No timezone found, symbol may be delisted')
 96%|███████████████████████████████████████████████████████████████████████   | 10827/11267 [5:16:09<06:47,  1.08it/s]
1 Failed download:
['W

Total number of valid symbols downloaded = 9815





In [7]:
valid_data = data_clean[is_valid]
valid_data.to_csv('symbols_valid_meta.csv', index=False)

### Crypto 

In [9]:
# List of cryptocurrency symbols
symbols = ["BTC-USD", "ETH-USD", "LTC-USD", "XRP-USD", "BCH-USD", "ADA-USD", "DOT-USD", "LINK-USD", "BNB-USD", "XLM-USD"]

limit = limit if limit else len(symbols)
end = min(offset + limit, len(symbols))
is_valid = [False] * len(symbols)

with open(os.devnull, 'w') as devnull:
    with contextlib.redirect_stdout(devnull):
        for i in tqdm(range(offset, end)):
            s = symbols[i]
            data = yf.download(s, period=period)
            if len(data.index) == 0:
                continue
       
            data = optimize_dtypes(data)  
            data = add_all_ta_features_with_exceptions_and_cleanup(data)
            data = add_ticker_to_columns(data, s)

            is_valid[i] = True
            data.to_hdf('crypto/{}.h5'.format(s), key='df', mode='w') 

print('Total number of valid symbols downloaded = {}'.format(sum(is_valid)))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.13s/it]

Total number of valid symbols downloaded = 10





## Separating ETFs and Stocks

In [10]:
!mkdir stocks
!mkdir etfs

In [11]:
etfs = valid_data[valid_data['ETF'] == 'Y']['NASDAQ Symbol'].tolist()
stocks = valid_data[valid_data['ETF'] == 'N']['NASDAQ Symbol'].tolist()

In [12]:
def move_symbols(symbols, dest):
    for s in symbols:
        filename = '{}.h5'.format(s)
        shutil.move(join('hist', filename), join(dest, filename))
        
move_symbols(etfs, "etfs")
move_symbols(stocks, "stocks")

In [13]:
! rmdir hist

## Train/test split

### Stocks

In [14]:
!mkdir train
!mkdir test

In [15]:
# Path to the directory with the original hdf5 files
directory = "stocks/"

# Path to the directory where you want to save the train/test split files
train_directory = "train/"
test_directory = "test/"

# Iterate over the files in the original directory
for filename in os.listdir(directory):
    if filename.endswith(".h5"):
        # Load the data
        df = pd.read_hdf(os.path.join(directory, filename))
        
        # Skip datasets with fewer than 10 records
        if len(df) < 10:
            print(f"Skipping {filename} due to insufficient data.")
            continue

        # Split the data into a train and test set
        train, test = train_test_split(df, test_size=0.2)

        # Save the train and test sets into new hdf5 files
        train.to_hdf(os.path.join(train_directory, filename), key='train')
        test.to_hdf(os.path.join(test_directory, filename), key='test')

### Crypto

In [16]:
# Path to the directory with the original hdf5 files
directory = "crypto/"

# Path to the directory where you want to save the train/test split files
train_directory = "train/"
test_directory = "test/"

# Iterate over the files in the original directory
for filename in os.listdir(directory):
    if filename.endswith(".h5"):
        # Load the data
        df = pd.read_hdf(os.path.join(directory, filename))

        # Split the data into a train and test set
        train, test = train_test_split(df, test_size=0.2)

        # Save the train and test sets into new hdf5 files
        train.to_hdf(os.path.join(train_directory, filename), key='train')
        test.to_hdf(os.path.join(test_directory, filename), key='test')

## Target_day

In [17]:
df = pd.read_hdf("train/BTC-USD.h5")
df['target_day'] = (df['BTC-USD_Close'].shift(-1) > df['BTC-USD_Close']).astype(int)
df.to_hdf("train/BTC-USD.h5", key='train')

In [18]:
df = pd.read_hdf("test/BTC-USD.h5")
df['target_day'] = (df['BTC-USD_Close'].shift(-1) > df['BTC-USD_Close']).astype(int)
df.to_hdf("test/BTC-USD.h5", key='test')