In [1]:
import polars as pl
from pathlib import Path
from config import STOCK_DATA_SAVE_DIR, TICKERS, TRAIN_TEST_SPLIT_PERCENT
import numpy as np

In [2]:
# Read the saved price history and keep only the three columns used below,
# restricted to rows whose ticker symbol ends in ".NS".
parquet_filename = Path(STOCK_DATA_SAVE_DIR).joinpath("random-778-tickers.parquet")
df = (
    pl.read_parquet(parquet_filename)
    .select(["Date", "Ticker", "Close"])
    .filter(pl.col("Ticker").str.ends_with(".NS"))
)

# Apply the same suffix filter to the configured ticker list.
TICKERS = list(filter(lambda symbol: symbol.endswith(".NS"), TICKERS))

# Rows available per ticker; value_counts(sort=True) orders by frequency, so the
# first row holds the row count shared by the largest number of tickers.
rows_per_ticker = df.groupby("Ticker").count()["count"]
max_value_to_be_considered = rows_per_ticker.value_counts(sort=True).row(0)[0]
max_value_to_be_considered

2508

In [3]:
# Preview the frame without its first 10 and last 10 rows
# (a negative length is counted back from the end of the frame).
df.slice(offset=10, length=-10)

Date,Ticker,Close
"datetime[ns, UTC]",str,f64
2022-03-15 03:45:00 UTC,"""ADANIGREEN.NS""",1826.699951
2022-03-15 03:45:00 UTC,"""ADANIPORTS.NS""",730.150024
2022-03-15 03:45:00 UTC,"""ADANIPOWER.NS""",125.599998
2022-03-15 03:45:00 UTC,"""ADANITRANS.NS""",2244.149902
2022-03-15 03:45:00 UTC,"""AEGISCHEM.NS""",192.399994
2022-03-15 03:45:00 UTC,"""AFFLE.NS""",1214.349976
2022-03-15 03:45:00 UTC,"""AIAENG.NS""",1680.0
2022-03-15 03:45:00 UTC,"""AJANTPHARM.NS""",1195.866699
2022-03-15 03:45:00 UTC,"""AKZOINDIA.NS""",1934.900024
2022-03-15 03:45:00 UTC,"""ALKEM.NS""",3410.100098


In [4]:
# Lag offsets (in rows) for the PAST_<k>_HOUR feature columns.
technical_indicators = list(range(1, 15))

# Build one feature frame per ticker and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows every iteration.
ticker_frames = []
for ticker in TICKERS:
    # Keep the last `max_value_to_be_considered` rows of this ticker, then drop
    # repeated timestamps (order preserved).
    # NOTE(review): "last" means "most recent" only if the parquet is stored in
    # ascending date order — confirm.
    tmp_df = (
        df.filter(pl.col("Ticker") == ticker)
        .tail(max_value_to_be_considered)
        .unique(subset=["Date"], maintain_order=True)
    )

    # Tickers with a shorter history, or with duplicated timestamps inside the
    # window, do not reach the required row count and are skipped.
    if tmp_df.shape[0] == max_value_to_be_considered:
        ticker_frames.append(
            tmp_df.with_columns([
                pl.col("Close").shift(hour).alias(f"PAST_{hour}_HOUR")
                for hour in technical_indicators
            ])
        )

if not ticker_frames:
    raise ValueError(
        "No ticker has exactly max_value_to_be_considered unique rows; cannot build features"
    )

# All per-ticker frames share the same schema, so a plain vertical concat suffices.
features_df = pl.concat(ticker_frames, how="vertical")

# The first rows of each ticker have no full lag history (shift leaves nulls); drop them.
features_df = features_df.drop_nulls().sort("Date", descending=False)

# Keep only timestamps at which every remaining ticker has a row, so each
# Date contributes the same number of rows.
n_tickers = features_df["Ticker"].n_unique()
features_df = features_df.filter(pl.col("Ticker").count().over("Date") == n_tickers)
features_df

Date,Ticker,Close,PAST_1_HOUR,PAST_2_HOUR,PAST_3_HOUR,PAST_4_HOUR,PAST_5_HOUR,PAST_6_HOUR,PAST_7_HOUR,PAST_8_HOUR,PAST_9_HOUR,PAST_10_HOUR,PAST_11_HOUR,PAST_12_HOUR,PAST_13_HOUR,PAST_14_HOUR
"datetime[ns, UTC]",str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2022-03-17 04:45:00 UTC,"""IDEA.NS""",10.4,10.4,10.3,10.25,10.25,10.1,10.2,10.15,10.15,10.0,10.05,10.1,10.2,10.35,10.4
2022-03-17 04:45:00 UTC,"""SUZLON.NS""",9.95,10.0,10.0,9.95,9.95,10.0,10.0,10.0,9.95,9.8,9.75,9.75,9.95,10.1,10.1
2022-03-17 04:45:00 UTC,"""GMRINFRA.NS""",38.299999,38.549999,38.5,38.450001,38.450001,38.349998,38.549999,38.400002,38.349998,37.450001,37.650002,37.75,37.849998,38.400002,38.450001
2022-03-17 04:45:00 UTC,"""ZOMATO.NS""",78.800003,78.699997,76.0,75.849998,76.199997,75.949997,76.650002,76.849998,76.150002,76.849998,76.599998,77.150002,77.599998,77.650002,78.0
2022-03-17 04:45:00 UTC,"""YESBANK.NS""",12.8,12.8,12.8,12.8,12.8,12.75,12.8,12.8,12.75,12.65,12.75,12.75,12.7,12.75,12.75
2022-03-17 04:45:00 UTC,"""PNB.NS""",36.099998,36.200001,35.950001,35.950001,35.75,35.700001,35.950001,35.950001,36.049999,35.900002,36.0,36.150002,36.349998,36.799999,36.950001
2022-03-17 04:45:00 UTC,"""IRFC.NS""",22.200001,22.450001,22.299999,22.200001,22.15,22.15,22.200001,22.200001,22.25,22.200001,22.200001,22.200001,22.1,22.299999,22.299999
2022-03-17 04:45:00 UTC,"""BEL.NS""",69.316666,70.066666,69.666664,69.51667,69.48333,69.599998,69.333336,69.533333,69.166664,68.683334,68.400002,68.48333,69.083336,69.566666,69.466667
2022-03-17 04:45:00 UTC,"""TRIDENT.NS""",54.400002,54.549999,54.299999,54.25,54.099998,54.150002,54.349998,54.5,54.549999,54.700001,54.0,54.799999,55.049999,55.400002,55.25
2022-03-17 04:45:00 UTC,"""IDFCFIRSTB.NS""",42.599998,42.549999,42.0,42.150002,41.25,41.200001,41.400002,41.349998,41.25,40.650002,40.799999,40.900002,41.5,42.25,42.349998


In [5]:
# Row count of one ticker group (the first row returned by the group count).
total = features_df.groupby("Ticker").count().row(0)[1]

# Reserve TRAIN_TEST_SPLIT_PERCENT of those rows for trading; the rest is training.
held_out = int(total * TRAIN_TEST_SPLIT_PERCENT)
train_size = total - held_out
# NOTE(review): the trade window is one step shorter than the held-out part,
# so the final timestamp is never used — confirm this is intended.
test_size = held_out - 1

# Convert per-ticker step counts into row counts of the stacked frame.
n_tickers = features_df["Ticker"].n_unique()
train_end_index = train_size * n_tickers
# Despite its name this is a row *count*: it is passed as the slice length below.
trade_end_index = test_size * n_tickers

train_df = features_df.slice(offset=0, length=train_end_index)
trade_df = features_df.slice(offset=train_end_index, length=trade_end_index)

In [21]:
# Every column except the two key columns is exported as a numeric feature.
cols = [column for column in trade_df.columns if column not in ("Date", "Ticker")]

# One 2-D array per timestamp (one row per row of that Date group, columns = `cols`).
# maintain_order=True is required: without it polars does not guarantee the
# order in which groups are yielded, so the arrays could come out in a
# non-chronological order.
arr = [
    data.select(cols).to_numpy()
    for _, data in trade_df.groupby("Date", maintain_order=True)
]

trade_arrays = np.asarray(arr)
len(trade_arrays)

372

In [10]:
# Read four arrays back from a single .npy file, in the order they are stored.
# This rebinds `trade_arrays` and `TICKERS` to whatever the file contains.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code — only load files produced by a trusted source.
with open(f"{STOCK_DATA_SAVE_DIR}/train-trade.npy", "rb") as npy_file:
    train_arrays, trade_arrays, TICKERS, TECHNICAL_INDICATORS = [
        np.load(npy_file, allow_pickle=True, fix_imports=True)
        for _ in range(4)
    ]

In [22]:
# Show the length and contents of the first element only.
for arr in trade_arrays[:1]:
    print(len(arr))
    print(arr)

352
[[7.40000010e+00 7.44999981e+00 7.40000010e+00 ... 7.19999981e+00
  7.05000019e+00 7.05000019e+00]
 [1.33999996e+01 1.33000002e+01 1.33500004e+01 ... 1.35500002e+01
  1.23999996e+01 1.21499996e+01]
 [4.21500015e+01 4.22999992e+01 4.22999992e+01 ... 4.21500015e+01
  4.17500000e+01 4.16500015e+01]
 ...
 [1.37513496e+04 1.37500000e+04 1.37540000e+04 ... 1.37660000e+04
  1.37700000e+04 1.37850000e+04]
 [2.67510000e+04 2.68433008e+04 2.70490000e+04 ... 2.74250000e+04
  2.74750000e+04 2.74500000e+04]
 [4.10671484e+04 4.09000000e+04 4.09000000e+04 ... 4.08899492e+04
  4.08998516e+04 4.09150000e+04]]
