### Imports

In [1]:
import os
import sys
import json
import boto3
import requests
import datetime
from tqdm.auto import tqdm
import asyncio
import aioboto3
from aiohttp import ClientSession
import numpy as np 
import pandas as pd 
from io import StringIO
import pandas_market_calendars as mcal

import matplotlib.pyplot as plt
%matplotlib inline

%load_ext autoreload
%autoreload 2

import sys
sys.path.append("C:/Users/SHIRAM/Documents/Waves - All TimeSeries Related/")
from polygonio_api_calls import get_all_tickers, generate_urls, download_all

### Read Data Status

#### Functions

In [2]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx : min(ndx + n, l)]


async def get_one_object_from_s3(bucket: str, key: str) -> json:
    """
    Get JUST ONE object from S3, and push it into the redis database as a redis-json object.
    :param bucket: The main bucket
    :param key: The key that's to be used
    :return: the json decoded string
    """
    session = aioboto3.Session()
    async with session.client("s3") as s3:
        try:
            s3_obj = await s3.get_object(Bucket=bucket, Key=key)
            result = await s3_obj["Body"].read()
            result = json.loads(result.decode("utf-8"))

            with get_redis_conn() as re_db:
                re_db.jsonset(
                    name=key, path=rejson.Path.rootPath(), obj=result,
                )
                return 1
        except Exception as e:
            pass

def split_dates(date_str: str) -> list:
    """
    Just a helper function to change something from %Y-%m-%d to %Y, %m, %d.
    :param date_str: The %Y-%m-%d formatted date string.
    :return: the year, mon, day separated.
    """
    try:
        _ = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError as _:
        raise ValueError("Please ensure the date is of format:= %Y-%m-%d")

    return [date_str.split("-")[0], date_str.split("-")[1], date_str.split("-")[2]]

def generate_s3_paths(tickers: list, start: str, end: str, timeframe:str, adjusted:bool, current_data_status: dict):
    """
    Generate all keys to access the tickers from s3
    :param tickers: the tickers that we're looking for
    :param start: the start date, %Y-%m-%d
    :param end: the end date, %Y-%m-%d
    :param current_data_status: the currentDataStatusAdj.json
    :return: a list of keys
    """
    # determine which key start is located in
    print("-- Generating s3 paths...")
    start, end = pd.to_datetime(start), pd.to_datetime(end)
    indices = {"start_date_insert_key": [], "end_date_insert_key": []}
    current_data_status_focus = current_data_status[timeframe]
    for insert_date in current_data_status_focus.keys():
        from_to = np.unique(np.ravel(np.array(current_data_status_focus[insert_date])))
        from_to = sorted([pd.to_datetime(x) for x in from_to.tolist()])
        from_, to_ = from_to[0], from_to[-1]

        if from_ <= start <= to_:
            indices["start_date_insert_key"] = insert_date

        if from_ <= end <= to_:
            indices["end_date_insert_key"] = insert_date

    if len(indices["end_date_insert_key"]) == 0:
        print(
            "Looks like there's no end_date_insert_key, try pushing the end date back a few days..."
        )
        return None

    inserted_on_dates = sorted(
        [pd.to_datetime(x) for x in current_data_status_focus.keys()]
    )

    keys_in_between = [
        x.strftime("%Y-%m-%d")
        for x in inserted_on_dates
        if pd.to_datetime(indices["start_date_insert_key"])
        < x
        < pd.to_datetime(indices["end_date_insert_key"])
    ]

    all_keys = (
        [indices["start_date_insert_key"]]
        + keys_in_between
        + [indices["end_date_insert_key"]]
    )

    # urls are formatted as -> aggs/ins_yr/ins_mon/ins_day/{timespan}/{multiplier}/frm_yr/frm_mon/frm_day/to_yr/to_mon/to_year
    all_urls = []
    for i, key in enumerate(all_keys):
        from_to = current_data_status_focus[key]
        all_dates = np.unique(
            np.array([pd.to_datetime(x) for x in np.ravel(np.array(from_to))])
        ).tolist()

        if i == 0:
            all_dates_ranges = np.array(
                [
                    x.strftime("%Y-%m-%d")
                    for x in pd.date_range(start, all_dates[-1], freq="1d")
                ]
            )
        else:
            all_dates_ranges = np.array(
                [
                    x.strftime("%Y-%m-%d")
                    for x in pd.date_range(all_dates[0], all_dates[-1], freq="1d")
                ]
            )

        fmt_dates_ranges = [
            [x, y] for x, y in zip(np.roll(all_dates_ranges, 1), all_dates_ranges)
        ][1:]

        for f, t in fmt_dates_ranges:
            ins_y, ins_m, ins_d = split_dates(date_str=key)
            [f_y, f_m, f_d], [t_y, t_m, t_d] = (
                split_dates(date_str=f),
                split_dates(date_str=t),
            )
            if adjusted:
                all_urls += [
                    f"aggs/adj/{ins_y}/{ins_m}/{ins_d}/day/1/{f_y}/{f_m}/{f_d}/{t_y}/{t_m}/{t_d}/{ticker}/data.json"
                    for ticker in tickers
                ]
            else:
                all_urls += [
                    f"aggs/{ins_y}/{ins_m}/{ins_d}/day/1/{f_y}/{f_m}/{f_d}/{t_y}/{t_m}/{t_d}/{ticker}/data.json"
                    for ticker in tickers
                ]

    print("-- Done.")
    return all_urls

def make_agg_urls(
    tickers: list,
    start_date: datetime.datetime,
    end_date: datetime.datetime,
    timespan: str,
    api_key: str,
):
    host = "https://api.polygon.io/v2/aggs"
    params = {"sort": "asc", "limit": "1000", "apiKey": api_key}
    column_order = ["ticker", "timestamp", "open", "high", "low", "close", "volume", "vwap", "n"]
    all_dfs = []
    for ticker in tickers:
        url = f"{host}/ticker/{ticker}/range/1/{timespan}/{start_date}/{end_date}"
        response = requests.get(url, params=params)
        if response.status_code == 200:
            response = response.json()
            df_ = pd.DataFrame(response["results"])
            df_.columns = [
                "volume",
                "vwap",
                "open",
                "close",
                "high",
                "low",
                "timestamp",
                "n",
            ]
            df_["ticker"] = response["ticker"]
            df_.loc[:, "timestamp"] = pd.to_datetime(df_["timestamp"], unit="ms")
            df_ = df_[column_order]
            all_dfs.append(df_)
    res_df = pd.concat(all_dfs)
    return res_df

if "postprocess_downloads" in locals(): 
    del postprocess_downloads
    
def postprocess_downloads(all_results: list): 
    results = []
    for res in all_results:
        if res and ("results" in res.keys()):
            r = res["results"]
            for ele in r:
                ele["ticker"] = res["ticker"]
                results.append(ele)

    df = pd.DataFrame.from_records(results)
    df.loc[:, "t"] = pd.to_datetime(df["t"], unit="ms")
    df["t"] = df["t"].dt.date
    
    df = df.drop_duplicates(subset=["t", "ticker"], inplace=False)
    p_df = df.pivot(columns=["ticker"], index=["t"], values=["c"]).reset_index()
    
    cols = p_df.columns.droplevel(0).tolist()
    cols[0] = "datetime"
    p_df.columns = cols
    
    p_df = p_df.set_index("datetime", inplace=False)
    return p_df

def make_all_returns(all_df: pd.DataFrame, tickers: list, calendar_schedule: pd.DataFrame):
    ### Convert all these prices to returns
    all_returns_dfs = []
    for ticker in tqdm(tickers):
        df_ = all_df.loc[all_df.index > datetime.datetime.strptime("2019-01-01", "%Y-%m-%d").date(), ticker]
        df_ = pd.DataFrame(df_, columns=[ticker])
        df_ = pd.merge(left=df_, right=calendar_schedule, left_index=True, right_index=True)
        df_ = df_.drop(columns=["market_open", "market_close"], inplace=False)
        df_[f"{ticker}_ret"] = ( df_[ticker].diff() / df_[ticker].shift(1) ).fillna(0.0)
        df_ = df_.drop(columns=[ticker], inplace=False)
        all_returns_dfs.append(df_)
        
    returns_df = pd.concat(all_returns_dfs, axis=1)
    return returns_df

### Fetch

In [18]:
tickers = get_all_tickers(active=True, limit=1000, only_tickers=True)
currencies = [x for x in tickers if ("X:" in x) or ("C:" in x)]
stocks = sorted(list(set(tickers) - set(currencies)))

In [19]:
params = {
    "tickers": stocks,
    "from_": datetime.datetime(year=2015, month=1, day=1).strftime("%Y-%m-%d"),
    "to_": datetime.datetime.today().strftime("%Y-%m-%d"),
    "multiplier": "1",
    "timespan": "day",
    "endpoint": "aggs",
    "adjusted": True,
}

all_urls = generate_urls(all_params=params)

In [20]:
loop = asyncio.get_running_loop()
all_results = loop.create_task(download_all(all_urls=all_urls))

100%|███████████████████████████████████████████████████████████████████| 11079/11079 [02:09<00:00, 85.42it/s]


In [21]:
all_results_list = all_results.result()
all_df = postprocess_downloads(all_results=all_results_list)

In [23]:
# Create a calendar
nyse_calendar = mcal.get_calendar('NYSE')
calendar_schedule = nyse_calendar.schedule(start_date=all_df.index[0], end_date=all_df.index[-1])

In [24]:
returns_df = make_all_returns(all_df=all_df, tickers=tickers, calendar_schedule=calendar_schedule)

  0%|          | 0/11070 [00:00<?, ?it/s]

### Get all ticker details

### Push into zipline

In [None]:
all_df.to_csv("../data/all_df.csv")
returns_df.to_csv("../data/returns_df.csv")

In [29]:
all_df

Unnamed: 0_level_0,A,AA,AAA,AAAU,AAC,AAC.U,AAC.WS,AACG,AACIU,AADI,...,ZUO,ZVIA,ZVO,ZWRK,ZWRKU,ZWRKW,ZY,ZYME,ZYNE,ZYXI
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,40.56,47.64,,,30.21,,,,,,...,,,,,,,,,,
2015-01-05,39.80,44.88,,,29.73,,,,,,...,,,,,,,,,,
2015-01-06,39.18,45.21,,,29.38,,,,,,...,,,,,,,,,,
2015-01-07,39.70,46.38,,,28.64,,,,,,...,,,,,,,,,,
2015-01-08,40.89,47.70,,,29.37,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-03,179.28,47.21,25.005,18.1898,9.72,9.90,0.9500,2.77,,27.59,...,18.40,16.91,2.55,,9.85,0.6494,14.035,32.47,4.45,13.18
2021-09-07,177.72,46.77,25.005,17.8400,9.72,9.90,0.9500,2.82,9.85,26.48,...,18.00,16.12,2.57,,9.85,0.6700,13.380,32.04,4.46,13.12
2021-09-08,178.73,45.73,25.005,17.7800,9.71,9.88,0.9469,2.71,,27.67,...,17.12,14.21,2.39,,,0.6199,13.180,31.99,4.32,12.90
2021-09-09,177.23,47.51,25.005,17.8700,9.74,9.91,0.9204,2.78,9.88,27.53,...,17.25,13.95,2.40,9.67,9.86,0.6300,13.190,32.12,4.55,12.77


### Caterpillar

In [88]:
import io
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error

urls = ["http://www.cryptodatadownload.com/cdd/gemini_BTCUSD_1hr.csv", ]
response = requests.get(btc_url, verify=False).content
btc_df = pd.read_csv(io.StringIO(response.decode('utf-8')), skiprows=1)

btc_df = btc_df.drop(columns=["Unix Timestamp"], inplace=False)
btc_df.columns = [x.lower() for x in btc_df.columns.tolist()]
btc_df.loc[:, "date"] = pd.to_datetime(btc_df["date"])
btc_df = btc_df.sort_values(by="date", ascending=True)
btc_df = btc_df.reset_index(inplace=False, drop=True)
btc_df = btc_df.set_index("date")

In [129]:
dur_past, dur_pred, n_similar = 14, 28, 5

price_change = (btc_df["close"] / btc_df["close"].shift(1)).fillna(0.0)
price_tail = price_change.tail(14)
price_err = [0] * ( len(price_change) - dur_past - dur_pred )

In [167]:
for i in tqdm(range(1, len(price_err))):
    price_err[i] = mean_squared_error(y_true=price_tail, y_pred=price_change[0 + (i - 1): (dur_past + (i - 1))], squared=False)

  0%|          | 0/51830 [00:00<?, ?it/s]

In [168]:
price_err

[0,
 0.267022984482203,
 0.01919198340319353,
 0.01792435602042921,
 0.018647393081803652,
 0.018872585637371132,
 0.018589794837170256,
 0.01884185337334251,
 0.018670652323728314,
 0.018408996094188675,
 0.018716106223480872,
 0.018756344752161477,
 0.018757698854724274,
 0.018750307750208296,
 0.018723884045121677,
 0.018790291620336144,
 0.018727336143936683,
 0.018804811780802597,
 0.018758290028497774,
 0.020204569226604685,
 0.0200342421299385,
 0.02017008969852802,
 0.021570454670850604,
 0.019894833313420547,
 0.02108492787173195,
 0.019526895470236942,
 0.02193218917260392,
 0.017760519096240747,
 0.022991319271182517,
 0.024378238072106158,
 0.016709225562527758,
 0.019985247659445312,
 0.020690086011411698,
 0.01920391548120709,
 0.018851603349988196,
 0.0187953026965501,
 0.018865056930472517,
 0.01864893697104658,
 0.01876685543719162,
 0.018758492181036243,
 0.018837352321458955,
 0.01862164854390269,
 0.018942040534088135,
 0.01875691012687386,
 0.017790535375682013,
 0

In [165]:
closest_predictions = sorted(price_err)[:5]
closest_predictions_incides = [list(np.argwhere(price_err == ele))[0][0] if len(np.argwhere(price_err == ele)) > 0 else 0.0001 for ele in closest_predictions ]

In [160]:
weighted_preds = [0] * dur_pred

In [163]:
1/sum(closest_predictions)

3870.9131856506224

In [166]:
for i in range(n_similar):
    weighted_preds += price_change[(dur_past + closest_predictions_incides[i]): (dur_past + dur_pred + closest_predictions_incides[i] - 1)] * ( (1/closest_predictions[i]) / 1/sum(closest_predictions))

TypeError: cannot do slice indexing on DatetimeIndex with these indexers [14.0001] of type float