In [None]:
import ast
import os
from time import sleep

import alpaca_trade_api as tradeapi
import pandas as pd
import requests


# Credentials are read from environment variables so real keys never have to be
# saved inside the notebook. An unset variable falls back to "" (the previous
# placeholder value).

# Alpha Vantage API Key https://www.alphavantage.co/support/#api-key
ALPHA_VANTAGE_API_KEY = os.environ.get("ALPHA_VANTAGE_API_KEY", "")

# Polygon
POLYGON_API_KEY = os.environ.get("POLYGON_API_KEY", "")

# Alpaca
APCA_API_BASE_URL = "https://api.alpaca.markets"
APCA_API_KEY_ID = os.environ.get("APCA_API_KEY_ID", "")
APCA_API_SECRET_KEY = os.environ.get("APCA_API_SECRET_KEY", "")


# Symbols that are downloaded (combined into `tickers` below).
equities = ['XOM','CVX', 'SHEL', 'COP', 'BP', 'PBR']
more_equities = ['WTI']

# Currently empty; the commented-out symbols are kept for reference.
crude_oil = []#['CL=F', 'BZ=F'] # wti, brent, 

# Not included in `tickers`. NOTE(review): this name would shadow the stdlib
# `random` module if that module were ever imported in this notebook.
random = ["TSLA", "AAPL"]

tickers = equities + more_equities + crude_oil


def csv_str_to_df(decoded_content, ticker):
    """Turn a comma-separated text payload into a ticker-keyed DataFrame.

    The first line is treated as the header; its first field is replaced by
    "date" and that column becomes the index. Fields are split on "," by hand,
    so every cell value stays a string. The resulting columns are nested under
    `ticker` as an outer column level.
    """
    rows = decoded_content.splitlines()
    column_names = ["date"] + rows[0].split(',')[1:]
    records = [line.split(',') for line in rows[1:]]

    frame = pd.DataFrame(records, columns=column_names)
    frame = frame.reset_index(drop=True).set_index('date')

    # Wrap the single frame so its columns become (ticker, field) pairs.
    return pd.concat([frame], axis=1, keys=[ticker])

##### Get Data From Data Provider

In [None]:
def alpha_vantage_get_ticker_data(ticker, time="1min", year=1, month=1, request_timeout=30):
    """Download one (ticker, year, month) slice from Alpha Vantage's intraday extended API.

    Args:
        ticker: Symbol to request.
        time: Bar interval passed as the `interval` query parameter.
        year: Slice year number used to build `slice=year{year}month{month}`.
        month: Slice month number used to build `slice=year{year}month{month}`.
        request_timeout: Seconds to wait for the HTTP response before
            `requests` raises a timeout error, so a stalled connection cannot
            hang the notebook cell indefinitely.

    Returns:
        A DataFrame built by `csv_str_to_df`, or None when the response looks
        like an error and the slice is skipped.
    """
    CSV_URL = f"https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol={ticker}&interval={time}&slice=year{year}month{month}&apikey={ALPHA_VANTAGE_API_KEY}"

    # Body lengths used to classify non-data responses. These are empirical
    # values carried over from the original implementation.
    rate_limit_length = 236   # "too many requests" message
    max_error_length = 243    # anything this short is treated as an error

    # One session is reused across retries instead of opening a new one per attempt.
    with requests.Session() as s:
        while True:
            download = s.get(CSV_URL, timeout=request_timeout)
            decoded_content = download.content.decode('utf-8')
            print(f"ticker: {ticker}, y{year} m{month}; response length: {len(decoded_content)}")

            # A body that starts with "{" is a JSON message, never CSV data.
            # NOTE(review): the throttling message is assumed to be reported
            # under a "Note" key - confirm against the Alpha Vantage docs.
            looks_like_json = decoded_content.lstrip().startswith("{")
            is_rate_limited = len(decoded_content) == rate_limit_length or (
                looks_like_json and '"Note"' in decoded_content
            )

            if is_rate_limited:
                # API too many requests: wait a minute, then try again.
                sleep(60)
            elif len(decoded_content) <= max_error_length or looks_like_json:
                # Token doesn't exist or something
                print(f"Error getting {ticker}, y{year}, m{month}. We are skipping")
                print(decoded_content)
                return None
            else:
                return csv_str_to_df(decoded_content, ticker)


def use_alpha_vantage(tickers, time="1min", out_file="realdata.csv", years=range(1, 3), months=range(1, 13)):
    """Download every requested slice for several tickers and save them to one CSV.

    Args:
        tickers: Iterable of symbols to download.
        time: Bar interval forwarded to `alpha_vantage_get_ticker_data`.
        out_file: Path of the CSV file to write.
        years: Slice year numbers to request (defaults to 1 and 2).
        months: Slice month numbers to request (defaults to 1 through 12).

    Returns:
        A DataFrame indexed by "date" whose columns are nested under each ticker.

    Raises:
        ValueError: If no slice could be downloaded for any ticker (including
            an empty `tickers`), since there is nothing to combine or save.
    """
    # Materialize so one-shot iterators survive the nested loops.
    years = list(years)
    months = list(months)

    per_ticker_frames = []
    for ticker in tickers:
        slices = []
        for year in years:
            for month in months:
                df_temp = alpha_vantage_get_ticker_data(ticker, time=time, year=year, month=month)
                if df_temp is not None:
                    slices.append(df_temp)

        if slices:
            per_ticker_frames.append(pd.concat(slices, axis=0))
        else:
            print(f"Skipped {ticker}.")

    if not per_ticker_frames:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError("No data was downloaded for any ticker; nothing to save.")

    df = pd.concat(per_ticker_frames, axis=1, sort=True)
    df.index.name = 'date'

    # Save with flattened column labels, then restore the nested columns even
    # if writing the file fails.
    og_cols = df.columns.copy()
    df.columns = df.columns.to_flat_index()
    try:
        df.to_csv(out_file)
    finally:
        df.columns = og_cols

    return df


def use_alpaca(tickers, alpaca, timeframe="1Minute", out_file="realdata_alp.csv", start="2017-01-01"):
    """Download bars for several tickers through an Alpaca client and save them to one CSV.

    Args:
        tickers: Iterable of symbols to download.
        alpaca: Client object exposing `get_bars(symbol, timeframe, start)`
            whose result has a `.df` DataFrame attribute.
        timeframe: Bar size forwarded to `get_bars`.
        out_file: Path of the CSV file to write.
        start: First date to request. The default was noted by the original
            author as the earliest available date.

    Returns:
        A DataFrame indexed by "date" whose columns are nested under each ticker.

    Raises:
        ValueError: If `tickers` is empty, since there is nothing to combine or save.
    """
    dfs = []
    for ticker in tickers:
        print("Getting", ticker)
        # A stray argument-less `alpaca.get_bars()` call used to sit here; it
        # raised TypeError before any data could be fetched.
        df = alpaca.get_bars(ticker, timeframe, start).df
        print("Recieved", ticker)
        df.index.name = 'date'
        # Nest this ticker's columns under the ticker symbol.
        df = pd.concat([df], axis=1, keys=[ticker])
        dfs.append(df)

    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError("No tickers were provided; nothing to download or save.")

    df = pd.concat(dfs, axis=1, sort=True)
    df.index.name = 'date'

    # Save with flattened column labels, then restore the nested columns even
    # if writing the file fails.
    og_cols = df.columns.copy()
    df.columns = df.columns.to_flat_index()
    try:
        df.to_csv(out_file)
    finally:
        df.columns = og_cols

    return df

In [None]:
# Build the Alpaca REST client from the credentials configured above and
# print the account status as a quick connectivity check.
alpaca = tradeapi.REST(
    key_id=APCA_API_KEY_ID,
    secret_key=APCA_API_SECRET_KEY,
    base_url=APCA_API_BASE_URL,
)
account = alpaca.get_account()
print(account.status)

# Download 1-minute bars for every configured ticker and save them to CSV.
df = use_alpaca(
    tickers,
    alpaca,
    timeframe="1Minute",
    out_file="realdata_alp.csv",
)

In [None]:
# Download every configured ticker from Alpha Vantage and save it to CSV.
# Note: this rebinds `df`, replacing any frame assigned to it by an earlier cell.
df = use_alpha_vantage(
    tickers,
    out_file="realdata.csv",
)

In [None]:
# ticker = "XOM"
# year = 1
# month = 1

# # Minute
# with requests.Session() as s:
#     download = s.get(f"https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY_EXTENDED&symbol={'CVX'}&interval={'1min'}&slice=year{'1'}month{'1'}&apikey={ALPHA_VANTAGE_API_KEY}")
#     decoded_content = download.content.decode('utf-8')

#     print(decoded_content)

# # Daily
# with requests.Session() as s:
#     ticker = "XOM"
#     download = s.get(f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/2020-07-22/2020-07-22?adjusted=true&sort=asc&limit=5000&apiKey={POLYGON_API_KEY}")
#     decoded_content = download.content.decode('utf-8')

#     print(decoded_content)

# alpha_vantage_get_ticker_data("CVX")


# symbols = "XOM"
# timeframe = "1Minute"
# start = "2017-01-01"
# end = "2017-01-01"
# data = alpaca.get_bars(symbols, timeframe, start).df
# print(data.columns)

##### Read Data From All-Data CSV (Multi Index Columns)

In [None]:
def read_data(out_file="realdata.csv"):
    """Load a saved CSV and rebuild its tuple column labels.

    The first CSV column becomes the index. Each remaining header is passed
    through `ast.literal_eval`, so headers written as tuple literals such as
    "('XOM', 'close')" become real tuples again.
    """
    frame = pd.read_csv(out_file, index_col=0)
    restored_columns = frame.columns.map(ast.literal_eval)
    frame.columns = restored_columns
    return frame

# Load the full saved dataset and preview the first rows.
df_all = read_data(out_file="realdata.csv")
df_all.head()

# Filtering & Processing the Master Dataset

In [None]:
def just_close_data(data):
    """Return only the "close" columns of a frame with two-level columns.

    Takes the cross-section of the column axis where the second level equals
    "close".
    """
    close_only = data.xs("close", axis=1, level=1)
    return close_only

def no_premarket_after_hours():
    """Placeholder that is not implemented yet; takes no arguments and returns None."""
    return None

In [None]:
# Keep only the "close" columns of the full dataset, then preview the result.
df_close = just_close_data(data=df_all)
df_close.head()

##### Fill NaNs

In [None]:
def ffill_nans(df):
    """Forward-fill missing values, then drop rows that still contain NaN.

    Rows before a column's first valid value cannot be forward-filled, so they
    are removed by the final `dropna`. The input frame is not modified.

    Args:
        df: DataFrame to fill.

    Returns:
        A new DataFrame without any NaN values.
    """
    # `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")`.
    filled = df.ffill()
    return filled.dropna()

def del_nans_ffill(df, thresh):
    """Drop sparse rows, then forward-fill what remains.

    Rows with fewer than `thresh` non-NaN values are removed first; the
    surviving rows are then passed through `ffill_nans`.
    """
    dense_rows = df.dropna(thresh=thresh)
    return ffill_nans(dense_rows)

In [None]:
# Two processed variants of the close-price frame; the trailing numbers are
# the row counts recorded by the original author.
df_processed_dups = ffill_nans(df=df_close)  # 138,607
df_processed_holes = del_nans_ffill(df=df_close, thresh=3)  # 91,693
# df_close # 348,724

##### Save Data

In [None]:
# Create the output directory first: `to_csv` does not create missing parent
# folders, so this cell would otherwise fail on a fresh checkout.
os.makedirs("data/ETT", exist_ok=True)

df_processed_dups.to_csv("data/ETT/processed_dups.csv")
df_processed_holes.to_csv("data/ETT/processed_holes.csv")