In [1]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [2]:
import sys
sys.path.append("../")
import os
import datetime
import pandas as pd
import pandas_market_calendars as mcal
from polygon import RESTClient
#from tqdm.notebook import tqdm
from progress.bar import Bar
import time
import pytz
eastern = 'US/Eastern'

In [3]:
trade_map={"I":"orig_id","x":"exchange","p":"price","i":"id","e":"correction","r":"trf_id","t":"sip_timestamp","y":"participant_timestamp","f":"trf_timestamp","q":"sequence_number","c":"conditions","s":"size","z":"tape"
}

trade_conditions = {'0': 'Regular', '1': 'Acquisition', '2': 'AveragePrice', '3': 'AutomaticExecution', '4': 'Bunched', '5': 'BunchSold', '6': 'CAPElection', '7': 'CashTrade', '8': 'Closing', '9': 'Cross', '10': 'DerivativelyPriced', '11': 'Distribution', '12': 'FormT(ExtendedHours)', '13': 'FormTOutOfSequence', '14': 'InterMarketSweep', '15': 'MarketCenterOfficialClose', '16': 'MarketCenterOfficialOpen', '17': 'MarketCenterOpening', '18': 'MarketCenterReOpenning', '19': 'MarketCenterClosing', '20': 'NextDay', '21': 'PriceVariation', '22': 'PriorReferencePrice', '23': 'Rule155Amex', '24': 'Rule127Nyse', '25': 'Opening', '26': 'Opened', '27': 'RegularStoppedStock', '28': 'ReOpening', '29': 'Seller', '30': 'SoldLast', '31': 'SoldLastStoppedStock', '32': 'SoldOutOfSequence', '33': 'SoldOutOfSequenceStoppedStock', '34': 'Split', '35': 'StockOption', '36': 'YellowFlag', '37': 'OddLot', '38': 'CorrectedConsolidatedClosePrice', '39': 'Unknown', '40': 'Held', '41': 'TradeThruExempt', '42': 'NonEligible', '43': 'NonEligible-extended', '44': 'Cancelled', '45': 'Recovery', '46': 'Correction', '47': 'AsOf', '48': 'AsOfCorrection', '49': 'AsOfCancel', '50': 'OOB', '51': 'Summary', '52': 'Contingent', '53': 'Contingent(Qualified)', '54': 'Errored'}

In [4]:
key = "qRFktbNL6A_WzaFhqR80IllApsmjPY6a"

def ts_to_datetime(ts) -> str:
    return datetime.datetime.fromtimestamp(ts / 1000.0).strftime('%Y-%m-%d %H:%M')

def get_list_of_Exchanges():

    with RESTClient(key) as client:
        respex = client.stocks_equities_exchanges()

        stock_exchanges = respex.exchange
        exchange_list_dict = {}

        for i in stock_exchanges:
            the_dict = {}
            the_dict[i.i_d_of_the_exchange] = i.name
            #exchange_list_dict.append(the_dict)
            exchange_list_dict[i.i_d_of_the_exchange] = i.name
        return exchange_list_dict

def get_list_of_trade_conditions():

    with RESTClient(key) as client:
        respex = client.stocks_equities_condition_mappings()

        stock_exchanges = respex.exchange
        exchange_list_dict = {}

        for i in stock_exchanges:
            the_dict = {}
            the_dict[i.i_d_of_the_exchange] = i.name
            #exchange_list_dict.append(the_dict)
            exchange_list_dict[i.i_d_of_the_exchange] = i.name
        return exchange_list_dict
    
def update_condition_labels(keyFunction, values):
    k =  list(keyFunction[str(v)] for v in list(values))
    return ", ".join(k)


In [5]:
exchange_list_dict= get_list_of_Exchanges()
trade_conds = pd.DataFrame(list(trade_conditions.items()),columns = ['condition','meaning']) 
trade_conds.to_csv('trade_cond.csv')

In [6]:
def get_trades(ticker, date, size):
    
    with RESTClient(key) as client:
        resp = client.historic_trades_v2(ticker=str(ticker).upper(), date=date, limit=size, reverse=False)
        return resp.__dict__
    
def get_trades_all(ticker, date, size):
    total_data = []
    lasttimestamp = 0
    while True:
        try:
            with RESTClient(key) as client:
                if lasttimestamp == 0:
                    resp = client.historic_trades_v2(ticker=str(ticker).upper(), date=date, limit=size, reverse=False)
                else:
                    resp = client.historic_trades_v2(ticker=str(ticker).upper(), date=date, timestamp = lasttimestamp, limit=size, reverse=False)
                result = resp.__dict__
                if "results" in result:
                    result = result['results']
                else:
                    break
                
                try:
                    lasttimestamp = result[-1]['t']
                    total_data.extend(result)
                except:
                    print("error last timestamp - "+ ticker)
                    break
                    
                if len(result) == 50000:
                    continue
                else:
                    break
        except:
            print("error get all - "+ ticker)
            break
                
    return total_data

In [7]:
def extract_qualified_symbols(df, plvl, advlvl):
    qualified = df[(df.p_lvl == plvl) & (df.adv_lvl == advlvl)]
    return qualified

def data_preprocessing(trades):
    trades_df = pd.DataFrame(trades).rename(columns=trade_map)
    trades_df['sip_timestamp'] =pd.to_datetime(trades_df['sip_timestamp']).dt.tz_localize('utc').dt.tz_convert(eastern)
    trades_df['participant_timestamp'] =pd.to_datetime(trades_df['participant_timestamp']).dt.tz_localize('utc').dt.tz_convert(eastern)

    trades_df['hours'] = trades_df['sip_timestamp'].apply(lambda x: x.hour)
    trades_df['minutes'] = trades_df['sip_timestamp'].apply(lambda x: x.minute)
    trades_df['seconds'] = trades_df['sip_timestamp'].apply(lambda x: x.second)
    
    trades_df['time']=trades_df['sip_timestamp'].dt.time
    trades_df['conditions_string'] = trades_df['conditions'].astype(str)

    return trades_df

def extract_opening_data(trades_df):
    trades_df['conditions_string'] = trades_df['conditions'].astype(str)

    open_= trades_df[trades_df.conditions_string.str.contains("16")]
    open_ = open_.sort_values('size',ascending=False)

    open_price= list(open_['price'])[0]
    open_size= list(open_['size'])[0]
    open_datetime = list(open_['sip_timestamp'])
    open_time = list(open_['sip_timestamp'].dt.time)[0]
    
    to_return = {
        'open_price':open_price,
        'open_time':open_time,
        'open_size':open_size
    }
    
    return to_return

def extract_closing_data(trades_df):
    
    close= trades_df[trades_df.conditions_string.str.contains("15")]
    close = close.sort_values('size',ascending=False)
    close_price= list(close['price'])[0]
    close_size= list(close['size'])[0]
    close_datetime = list(close['sip_timestamp'])[0]
    close_time = list(close['sip_timestamp'].dt.time)[0]
    
    to_return = {
        'close_price':close_price,
        'close_time':close_time,
        'close_size':close_size
    }
    return to_return

def aggregate_premarket_data(trades_df, open_time):
    premarket= trades_df[trades_df.time < open_time]
    pre_max_price = premarket.price.max()
    pre_min_price = premarket.price.min()
    pre_std_price = premarket.price.std()
    pre_median_price = premarket.price.quantile(0.5)
    pre_volume = premarket.size.sum()
    
    to_return = {
        'pre_max_price':pre_max_price,
        'pre_min_price': pre_min_price,
        'pre_std_price': pre_std_price,
        'pre_median_price': pre_median_price,
        'pre_volume':pre_volume
    }
    
    return to_return

def aggregate_intraday_data(trades_df, open_time, close_time):
    regmarket= trades_df[(trades_df.time >= open_time) & (trades_df.time <= close_time)]
    reg_max_price = regmarket.price.max()
    reg_min_price = regmarket.price.min()
    reg_std_price = regmarket.price.std()
    reg_median_price = regmarket.price.quantile(0.5)
    reg_volume = regmarket.size.sum()

    to_return = {
        'intraday_max_price':reg_max_price,
        'intraday_min_price':reg_min_price, 
        'intraday_std_price':reg_std_price,
        'intraday_median_price':reg_median_price, 
        'intraday_volume':reg_volume
    }
    return to_return    

def aggregate_postmarket_data(trades_df, close_time):
    postmarket= trades_df[trades_df.time > close_time]
    post_max_price = postmarket.price.max()
    post_min_price = postmarket.price.min()
    post_std_price = postmarket.price.std()
    post_median_price = postmarket.price.quantile(0.5)
    post_volume = postmarket.size.sum()

    
    to_return = {
        'post_max_price':post_max_price,
        'post_min_price':post_min_price, 
        'post_std_price':post_std_price,
        'post_median_price':post_median_price, 
        'post_volume':post_volume
    }
    return to_return

def append_aggregation_to_df():
    return

def get_valid_trading_days_for_range(start, end, n):
    nyse = mcal.get_calendar('NYSE')
    days = nyse.valid_days(start_date=start, end_date=end)
    days = [x.strftime('%Y-%m-%d') for x in days]
    
    if n > len(days):
        return list(days)
    nn = n-1
    return list(days)[-nn:]



# MANUAL PROCESS

- make request
- create additional columns
- extract open and close data
- aggregate pre and post market data
- append to df


In [8]:
# main_df = pd.read_csv('stocks-profile.csv')


In [9]:
# Extract required symbols from data
# q_sym = extract_qualified_symbols(main_df, 4, 2)
# q_sym_list = list(q_sym['symbol'])


In [10]:
#Get polygon data for symbol
# response_data = get_trades_all(q_sym_list[2], "2021-02-17", 50000)

In [11]:
# resp_df = data_preprocessing(response_data)


In [12]:
# open_data_extract = extract_opening_data(resp_df)

# close_data_extract = extract_closing_data(resp_df)

# premarket_aggregate = aggregate_premarket_data(resp_df, open_data_extract['open_time'])

# postmarket_aggregate = aggregate_postmarket_data(resp_df, close_data_extract['close_time'])

# intraday_aggregate = aggregate_intraday_data(resp_df, open_data_extract['open_time'], close_data_extract['close_time'])


In [13]:

# merged_data = {**open_data_extract, **close_data_extract, **premarket_aggregate, **intraday_aggregate, **postmarket_aggregate, **{"symbol":q_sym_list[0]}}
# print(merged_data)


In [14]:
# pd.DataFrame([merged_data])

# AUTOMATED CODE VERSION

In [15]:
def batch_symbols_processing(symbols_df, pvlvl, advlvl, date):
    q_sym = extract_qualified_symbols(symbols_df, pvlvl, advlvl)
    q_sym_list = list(q_sym['symbol'])
    final_result = []

    for sym in q_sym_list:
        
        try:
            response_data = get_trades_all(sym, date, 50000)
            resp_df = data_preprocessing(response_data)

            open_data_extract = extract_opening_data(resp_df)
            close_data_extract = extract_closing_data(resp_df)

            premarket_aggregate = aggregate_premarket_data(resp_df, open_data_extract['open_time'])
            postmarket_aggregate = aggregate_postmarket_data(resp_df, close_data_extract['close_time'])

            intraday_aggregate = aggregate_intraday_data(resp_df, open_data_extract['open_time'], close_data_extract['close_time'])

            merged_data = {**open_data_extract, **close_data_extract, **premarket_aggregate, **intraday_aggregate, **postmarket_aggregate, **{"symbol":sym}}

            final_result.append(merged_data)

            time.sleep(0.1)
        
        except:
            print(sym, date)
            print ("error")
    
    return pd.DataFrame(final_result)


In [16]:
#complete_result = batch_symbols_processing(main_df, 4, 2, '2021-02-17')

In [17]:
#complete_result.head()

In [18]:
#complete_result.to_csv("data_pv4_adv2.csv")

In [19]:
def create_folder(p, ad):
    print("progress")
    curr_folder = os.getcwd()
    currtime = datetime.datetime.now().strftime("%d-%m-%Y_t-%H.%M")
    new_folder = "new__"+currtime+'_pv_adv_'+str(p)+'_'+str(ad)
    path = os.path.join(curr_folder, new_folder)
    mode = 0o777
    os.mkdir(path, mode)
    
    print("Directory '% s' created" % new_folder)

    return new_folder
    

# AUTOMATE DATES AND CODE

In [20]:
valid_days = get_valid_trading_days_for_range('2020-04-01','2020-09-10', 70)

In [21]:
symbols_df = pd.read_csv('stocks-profile.csv')
pv_lv = 4
adv_lv = 2

In [23]:
#print("progress")
#curr_folder = os.getcwd()
#currtime = datetime.datetime.now().strftime("%d-%m-%Y_t-%H.%M")
#new_folder = currtime+'_pv_adv_'+str(pv_lv)+'_'+str(adv_lv)
#path = os.path.join(curr_folder, new_folder)
#mode = 0o777
#os.mkdir(path, mode)
#print("Directory '% s' created" % new_folder)


# for k in tqdm(range(4)):
k=4
new_folder = create_folder(k, adv_lv)
for i in tqdm(range(len(valid_days))):

    day = valid_days[i]
    complete_result = batch_symbols_processing(symbols_df, k, adv_lv, day)
    complete_result.to_csv(str(new_folder)+'/trades-'+
                           str(day) + 
                           '.csv')

    
    

progress
Directory 'new__15-04-2021_t-05.20_pv_adv_4_2' created


HBox(children=(IntProgress(value=0, max=69), HTML(value='')))

error last timestamp - ABNB
ABNB 2020-06-04
error
error last timestamp - SNOW
SNOW 2020-06-04
error
error last timestamp - DASH
DASH 2020-06-04
error
error last timestamp - ABNB
ABNB 2020-06-05
error
error last timestamp - SNOW
SNOW 2020-06-05
error
error last timestamp - DASH
DASH 2020-06-05
error
error last timestamp - ABNB
ABNB 2020-06-08
error
error last timestamp - SNOW
SNOW 2020-06-08
error
error last timestamp - DASH
DASH 2020-06-08
error
error last timestamp - ABNB
ABNB 2020-06-09
error
error last timestamp - SNOW
SNOW 2020-06-09
error
error last timestamp - DASH
DASH 2020-06-09
error
error last timestamp - ABNB
ABNB 2020-06-10
error
error last timestamp - SNOW
SNOW 2020-06-10
error
error last timestamp - DASH
DASH 2020-06-10
error
error last timestamp - ABNB
ABNB 2020-06-11
error
error last timestamp - SNOW
SNOW 2020-06-11
error
error last timestamp - DASH
DASH 2020-06-11
error
error last timestamp - ABNB
ABNB 2020-06-12
error
error last timestamp - SNOW
SNOW 2020-06-12
error


error last timestamp - DASH
DASH 2020-08-20
error
error last timestamp - ABNB
ABNB 2020-08-21
error
error last timestamp - SNOW
SNOW 2020-08-21
error
error last timestamp - DASH
DASH 2020-08-21
error
error last timestamp - ABNB
ABNB 2020-08-24
error
error last timestamp - SNOW
SNOW 2020-08-24
error
error last timestamp - DASH
DASH 2020-08-24
error
error last timestamp - ABNB
ABNB 2020-08-25
error
error last timestamp - SNOW
SNOW 2020-08-25
error
error last timestamp - DASH
DASH 2020-08-25
error
error last timestamp - ABNB
ABNB 2020-08-26
error
error last timestamp - SNOW
SNOW 2020-08-26
error
error last timestamp - DASH
DASH 2020-08-26
error
error last timestamp - ABNB
ABNB 2020-08-27
error
error last timestamp - SNOW
SNOW 2020-08-27
error
error last timestamp - DASH
DASH 2020-08-27
error
error last timestamp - ABNB
ABNB 2020-08-28
error
error last timestamp - SNOW
SNOW 2020-08-28
error
error last timestamp - DASH
DASH 2020-08-28
error
error last timestamp - ABNB
ABNB 2020-08-31
error
