In [1]:
import pandas as pd
from scipy import stats
from random import sample 
from tqdm import tqdm
from pathlib import Path
import pickle
import schedule
import time
from datetime import datetime

from alpha_vantage.timeseries import TimeSeries

### Load the api_key

In [2]:
def load_key():
    f = open("alpha_vantage_keyapi.txt","r")
    return f.read()
    

def log(msg):
    """
    """
    f = open("crawler.log","a+")
    return f.writelines("{0:%d-%m-%y %H:%M}. {1} \n".format(datetime.now(), msg))
    


In [3]:

def parse_symbol(s):
    """
    """
    symbol = str(s.Symbol).split("/")[0].strip()
    symbol = symbol.replace("^", ".")
    symbol += "_{0}".format(s.Market)
    
    return symbol


def load_symbols(market):
    """
    """
    
    # load the csv
    df = pd.read_csv("watchlist-symbols.csv",sep=";")
    
    # filter by market
    if market != "all":
        
        # checking if the market in present in the watching list
        if market not in df["Market"].unique():
            raise Exception('the specified market is not in the watchlist')
        
        df = df[df["Market"]==market]
    
    # drop non
    df.dropna(inplace=True)
    
    # adapt the symbols for the api
    lst_of_symbols = df.apply(lambda x: parse_symbol(x), axis=1).tolist()
    
    # take a sample for test
    # lst_of_symbols = sample(lst_of_symbols,4)
    
    # exclude etfs
    lst_tfs = ["EXSH"]
    lst_of_symbols = [x for x in lst_of_symbols if x not in lst_tfs]
    
    return lst_of_symbols


# for test
# lst_of_symbols = load_symbols("MIL")
# print(len(lst_of_symbols), lst_of_symbols)

In [4]:

def save_data(ts_data):
    """
    """
    
    with open('ts_data.pickle', 'wb') as handle:
        pickle.dump(ts_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return ts_data

def load_data():
    """
    """
    
    if Path("ts_data.pickle").is_file():
        with open('ts_data.pickle', 'rb') as handle:
            ts_data = pickle.load(handle)
    else:
        ts_data = save_data({})
    return ts_data
        

def query_api(symbol):
    """
    """
    market = symbol.split("_")[1]
    
    if market in ["NSY", "NDQ"]:
        # only ex: IBM
        symbol = symbol.split("_")[0]
    else: 
        symbol = symbol.replace("_", ".")
    
    data, meta_data = ts.get_daily_adjusted(symbol=symbol, outputsize="compact")
    return data, meta_data


def craw_data(market="all"):
    """
    """
    
    lst_of_symbols = load_symbols(market)
    for symbol in tqdm(lst_of_symbols):
        try:
            # wait
            time.sleep(15)

            # load the dict
            ts_data = load_data()

            # update the data
            ts_data.update({symbol: query_api(symbol)})        
            
            # save the new data
            save_data(ts_data)
            
            # log
            log("successfully crawled: {0}".format(symbol))
        except Exception as e: 
            print(symbol, e)
            log("error: {0} {1}".format(symbol, e))



In [5]:
# setup
api_key = load_key()
ts = TimeSeries(key=api_key, output_format='pandas', indexing_type='integer')    

### Run the Crawler

In [6]:
# callling the crawler manually 
runManual = True
if runManual:
    craw_data()

 34%|███▎      | 32/95 [08:32<16:38, 15.85s/it]

BAYN_XET Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


 60%|██████    | 57/95 [15:07<09:59, 15.79s/it]

HEIA_EAM Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


 78%|███████▊  | 74/95 [19:36<05:32, 15.84s/it]

RDS.A_NSY Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


 79%|███████▉  | 75/95 [19:52<05:19, 15.97s/it]

RR._LSE Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


 80%|████████  | 76/95 [20:08<05:03, 15.99s/it]

RBS_LSE Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


 91%|█████████ | 86/95 [22:47<02:22, 15.85s/it]

ULVR_LSE Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


 94%|█████████▎| 89/95 [23:35<01:36, 16.02s/it]

VOW3_XET Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


 99%|█████████▉| 94/95 [24:54<00:15, 15.96s/it]

EZJ_LSE Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.


100%|██████████| 95/95 [25:10<00:00, 15.90s/it]

EXSH_XET Invalid API call. Please retry or visit the documentation (https://www.alphavantage.co/documentation/) for TIME_SERIES_DAILY_ADJUSTED.





In [7]:
def job():
    # Digits 0-6 represent the consecutive days of the week, starting from Monday.
    weekdayno = datetime.today().weekday()
    if weekno<5:
        log("start crawler job.")
        craw_data()
    
    
schedule.every().day.at("10:30").do(job)
schedule.every().day.at("12:30").do(job)
schedule.every().day.at("16:30").do(job)
schedule.every().day.at("20:30").do(job)

runSchedule = False
while runSchedule:
    schedule.run_pending()
    time.sleep(1)


In [8]:
load_data().keys()

dict_keys(['HER_MIL', 'SRG_MIL', 'FB_NDQ', 'DIS_NSY', 'QCOM_NDQ', 'MMM_NSY', 'EXPE_NDQ', 'XOM_NSY', 'BA_NSY', 'WFC_NSY', 'UCG_MIL', 'TIT_MIL', 'SPG_NSY', 'STM_MIL', 'RVLV_NSY', 'JNJ_NSY', 'ISP_MIL', 'IG_MIL', 'G_MIL', 'GRMN_NDQ', 'ENEL_MIL', 'DOV_MIL', 'BRK.B_NSY', 'BK_NSY', 'ATL_MIL', 'AAPL_NDQ', 'GOOGL_NDQ', 'ADBE_NDQ', 'T_NSY', 'A2A_MIL', 'BABA_NSY', 'AMRS_NDQ', 'ADS_NSY', 'BUD_NSY', 'AZM_MIL', 'BYND_NDQ', 'BKNG_NDQ', 'BPY_NDQ', 'SAM_NSY', 'BPYU_NDQ', 'CCL_NSY', 'CVX_NSY', 'KO_NSY', 'DOYU_NDQ', 'ENI_MIL', 'EQM_NSY', 'AIGE_MIL', 'ENB_NSY', 'EPD_NSY', 'ETH_MIL', 'FILA_MIL', 'FCA_MIL', 'GPS_NSY', 'HPQ_NSY', 'HUYA_NSY', 'HASI_NSY', 'HPE_NSY', 'HRL_NSY', 'IRC_MIL', 'IUKD_MIL', 'INTC_NDQ', 'MSFT_NDQ', 'NEXI_MIL', 'NNN_NSY', 'OXY_NSY', 'PIRC_MIL', 'PVH_NSY', 'PYPL_NDQ', 'PBR_NSY', 'PIA_MIL', 'REY_MIL', 'SNE_NSY', 'SRNE_NDQ', 'TSLA_NDQ', 'TM_NSY', 'TRIP_NDQ', 'UAA_NSY', 'UNH_NSY', 'ESPO_MIL', 'WM_NSY', 'WRK_NSY'])