In [215]:
%reload_ext autoreload

# imports
import boto3
import pandas as pd
import numpy as np
import json, time, random
import os.path, urllib
from io import StringIO
from urllib.request import urlopen
from pandas.io.json import json_normalize
from datetime import datetime, date

In [216]:
# config and mappings
# NOTE(review): load_config is defined in a later cell of this notebook, so on a
# fresh kernel that cell must be executed before this one -- consider moving the
# helper definitions above this cell so "Restart & Run All" works.
conf_file = 'config.json'
config = load_config(conf_file)

# Settings pulled out of the loaded config dict.
# UNIVERSE: value of config['symbols'] (iterated as tickers by the driver cells).
UNIVERSE = config['symbols']
# BUCKET_NAME: S3 bucket name; also used as the local root directory name.
BUCKET_NAME = config["bucket_name"]
# MIN_MAX_SLEEP: indexed as [0] and [1] by url_open for the random sleep bounds.
MIN_MAX_SLEEP = config["min_max_sleep"]
# MAX_SYMBOLS: batch size used by get_quotes when slicing the symbol list.
MAX_SYMBOLS = config["max_symbols_request"]
# S3_STORE: when truthy, save_to_file uploads to S3 instead of writing locally.
S3_STORE = config['s3_store']
# fname: file-name template; callers fill it with .format(<symbol or date>).
fname = config['filename_fmt']

# S3 resource created for 'us-west-2' and a handle on the target bucket.
s3 = boto3.resource('s3', 'us-west-2')
bucket = s3.Bucket(BUCKET_NAME)

# Key names used inside query_map entries.
url_key = 'url'
enc_key = 'enc_key'
enc_val = 'enc_val'
# NOTE(review): storage_path is not referenced by any cell visible here.
storage_path = 'store_path'

# Per-dataset request description:
#   url_key -> URL template filled with str.format ({0}, and {1} where present)
#   enc_key -> name of the query parameter that gets url-encoded
#   enc_val -> fixed value for that parameter (only defined for 'summary')
query_map = {
    'summary': {
        # {0} = symbol, {1} = url-encoded 'modules=...' pair
        url_key:'https://query1.finance.yahoo.com/v10/finance/quoteSummary/{0}?formatted=true&lang=en-US&region=US&{1}&corsDomain=finance.yahoo.com',
        enc_key: 'modules',
        enc_val:'defaultKeyStatistics,assetProfile,financialData,balanceSheetHistory,balanceSheetHistoryQuarterly,cashflowStatementHistory,cashflowStatementHistoryQuarterly,incomeStatementHistory,incomeStatementHistoryQuarterly,calendarEvents,earnings,earningsHistory,earningsTrend,recommendationTrend,upgradeDowngradeHistory,indexTrend,fundOwnership,insiderHolders,institutionOwnership,majorDirectHolders,majorHoldersBreakdown,netSharePurchaseActivity'
    },
    'option': {
        # {0} = symbol, {1} = url-encoded 'date=...' pair
        url_key:'https://query1.finance.yahoo.com/v7/finance/options/{0}?formatted=true&lang=en-US&region=US&straddle=false&{1}&corsDomain=finance.yahoo.com',
        enc_key: 'date'
    },
    'quote':{
        # only {0} is present: get_quotes passes the url-encoded 'symbols=...' pair here
        url_key:'https://query1.finance.yahoo.com/v7/finance/quote?formatted=true&lang=en-US&region=US&{0}&corsDomain=finance.yahoo.com',
        enc_key: 'symbols'
    }
}

Loading config.json


In [217]:
S3_STORE

True

In [223]:
# data gathering and storage functions
def comma_join(x, y):
    """Return ``x`` followed by a comma and then ``y``."""
    return x + ',' + y


def getChildrenList(tempResult, parent):
    """Return the ``'result'`` entry nested under ``tempResult[parent]``."""
    node = tempResult[parent]
    return node['result']

def save_config(config, fname):
    """Write ``config`` to ``fname`` as JSON indented by one space.

    The config is serialised *before* the file is opened for writing, so a
    config that cannot be serialised raises without truncating an existing
    file at ``fname``.

    Args:
        config: JSON-serialisable object (the notebook passes a dict).
        fname: path of the file to (over)write.

    Raises:
        TypeError / ValueError: propagated from ``json.dumps``.
        OSError: if the file cannot be opened or written.
    """
    data = json.dumps(config, indent=1)
    # The context manager closes the handle; no explicit close() is needed.
    with open(fname, 'w') as file:
        file.write(data)
    print('Saving', fname)

def load_config(fname):
    """Read the file at ``fname``, announce it, and return the parsed JSON."""
    with open(fname, 'r') as handle:
        raw_text = handle.read()
    print('Loading', fname)
    parsed = json.loads(raw_text)
    return parsed

def run_sleeper(min_s, max_s):
    """Pause for a random whole number of seconds in ``[min_s, max_s]``."""
    time.sleep(random.randint(min_s, max_s))
    
def url_open(url):
    """Sleep for a random interval, then fetch ``url`` and return the raw body.

    The sleep bounds are ``MIN_MAX_SLEEP[0]`` and ``MIN_MAX_SLEEP[1]``. The
    response is read inside a ``with`` block so the connection is closed even
    when ``read()`` raises.

    Args:
        url: fully formatted URL to request.

    Returns:
        Whatever ``read()`` on the opened response returns (bytes for HTTP).

    Raises:
        urllib.error.URLError / HTTPError: propagated from ``urlopen``.
    """
    run_sleeper(MIN_MAX_SLEEP[0], MIN_MAX_SLEEP[1])
    with urlopen(url) as usock:
        return usock.read()

def get_data(symbol, dataset, encoded_params):
    """Fill the URL template registered for ``dataset`` and download it.

    ``symbol`` goes into ``{0}`` and ``encoded_params`` into ``{1}`` of the
    template stored under ``query_map[dataset][url_key]``; the response body
    from ``url_open`` is returned unchanged.
    """
    template = query_map[dataset][url_key]
    request_url = template.format(symbol, encoded_params)
    return url_open(request_url)

def save_to_file(data, path, fname):
    """Persist ``data`` under ``path + fname`` in S3 or on the local disk.

    When ``S3_STORE`` is truthy the payload is uploaded to ``bucket`` with key
    ``path + fname``. Otherwise it is written to
    ``./<BUCKET_NAME>/<path><fname>``, creating any missing parent directories
    (including nested ones such as ``summary/2018-09-11/``).

    Args:
        data: ``str`` or ``bytes`` payload; bytes are written in binary mode.
        path: key prefix / relative directory, expected to end with ``/``.
        fname: file name appended to ``path``.

    Raises:
        OSError: if the local directory or file cannot be created or written.
    """
    if S3_STORE:
        file_key = path + fname
        print(file_key)
        bucket.put_object(Body=data, Key=file_key)
        return

    write_method = 'wb' if isinstance(data, bytes) else 'w'
    target = './' + BUCKET_NAME + '/' + path + fname
    # makedirs creates every missing level; exist_ok avoids failing on re-runs.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    # The context manager closes the file even if write() raises.
    with open(target, write_method) as file:
        print('Saving', fname)
        file.write(data)

def get_grouped_ds(symbol, dataset):
    """Download the grouped modules for ``symbol`` and store them as JSON.

    The query parameter name and value come from ``query_map[dataset]``. The
    ``'result'`` entry under ``'quoteSummary'`` in the response is re-serialised
    and saved under the dataset's storage path, which is formatted with today's
    date, using ``fname`` formatted with ``symbol`` as the file name.
    """
    # bulk download of all description modules
    # can be streamlined to avoid requesting information that does not change often
    spec = query_map[dataset]
    param_name, param_value = spec[enc_key], spec[enc_val]
    query = urllib.parse.urlencode({param_name: param_value})
    raw = get_data(symbol, dataset, query)
    summary = getChildrenList(json.loads(raw), 'quoteSummary')
    serialized = json.dumps(summary)
    dated_path = get_storage_path(dataset).format(str(date.today()))
    save_to_file(serialized, dated_path, fname.format(symbol))

def get_cs_tickers(ticker_list):
    """Join the tickers in ``ticker_list`` into one comma-separated string.

    Args:
        ticker_list: iterable of ticker strings.

    Returns:
        The tickers joined with ``','``; an empty input yields ``''`` instead
        of raising ``IndexError``.
    """
    return ','.join(ticker_list)
    
def get_quotes(symbol_list):
    """Download quotes for ``symbol_list`` in batches and save them as one file.

    Symbols are requested ``MAX_SYMBOLS`` at a time. The ``'result'`` lists
    under ``'quoteResponse'`` in each response are concatenated, serialised to
    JSON and saved under the 'quote' storage path, with ``fname`` formatted
    with today's date as the file name.

    Batches are produced with ``range(0, len, MAX_SYMBOLS)``, so no empty
    trailing batch is requested when the list length is an exact multiple of
    ``MAX_SYMBOLS``. An empty ``symbol_list`` returns without requesting or
    writing anything.

    Args:
        symbol_list: sliceable sequence of ticker strings.
    """
    if not symbol_list:
        # Nothing to fetch; avoid writing an empty file for today.
        return

    dataset = 'quote'
    param_name = query_map[dataset][enc_key]
    full_data = []
    for start in range(0, len(symbol_list), MAX_SYMBOLS):
        subset = symbol_list[start:start + MAX_SYMBOLS]
        symbols = get_cs_tickers(subset)
        encoded_kv = urllib.parse.urlencode({param_name: symbols})
        # The 'quote' URL template only has {0}, so the encoded pair is passed
        # in the symbol slot and the second argument is unused.
        data = get_data(encoded_kv, dataset, '')
        full_data.extend(getChildrenList(json.loads(data), 'quoteResponse'))
    data = json.dumps(full_data)
    save_to_file(data, get_storage_path(dataset), fname.format(str(date.today())))
    
def get_options(symbol):
    """Download every option expiration for ``symbol`` and save them together.

    A first request with ``date=0`` is used only to read the
    ``'expirationDates'`` list of the first ``'optionChain'`` result. One
    request is then issued per expiration date, and the ``'result'`` lists of
    those responses are concatenated and saved as a single JSON file under the
    'option' storage path formatted with today's date.
    """
    dataset = 'option'
    print('Getting options expirations for', symbol)
    date_param = query_map[dataset][enc_key]

    # Probe request: only its list of expiration dates is used.
    probe_query = urllib.parse.urlencode({date_param: 0})
    probe = json.loads(get_data(symbol, dataset, probe_query))
    first_chain = probe['optionChain']['result'][0]
    expirations = first_chain['expirationDates']

    # Captured before the per-expiration downloads start.
    run_date = str(date.today())

    chains = []
    for expiry in expirations:
        expiry_query = urllib.parse.urlencode({query_map[dataset][enc_key]: expiry})
        payload = json.loads(get_data(symbol, dataset, expiry_query))
        chains.extend(getChildrenList(payload, 'optionChain'))

    serialized = json.dumps(chains)
    path_template = get_storage_path(dataset)
    save_to_file(serialized, path_template.format(run_date), fname.format(symbol))

def get_storage_path(dataset):
    """Look up the storage path configured for ``dataset``.

    The config key is the dataset name with ``'_path'`` appended.
    """
    config_key = dataset + '_path'
    return config[config_key]

In [224]:
# Single-symbol run: time the 'summary' download for FB; the quote and option
# variants below are left disabled.
%time get_grouped_ds('FB', 'summary')
# %time get_quotes(['FB'])
# %time get_options('FB')

summary/2018-09-11/FB.json
CPU times: user 46.5 ms, sys: 11.8 ms, total: 58.3 ms
Wall time: 4.3 s


In [65]:
# %time for t in UNIVERSE: get_grouped_ds(t, 'summary')
# %time get_quotes(UNIVERSE)
# %time for t in UNIVERSE: get_options(t)

### S3 storage scripts

In [218]:
def update_s3():
    """Upload local ``.json`` files that are not yet present in the S3 bucket.

    Walks ``./<BUCKET_NAME>/`` recursively, collects every file whose name ends
    with ``.json`` as a key relative to that directory (``/``-separated),
    compares those keys with the keys of all objects in ``bucket`` and uploads
    the missing ones.

    The walk is done in Python rather than through a ``find | grep`` shell
    pipeline, so it does not depend on a Unix shell, and the suffix test avoids
    the unescaped ``.`` of the ``grep .json`` pattern. Each file is opened in a
    ``with`` block so its handle is closed after the upload.
    """
    rootpath = './' + BUCKET_NAME + '/'

    local_files = []
    for dirpath, _dirnames, filenames in os.walk(rootpath):
        for name in filenames:
            if name.endswith('.json'):
                rel_path = os.path.relpath(os.path.join(dirpath, name), rootpath)
                # S3 keys always use forward slashes.
                local_files.append(rel_path.replace(os.sep, '/'))

    s3_objs = {obj.key for obj in bucket.objects.all()}
    missing_s3 = set(local_files) - s3_objs
    print('Missing in S3 {0}, in S3 {1}'.format(len(missing_s3), len(local_files) - len(missing_s3)))

    # Sorted only to make the upload order (and the log) deterministic.
    for file_key in sorted(missing_s3):
        print('Putting', file_key)
        with open(rootpath + file_key, 'rb') as data:
            bucket.put_object(Body=data, Key=file_key)

In [219]:
# update_s3()

Missing in S3 0, in S3 793
