In [1]:
# Change the working directory to the package root so that the relative script
# path used by the later cells ('notebooks/etl/quandl/extraction.py') resolves.
# NOTE(review): this is a hardcoded absolute path tied to this Jupyter host;
# adjust it if the repository is checked out elsewhere.
%cd /home/jupyter/quant

/home/jupyter/quant


### This script is used to update the data for the most recent year

#### Set the required parameters of `extraction.py`:

A reference for the required parameters can be found in the `__init__.py` files in the package directory under `yitian`.

| parameter     | example          |  description                             |
|---------------|------------------|------------------------------------------|
| year          | 2020             | the target year for data extraction      |
| db_name       | 'NASDAQOMX'      | the database code from Quandl            |
| ds_name       | 'XQC'            | the dataset code from Quandl             |
| output_dw_dir | 'commodity/opec' | the sub-dir in data warehouse for output |
| rm_exist_obj  | True             | remove objects in output_dw_dir          |

In [2]:
from datetime import datetime

from yitian.datasource import DATA_WAREHOUSE_LOC, file_utils
from yitian.datasource.quandl import nasdaq, opec, fed

#### The following settings update the NASDAQ OMX index for 2020

In [3]:
# Print the current timestamp so the saved output records when this cell was
# most recently executed.
print(datetime.now())

# Parameters consumed by extraction.py (described in the parameter table near
# the top of this notebook). The variable names are the script's interface and
# must not be renamed.
year = 2020  # target year for data extraction
db_name = nasdaq.NASDAQ_DATABASE_CODE  # 'NASDAQOMX' in the recorded output
ds_name = nasdaq.DATASET_CODE_MAP['settlement_value']  # 'XQC' in the recorded output
output_dw_dir = 'equity/nasdaq/settlement_value'  # sub-dir in the data warehouse for output
rm_exist_obj = True  # remove existing objects in output_dw_dir before writing

# `-i` runs the script inside this notebook's namespace, which is how it sees
# the variables assigned above.
%run -i 'notebooks/etl/quandl/extraction.py'

2020-03-15 02:30:47.537236
The start date is set to 2020-01-01 & The end date is set to 2020-12-31
('id', 41168791)
('dataset_code', 'XQC')
('database_code', 'NASDAQOMX')
('name', 'NASDAQ-100 PM Settlement Value (XQC)')
('description', 'For detailed information, see <a href=https://indexes.nasdaqomx.com/Index/Overview/XQC>https://indexes.nasdaqomx.com/Index/Overview/XQC</a>')
('refreshed_at', '2020-03-14T05:08:00.380Z')
('newest_available_date', '2020-03-13')
('oldest_available_date', '2016-12-19')
('column_names', ['Trade Date', 'Index Value', 'High', 'Low', 'Total Market Value', 'Dividend Market Value'])
('frequency', 'daily')
('type', 'Time Series')
('premium', False)
('database_id', 10471)


Removing all objects in a sub-directory


Command '['gsutil', 'rm', 'gs://zhongyuan-dw/equity/nasdaq/settlement_value/2020/*']' returned non-zero exit status 1.
2020-01-01_2020-03-13_daily.csv has been overwrite to gs://zhongyuan-dw/equity/nasdaq/settlement_value/2020/2020-01-01_2020-03-13_daily.csv


#### The following settings update the OPEC oil price for 2020

In [4]:
# Print the current timestamp so the saved output records when this cell was
# most recently executed.
print(datetime.now())

# Parameters consumed by extraction.py (described in the parameter table near
# the top of this notebook). The variable names are the script's interface and
# must not be renamed.
year = 2020  # target year for data extraction
db_name = opec.OPEC_DATABASE_CODE  # 'OPEC' in the recorded output
ds_name = opec.OPEC_DATASET_CODE  # 'ORB' in the recorded output
output_dw_dir = 'commodity/opec'  # sub-dir in the data warehouse for output
rm_exist_obj = True  # remove existing objects in output_dw_dir before writing

# `-i` runs the script inside this notebook's namespace, which is how it sees
# the variables assigned above.
%run -i 'notebooks/etl/quandl/extraction.py'

2020-03-15 02:30:56.160143
The start date is set to 2020-01-01 & The end date is set to 2020-12-31
('id', 2288715)
('dataset_code', 'ORB')
('database_code', 'OPEC')
('name', 'OPEC Crude Oil Price')
('description', 'Reference Price for the OPEC Crude Oil Basket.  Currently includes: Saharan Blend (Algeria), Girassol (Angola), Oriente (Ecuador), Iran Heavy (Islamic Republic of Iran), Basra Light (Iraq), Kuwait Export (Kuwait), Es Sider (Libya), Bonny Light (Nigeria), Qatar Marine (Qatar), Arab Light (Saudi Arabia), Murban (UAE) and Merey (Venezuela).')
('refreshed_at', '2020-03-13T12:32:05.707Z')
('newest_available_date', '2020-03-12')
('oldest_available_date', '2003-01-02')
('column_names', ['Date', 'Value'])
('frequency', 'daily')
('type', 'Time Series')
('premium', False)
('database_id', 381)


Removing all objects in a sub-directory


2020-01-01_2020-03-12_daily.csv has been overwrite to gs://zhongyuan-dw/commodity/opec/2020/2020-01-01_2020-03-12_daily.csv


#### The following settings update the US Fed rates for 2020

In [5]:
# Print the current timestamp so the saved output records when this cell was
# most recently executed.
print(datetime.now())

# Parameters shared by every FED dataset extracted below; the variable names
# are the interface to extraction.py and must not be renamed.
year = 2020  # target year for data extraction
db_name = fed.FED_DATABASE_CODE  # 'FED' in the recorded output
rm_exist_obj = True  # remove existing objects in output_dw_dir before writing

# Run the extraction once per entry of fed.DATASET_CODE_MAP: the map key names
# the output sub-directory and the map value is the dataset code.
# NOTE(review): the recorded output ends with a ValueError for dataset 'TIPSY'
# (newest available date 2019-10-25, requested year 2020). Confirm whether that
# dataset should be skipped or extracted with an earlier `year`.
for name, ds in fed.DATASET_CODE_MAP.items():
    ds_name = ds
    output_dw_dir = 'interest_rate/fed/{name}'.format(name=name)
    
    # `-i` runs the script inside this notebook's namespace, which is how it
    # sees the variables assigned above.
    %run -i 'notebooks/etl/quandl/extraction.py'

2020-03-15 02:30:59.208877
The start date is set to 2020-01-01 & The end date is set to 2020-12-31
('id', 14468012)
('dataset_code', 'SVENPY')
('database_code', 'FED')
('name', 'US Treasury Par Yield Curve')
('description', 'These yield curves are an off-the-run Treasury yield curve based on a large set of outstanding Treasury notes and bonds, and are based on a coupon-equivalent compounding convention. Values are daily estimates of the yield curve from 1961 for the entire maturity range spanned by outstanding Treasury securities.<br><br>More detailed information is available at <a href=http://www.federalreserve.gov/pubs/feds/2006/200628/200628pap.pdf>http://www.federalreserve.gov/pubs/feds/2006/200628/200628pap.pdf</a>')
('refreshed_at', '2020-03-13T04:06:13.015Z')
('newest_available_date', '2020-03-06')
('oldest_available_date', '1961-06-14')
('column_names', ['Date', 'SVENPY01', 'SVENPY02', 'SVENPY03', 'SVENPY04', 'SVENPY05', 'SVENPY06', 'SVENPY07', 'SVENPY08', 'SVENPY09', 'SVENPY10

Removing all objects in a sub-directory


2020-01-01_2020-03-06_daily.csv has been overwrite to gs://zhongyuan-dw/interest_rate/fed/us_trsy_par_yc/2020/2020-01-01_2020-03-06_daily.csv
The start date is set to 2020-01-01 & The end date is set to 2020-12-31
('id', 14468014)
('dataset_code', 'SVENY')
('database_code', 'FED')
('name', 'US Treasury Zero-Coupon Yield Curve')
('description', 'These yield curves are an off-the-run Treasury yield curve based on a large set of outstanding Treasury notes and bonds, and are based on a continuous compounding convention. Values are daily estimates of the yield curve from 1961 for the entire maturity range spanned by outstanding Treasury securities.<br><br>More detailed information is available at <a href=http://www.federalreserve.gov/pubs/feds/2006/200628/200628pap.pdf>http://www.federalreserve.gov/pubs/feds/2006/200628/200628pap.pdf</a>')
('refreshed_at', '2020-03-13T04:06:12.992Z')
('newest_available_date', '2020-03-06')
('oldest_available_date', '1961-06-14')
('column_names', ['Date', 'S

Removing all objects in a sub-directory


2020-01-01_2020-03-06_daily.csv has been overwrite to gs://zhongyuan-dw/interest_rate/fed/us_trsy_zero_coupon_yc/2020/2020-01-01_2020-03-06_daily.csv
The start date is set to 2020-01-01 & The end date is set to 2020-12-31
('id', 23761126)
('dataset_code', 'TIPSY')
('database_code', 'FED')
('name', 'TIPS Yield Curve and Inflation Compensation')
('description', 'Federal Reserve yield curve of index-linked debt. Daily frequency. More detailed information available at http://www.federalreserve.gov/pubs/feds/2008/200805/200805abs.html')
('refreshed_at', '2020-03-13T04:10:22.903Z')
('newest_available_date', '2019-10-25')
('oldest_available_date', '1999-01-04')
('column_names', ['Date', 'TIPSY02', 'TIPSY03', 'TIPSY04', 'TIPSY05', 'TIPSY06', 'TIPSY07', 'TIPSY08', 'TIPSY09', 'TIPSY10', 'TIPSY11', 'TIPSY12', 'TIPSY13', 'TIPSY14', 'TIPSY15', 'TIPSY16', 'TIPSY17', 'TIPSY18', 'TIPSY19', 'TIPSY20', 'TIPSPY02', 'TIPSPY03', 'TIPSPY04', 'TIPSPY05', 'TIPSPY06', 'TIPSPY07', 'TIPSPY08', 'TIPSPY09', 'TIPSP

ValueError: User defined end date 2020-12-31 needs to be smaller than the newest available date 2019-10-25