In [76]:
variables = [
     #'GDPC1', 'GDPCTPI', 'PCEC', 'FPI', 'PRFI', 'PNFI', 'PCND', 'PCESV', 'PCDG', 'PCNDGC96', 'PCESVC96', 'PCDGCC96', 'COMPNFB', 'PRS85006023', 'BOGZ1FL144104005Q', 'HMLBSHNO', 'A007RD3Q086SBEA', 'A006RD3Q086SBEA', 'GPDIC1', 'GPDI', 'HOANBS', 'NETEXP', 'NETEXC', 'TOTLQ', 'PNFIC1', 'PRFIC1', 'IPDNBS', 'DNDGRD3Q086SBEA', 'DSERRD3Q086SBEA', 'DDURRD3Q086SBEA', 'GCEC1', 'COMPRNFB',   # quarterly
     #'CE16OV', 'CNP16OV', 'AWHNONAG', 'UNRATE', 'CPIAUCSL', 'PCEC96', 'PCE', 'PCENDC96', 'PCEDG', 'PCEDGC96', 'PCES', 'PCESC96', 'CLF16OV', 'BAA', 'TB3MS', 'PAYEMS', 'USCONS', 'AWHMAN', 'AWHAECON', 'CES2000000008', 'CPILFESL', 'EMRATIO', 'CIVPART', 'JTSJOL',  # monthly
     #'DFF', 'DBAA', 'DGS10', 'DTB3', 'WILL5000IND', # daily
]
api = '373b8581900f3b2c94da355762d31d7f',
start_date = '1960-01-01',
end_date = '2020-05-12',

In [77]:
import requests, json, warnings, pathlib, time
import pandas as pd
from tqdm import tqdm_notebook
from datetime import datetime

warnings.simplefilter('always')

In [78]:
params = {
    'api_key': api,
    'file_type': 'json',
    'observation_start': start_date, # date of interest
    'realtime_start': start_date, # start of a period (publication date)
    'realtime_end': end_date, # end of a period (one day before the next publication date)
    }

description_keys = [
    'id', 
    'title', 
    'frequency', 
    'frequency_short', 
    'units',
    'units_short',
    'seasonal_adjustment',
    'seasonal_adjustment_short',
    'notes',
    'observation_start',
    'observation_end',
]

timeout = 10

In [79]:
def download_page(url, params):
    '''Download page from ALFRED and check whether the download succeeds'''
    page = requests.get(url, params=params, timeout=timeout)
    if page.status_code == 429:
        time.sleep(20)
        page = requests.get(url, params=params, timeout=timeout)
    assert page.status_code == 200, f"No {params['series_id']} from {url}, {page.status_code} error"
    return page

In [80]:
def float_or_nan(x):
    '''Convert a string to either a float number or NaN'''
    try:
        return float(x)
    except:
        return float('nan')

### Retrieve data descriptions

In [81]:
descriptions = []

for variable in variables:
    
    params.update({'series_id': variable})
    description = dict()

    # retrieve the basic description (if exists; if multiple descriptions, retreive the last only)
    page = download_page(url='https://api.stlouisfed.org/fred/series', params=params)
    page = page.json()['seriess'][-1]
    description = {key: page.get(key, '') for key in description_keys}

    # retreive the release (if exists)
    page = download_page(url='https://api.stlouisfed.org/fred/series/release', params=params)
    page = page.json()['releases'][-1]
    release_id = page['id']
    description['release'] = page.get('name', '')
    description['release_url'] = page.get('link', '')
    
    # retreive the source (if exists)
    params_source = params
    page = requests.get(
        url='https://api.stlouisfed.org/fred/release/sources', 
        params={'api_key': api, 'file_type': 'json', 'release_id': release_id,}
        )
    page = page.json()['sources'][-1]
    description['source'] = page.get('name', '')
    description['source_url'] = page.get('link', '')

    # for variables updated in daily frequency, download the last vintage only
    if description['frequency_short'] == 'D':
        params.update({'realtime_start': end_date})
    else:
        params.update({'realtime_start': start_date})

    # retreive the vintage dates (if available)
    page = download_page(url='https://api.stlouisfed.org/fred/series/vintagedates', params=params)
    vintage_dates = page.json()['vintage_dates']
    description['numberof_vintage_dates'] = len(vintage_dates)
    description['is_revised'] = len(vintage_dates) > 1
    description['vintage_dates'] = vintage_dates
    
    descriptions.append(description)

### Save data descriptions to disk

In [82]:
# with open('raw_variables_descriptions.txt', 'w') as json_file:
#     json.dump(descriptions, json_file, indent=4)

try: 

    pd_descriptions_old = pd.read_csv('raw_variable_description.csv')
    old_variable_set = pd_descriptions_old['id'].to_list()

    pd_descriptions_new = pd.DataFrame(descriptions)
    new_variable_set = pd_descriptions_new['id'].to_list()

    # check whether new variables already exist
    duplicated_variables = [variable for variable in new_variable_set if variable in old_variable_set]
    if len(duplicated_variables) > 0:
        warnings.warn('\nWARNING: Some new raw variables already exist and their old information will be removed!\n')

    # drop duplicated variables
    pd_descriptions_old.drop(
        pd_descriptions_old.index[pd_descriptions_old['id'].map(lambda x: x in duplicated_variables)],
        inplace = True
        )

    # concatenate and save to disk
    pd_descriptions = pd.concat([pd_descriptions_old, pd_descriptions_new])
    pd_descriptions.to_csv('raw_variable_description.csv', index=False)

except:

    pd.DataFrame(descriptions).to_csv('raw_variable_description.csv', index=False)


  from ipykernel import kernelapp as app


### Download and re-organize raw data

In [83]:
for variable in variables:

    # for variables updated in daily frequency, download the last vintage only
    if pd_descriptions[pd_descriptions['id']==variable]['frequency_short'].values[0] == 'D':
        params.update({'realtime_start': end_date})
    else:
        params.update({'realtime_start': start_date})

    # download data from ALFRED
    params.update({'series_id': variable})
    page = download_page(url='https://api.stlouisfed.org/fred/series/observations', params=params)

    # convert data type from JSON -> DataFrame
    # convert values from string -> float, convert dates from string -> datetime
    data = pd.DataFrame(page.json()['observations'])
    data['value'] = data['value'].map(lambda x: float_or_nan(x))
    for column in data.columns:
        if column != 'value':
            data[column] = pd.to_datetime(data[column])

    # collect all vintage dates
    vintage_dates = sorted(list(set(data['realtime_start'].to_list())))

    # reshape data structure to ['observation_date', 'VarName_VintDate1', 'VarName_VintDate2', ...]
    for index, (observation_date, group) in tqdm_notebook(enumerate(data.groupby('date'))):

        if index == 0:

            temp_values = {'observation_date': observation_date.strftime('%Y-%m-%d')}

            for vintage_date in vintage_dates:
                found_value = False
                for _, row in group.drop('date', axis=1).iterrows():
                    if row['realtime_start'] <= vintage_date <= row['realtime_end']:
                        temp_values[f"{variable}_{vintage_date.strftime('%Y%m%d')}"] = row['value']
                        found_value = True
                        break
                if found_value == False:
                    temp_values[f"{variable}_{vintage_date.strftime('%Y%m%d')}"] = float('nan')
            assert len(temp_values) == len(vintage_dates) + 1

            data_output = pd.DataFrame(temp_values, index=[0])

        else:

            temp_values = [observation_date.strftime('%Y-%m-%d')]

            for vintage_date in vintage_dates:
                found_value = False
                for _, row in group.drop('date', axis=1).iterrows():
                    if row['realtime_start'] <= vintage_date <= row['realtime_end']:
                        temp_values.append(row['value'])
                        found_value = True
                        break
                if found_value == False:
                    temp_values.append(float('nan'))
            assert len(temp_values) == len(vintage_dates) + 1

            data_output.loc[data_output.shape[0]] = temp_values

    data_output.set_index('observation_date', inplace=True)
    data_output.to_csv(f'raw/alfred/{variable}.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


