In [None]:
# TODO: fill NaN values using the values from the last vintage date (the `fillna_from_last_vintage` flag in the configuration cell is not used yet)

In [1]:
import pathlib, warnings
import pandas as pd
import numpy as np
import datetime as datetime

In [9]:
# Folder holding the raw data files; the stem of each file is used as the raw variable name.
data_path = pathlib.Path('raw_variables')
# Fail fast, and say which folder was looked for, instead of a bare AssertionError.
assert data_path.exists(), f"raw data folder not found: {data_path.resolve()}"

In [50]:
# --- user settings -----------------------------------------------------------
start_quarter = '1981Q1'  # first quarter of the sample (inclusive)
end_quarter = '2010Q2'  # last quarter of the sample (inclusive)
vintage_date = pd.to_datetime('2011-04-01')  # use the data as known on this day (until the end of day)

# Switch for filling NaN from the last vintage; not read by any cell shown here.
fillna_from_last_vintage = False

# Names of the series to build, grouped by kind. 'raw_variables_transform' is
# filled in later from the construction rules of the requested observables.
variables = {
    'raw_variables': set(),
    'raw_variables_transform': set(),
    'observed_variables': {'xgdp_obs', 'pgdp_obs', 'ffr_obs', 'fpi_obs', 'pce_obs', 'wage_obs'},
}

# --- derived observation window ----------------------------------------------
# The window runs from the first instant of the start quarter to the last
# instant of the end quarter.
start_observation = pd.to_datetime(start_quarter).to_period('Q').start_time
end_observation = pd.to_datetime(end_quarter).to_period('Q').end_time

# Observations dated after the vintage date cannot be used: cap the window there.
if end_observation > vintage_date:
    warnings.warn('Your vintage date is older than the last observation date, so the last observation date will be the vintage date.')
    end_observation = vintage_date

In [None]:
# Metadata tables, both keyed by an 'id' column in the cells below:
# - description_raw supplies 'frequency_short' for each raw variable,
# - description_obs supplies the 'construction' text of each observable.
description_raw = pd.read_csv('raw_variable_description.csv')
description_obs = pd.read_excel('observed_variable_description.xlsx')

### generate observables

In [51]:
if len(variables['observed_variables']) > 0:

    # Raw variables are identified by the stem of their data file in `data_path`.
    set_raw_variables = {data_file.stem for data_file in data_path.glob('*.*')}

    # Keep only the descriptions of the observables that were requested.
    # NOTE(review): this overwrites `description_obs`, so re-running the cell after
    # adding observables requires reloading the description table first.
    description_obs = description_obs[description_obs['id'].isin(variables['observed_variables'])]

    # Concatenate the construction rules of the requested observables. The original
    # code read an undefined name (`description_observed_variable`) here, which
    # raised NameError; the filtered `description_obs` is the table that is meant.
    # Missing rules are dropped so that the join cannot fail on NaN.
    identify_string = ' '.join(description_obs['construction'].dropna().astype(str))

    # A raw variable is needed when its name occurs in any construction rule and is
    # then stored in 'raw_variables_transform'.
    # NOTE(review): this is a plain substring test, so a name that is a prefix of
    # another name also matches - confirm that this is acceptable for the data set.
    variables['raw_variables_transform'].update(
        variable for variable in set_raw_variables if variable in identify_string
    )

In [57]:
def _select_vintage_column(columns, frequency, cutoff):
    """Pick the name of the data column that holds the vintage to use.

    Parameters
    ----------
    columns : list
        Column names of one raw data file, in file order.
    frequency : str
        Short frequency code of the variable; 'D' is handled specially.
    cutoff : int
        Vintage date written as a yyyymmdd integer.

    Returns
    -------
    The chosen column name, or None when no column qualifies.
    """
    if len(columns) == 0:
        return None

    # Daily series: the last column is used regardless of the vintage date.
    if frequency == 'D':
        return columns[-1]

    # Other series: the trailing eight characters of each column name are read as
    # a yyyymmdd vintage stamp. Keep the last column that is not newer than the
    # cutoff and stop at the first newer one (columns are scanned in file order).
    chosen = None
    for column in columns:
        if int(column[-8:]) > cutoff:
            break
        chosen = column
    return chosen


def generate_raw_variables(variables):
    """Build a quarterly table with one column per requested raw variable.

    For every name in ``variables`` the data file in ``data_path`` with that stem
    is read, restricted to the rows dated between ``start_observation`` and
    ``end_observation`` and to the vintage column selected for ``vintage_date``,
    and averaged within each quarter. The per-variable results are outer-merged
    on the quarter index.

    Reads the notebook-level names ``data_path``, ``description_raw``,
    ``start_observation``, ``end_observation`` and ``vintage_date``.

    Parameters
    ----------
    variables : iterable of str
        Raw variable names (file stems). Names without a data file are skipped.

    Returns
    -------
    pandas.DataFrame
        Indexed by quarter, one column per variable that had usable data. An
        empty DataFrame is returned when no variable could be used (previously
        this case failed with UnboundLocalError).

    Warns
    -----
    UserWarning
        When a variable has no row in the observation window, no usable vintage
        column, or only missing values in the selected window and vintage.
    """
    # Scan the data folder once instead of once per variable. Sorting makes the
    # choice deterministic if two files share a stem.
    files_by_stem = {}
    for data_file in sorted(data_path.glob('*.*')):
        files_by_stem.setdefault(data_file.stem, data_file)

    vintage_cutoff = int(vintage_date.strftime('%Y%m%d'))
    data_output = None

    for variable in variables:

        data_file = files_by_stem.get(variable)
        if data_file is None:
            continue

        # load dataset
        data_set = pd.read_csv(data_file, index_col=0)
        data_set.index = pd.to_datetime(data_set.index)

        # load frequency of the variable
        frequency = description_raw[description_raw['id'] == variable]['frequency_short'].values[0]

        # choose data within observation period; the mask must contain at least
        # one True (the original tested the mask's length, which is always the
        # number of rows in the file)
        in_window = (data_set.index >= start_observation) & (data_set.index <= end_observation)
        if not in_window.any():
            warnings.warn(f'\n{variable} has no value within the observation period you choose for any vintage date!\n')
            continue

        # choose data within vintage date
        vintage_column = _select_vintage_column(list(data_set.columns), frequency, vintage_cutoff)
        if vintage_column is None:
            warnings.warn(f'\nFor {variable}, the vintage date you choose is out of bound!\n')
            continue

        # combine desired observation period and vintage date
        selected = data_set.loc[in_window, [vintage_column]].copy()
        if not selected[vintage_column].notna().any():
            warnings.warn(f'\n{variable} has no value in the observation period and vintage date you choose.\n')
            continue

        # for daily and monthly data, take the average over the quarter
        selected['quarter'] = selected.index.to_period('Q')
        data_to_merge = selected.groupby('quarter').mean()

        if data_output is None:
            data_output = data_to_merge.copy()
        else:
            data_output = pd.merge(data_output, data_to_merge, how='outer', left_index=True, right_index=True, sort=True)

    if data_output is None:
        return pd.DataFrame()

    return data_output

In [58]:
# Build the quarterly table of the raw variables that the requested observables need.
output = generate_raw_variables(variables['raw_variables_transform'])

In [60]:
if 'xgdp_obs' in variables['observed_variables']:
    # TODO(review): the body of this cell is missing, which made the cell a
    # SyntaxError. `pass` keeps it runnable until the construction of
    # 'xgdp_obs' is filled in here.
    pass

{1}


In [None]:
# The GDPC1 column is named 'GDPC1_<vintage yyyymmdd>', so its name changes with
# the vintage date chosen in the configuration cell. Look it up by prefix instead
# of hard-coding the 20110325 vintage.
gdp_columns = [column for column in output.columns if str(column).startswith('GDPC1_')]
if not gdp_columns:
    raise KeyError("no 'GDPC1_<vintage>' column in output")
# NOTE(review): this stores the GDPC1 level as 'xgdp_obs'; the following cell
# computes the log growth rate without storing it - confirm which one is intended.
output.loc[:, 'xgdp_obs'] = output[gdp_columns[0]].values

In [62]:
# Quarter-on-quarter log growth of the GDPC1 vintage column, in percent.
# The first entry is NaN because the first quarter has no predecessor.
np.log(output['GDPC1_20110325'].div(output['GDPC1_20110325'].shift()).to_numpy()) * 100

array([            nan, -8.00773285e-01,  1.20788923e+00, -1.25321824e+00,
       -1.65592487e+00,  5.41443807e-01, -3.86202683e-01,  7.83819285e-02,
        1.23573303e+00,  2.22275738e+00,  1.95361037e+00,  2.04589068e+00,
        1.92116384e+00,  1.71130953e+00,  9.66410968e-01,  8.11997905e-01,
        9.39072010e-01,  8.42104019e-01,  1.55030549e+00,  7.56169094e-01,
        9.57177860e-01,  4.00745182e-01,  9.59781538e-01,  4.82050203e-01,
        5.52056273e-01,  1.05791825e+00,  8.64264630e-01,  1.69571221e+00,
        5.16230267e-01,  1.27640157e+00,  5.14943667e-01,  1.32631564e+00,
        9.34333657e-01,  7.45300675e-01,  7.90257652e-01,  2.17992664e-01,
        1.03930213e+00,  3.96586647e-01, -1.24076406e-03, -8.79844164e-01,
       -4.85599263e-01,  6.71934332e-01,  4.20166066e-01,  3.91146930e-01,
        1.09204492e+00,  1.05720444e+00,  1.02693723e+00,  1.04620455e+00,
        1.83981772e-01,  6.37159418e-01,  5.25342768e-01,  1.31239911e+00,
        9.68249015e-01,  

In [69]:
# Save the generated table under a name that carries the vintage date.
# `data_output` exists only inside generate_raw_variables; at notebook level the
# returned frame is bound to `output`, so that is the object to write.
output.to_csv(f"data_{vintage_date.strftime('%Y%m%d')}.csv")