In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Read the CSV at ../data/dataformodel.csv into a DataFrame; the relative
# path is resolved against the current working directory.
data = pd.read_csv('../data/dataformodel.csv')

In [3]:
# Preview the first rows to check column names and the earliest values.
data.head(15)

Unnamed: 0,date,pce,inflation,unemployment_rate,revolving_credit,nonrevolving_credit,total_credit
0,1959-01-01,306.1,29.01,6.0,,48961.16,48961.16
1,1959-02-01,309.6,29.0,5.9,,49513.71,49513.71
2,1959-03-01,312.7,28.97,5.6,,50007.73,50007.73
3,1959-04-01,312.2,28.98,5.2,,50463.43,50463.43
4,1959-05-01,316.1,29.04,5.1,,51007.24,51007.24
5,1959-06-01,318.2,29.11,5.0,,51675.44,51675.44
6,1959-07-01,317.8,29.15,5.1,,52356.85,52356.85
7,1959-08-01,320.2,29.18,5.2,,53038.53,53038.53
8,1959-09-01,324.2,29.25,5.5,,53683.75,53683.75
9,1959-10-01,322.8,29.35,5.7,,54365.95,54365.95


In [4]:
# Summarise column dtypes and non-null counts to spot missing data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 796 entries, 0 to 795
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 796 non-null    object 
 1   pce                  796 non-null    float64
 2   inflation            796 non-null    float64
 3   unemployment_rate    796 non-null    float64
 4   revolving_credit     688 non-null    float64
 5   nonrevolving_credit  796 non-null    float64
 6   total_credit         796 non-null    float64
dtypes: float64(6), object(1)
memory usage: 43.7+ KB


In [5]:
# Convert the 'date' column from strings to pandas datetime values
# (data.info() above reports it as dtype object).
data['date'] = pd.to_datetime(data['date'])

In [6]:
# Forecast horizon: how many months ahead the target is predicted.
# The same value is also passed below as the number of lag months per feature.
months_in_adv = 3

In [7]:
# Build the target column: for each row, the PCE value months_in_adv rows later.
# shift(-n) pulls values upward, so the last months_in_adv rows end up as NaN.
# NOTE(review): assumes rows are in date order with one row per month — confirm.
data['pce_future'] = data['pce'].shift(-months_in_adv)

In [8]:
# Replace missing revolving_credit values with 0. data.info() shows 688 of 796
# rows populated; the gaps appear to be the earliest ~9-10 years of the series
# (the head() preview shows them empty) — TODO confirm there are no gaps elsewhere.
data['revolving_credit'] = data['revolving_credit'].fillna(0)

In [9]:
# Columns that receive lagged copies (all numeric inputs, including pce itself).
features_to_lag = [
    'pce', 'inflation', 'unemployment_rate',
    'revolving_credit', 'nonrevolving_credit', 'total_credit',
]

In [10]:
# Adds lagged copies of each feature, one column per month from 1 up to
# months_lag (e.g. months_lag=3 yields <feature>_lag_1, _lag_2 and _lag_3).

def add_lag_features(df, months_lag, features):
    """Return a copy of ``df`` extended with lagged versions of ``features``.

    For every name in ``features`` and every ``i`` in ``1..months_lag`` a new
    column ``f'{feature}_lag_{i}'`` is added holding ``df[feature].shift(i)``,
    i.e. the value from ``i`` rows earlier. The first ``i`` rows of each lag
    column are NaN because there is no earlier row to take a value from.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    months_lag : int
        Largest lag to generate. Values below 1 add no columns.
    features : iterable of str
        Column names in ``df`` to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the lag columns,
        grouped by feature and ordered by increasing lag.

    Raises
    ------
    KeyError
        If a name in ``features`` is not a column of ``df``.
    """
    df_copy = df.copy()
    for feature in features:
        for i in range(1, months_lag + 1):
            # Write to the copy, not to ``df``: assigning to ``df`` would
            # mutate the caller's frame and leave the returned copy without
            # any lag columns.
            df_copy[f'{feature}_lag_{i}'] = df[feature].shift(i)
    return df_copy

In [13]:
# Build the lagged frame: each column in features_to_lag gets lag columns
# for 1..months_in_adv months back.
data_with_lag = add_lag_features(
    df=data,
    months_lag=months_in_adv,
    features=features_to_lag,
)

In [14]:
# Inspect the first rows of the lagged frame; the earliest rows show NaN
# lags because there are no prior rows to shift from.
data_with_lag.head()

Unnamed: 0,date,pce,inflation,unemployment_rate,revolving_credit,nonrevolving_credit,total_credit,pce_future,pce_lag_1,pce_lag_2,...,unemployment_rate_lag_3,revolving_credit_lag_1,revolving_credit_lag_2,revolving_credit_lag_3,nonrevolving_credit_lag_1,nonrevolving_credit_lag_2,nonrevolving_credit_lag_3,total_credit_lag_1,total_credit_lag_2,total_credit_lag_3
0,1959-01-01,306.1,29.01,6.0,0.0,48961.16,48961.16,312.2,,,...,,,,,,,,,,
1,1959-02-01,309.6,29.0,5.9,0.0,49513.71,49513.71,316.1,306.1,,...,,0.0,,,48961.16,,,48961.16,,
2,1959-03-01,312.7,28.97,5.6,0.0,50007.73,50007.73,318.2,309.6,306.1,...,,0.0,0.0,,49513.71,48961.16,,49513.71,48961.16,
3,1959-04-01,312.2,28.98,5.2,0.0,50463.43,50463.43,317.8,312.7,309.6,...,6.0,0.0,0.0,0.0,50007.73,49513.71,48961.16,50007.73,49513.71,48961.16
4,1959-05-01,316.1,29.04,5.1,0.0,51007.24,51007.24,320.2,312.2,312.7,...,5.9,0.0,0.0,0.0,50463.43,50007.73,49513.71,50463.43,50007.73,49513.71


In [15]:
# Discard every row that still has a missing value in any column. With NaNs
# in revolving_credit already filled, that should leave out only the first
# rows (incomplete lags) and the last rows (no future PCE), as set by months_in_adv.
data_with_lag = data_with_lag.dropna(axis=0, how='any')

In [16]:
# Confirm the leading rows with incomplete lags are gone after dropna().
data_with_lag.head()

Unnamed: 0,date,pce,inflation,unemployment_rate,revolving_credit,nonrevolving_credit,total_credit,pce_future,pce_lag_1,pce_lag_2,...,unemployment_rate_lag_3,revolving_credit_lag_1,revolving_credit_lag_2,revolving_credit_lag_3,nonrevolving_credit_lag_1,nonrevolving_credit_lag_2,nonrevolving_credit_lag_3,total_credit_lag_1,total_credit_lag_2,total_credit_lag_3
3,1959-04-01,312.2,28.98,5.2,0.0,50463.43,50463.43,317.8,312.7,309.6,...,6.0,0.0,0.0,0.0,50007.73,49513.71,48961.16,50007.73,49513.71,48961.16
4,1959-05-01,316.1,29.04,5.1,0.0,51007.24,51007.24,320.2,312.2,312.7,...,5.9,0.0,0.0,0.0,50463.43,50007.73,49513.71,50463.43,50007.73,49513.71
5,1959-06-01,318.2,29.11,5.0,0.0,51675.44,51675.44,324.2,316.1,312.2,...,5.6,0.0,0.0,0.0,51007.24,50463.43,50007.73,51007.24,50463.43,50007.73
6,1959-07-01,317.8,29.15,5.1,0.0,52356.85,52356.85,322.8,318.2,316.1,...,5.2,0.0,0.0,0.0,51675.44,51007.24,50463.43,51675.44,51007.24,50463.43
7,1959-08-01,320.2,29.18,5.2,0.0,53038.53,53038.53,322.9,317.8,318.2,...,5.1,0.0,0.0,0.0,52356.85,51675.44,51007.24,52356.85,51675.44,51007.24


In [17]:
# Check the end of the frame: rows without a future PCE target should be gone.
data_with_lag.tail()

Unnamed: 0,date,pce,inflation,unemployment_rate,revolving_credit,nonrevolving_credit,total_credit,pce_future,pce_lag_1,pce_lag_2,...,unemployment_rate_lag_3,revolving_credit_lag_1,revolving_credit_lag_2,revolving_credit_lag_3,nonrevolving_credit_lag_1,nonrevolving_credit_lag_2,nonrevolving_credit_lag_3,total_credit_lag_1,total_credit_lag_2,total_credit_lag_3
788,2024-09-01,20044.1,314.851,4.1,1360370.46,3733724.31,5094094.77,20408.1,19905.0,19866.3,...,4.1,1357981.88,1357350.28,1349803.77,3731895.86,3724143.76,3713982.9,5089877.75,5081494.04,5063786.67
789,2024-10-01,20123.2,315.564,4.1,1369397.34,3735086.52,5104483.86,20389.0,20044.1,19905.0,...,4.2,1360370.46,1357981.88,1357350.28,3733724.31,3731895.86,3724143.76,5094094.77,5089877.75,5081494.04
790,2024-11-01,20235.1,316.449,4.2,1358585.49,3739805.47,5098390.96,20469.3,20123.2,20044.1,...,4.2,1369397.34,1360370.46,1357981.88,3735086.52,3733724.31,3731895.86,5104483.86,5094094.77,5089877.75
791,2024-12-01,20408.1,317.603,4.1,1316865.89,3671015.14,4987881.03,20621.8,20235.1,20123.2,...,4.1,1358585.49,1369397.34,1360370.46,3739805.47,3735086.52,3733724.31,5098390.96,5104483.86,5094094.77
792,2025-01-01,20389.0,319.086,4.0,1323514.25,3673358.72,4996872.97,20669.5,20408.1,20235.1,...,4.1,1316865.89,1358585.49,1369397.34,3671015.14,3739805.47,3735086.52,4987881.03,5098390.96,5104483.86


In [18]:
# Split into features and target from the cleaned, lagged frame.
# data_with_lag (not the raw `data`) is used so that the rows removed by
# dropna() above — those lacking a full lag history or a future PCE value —
# stay excluded from both X and y.
# NOTE(review): X still contains the datetime 'date' column — confirm the
# downstream model expects it, or drop it before fitting.
X = data_with_lag.drop(columns=['pce_future'])
y = data_with_lag['pce_future']