In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb


In [2]:
data = pd.read_csv('../data/dataformodel.csv')

In [3]:
# data.head(15)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 796 entries, 0 to 795
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 796 non-null    object 
 1   pce                  796 non-null    float64
 2   inflation            796 non-null    float64
 3   unemployment_rate    796 non-null    float64
 4   revolving_credit     688 non-null    float64
 5   nonrevolving_credit  796 non-null    float64
 6   total_credit         796 non-null    float64
dtypes: float64(6), object(1)
memory usage: 43.7+ KB


In [5]:
# converting date to datetime format
data['date'] = pd.to_datetime(data['date'])

In [6]:
# variable to how many months in advance to predict 
months_in_adv = 6

In [7]:
# added column for future PCE values to make the target variable
data['pce_future'] = data['pce'].shift(-months_in_adv)

In [8]:
# filling in NaN values with 0 for revolving credit where there is no data for the first 10ish years 
data['revolving_credit'] = data['revolving_credit'].fillna(0)

In [9]:
features_to_lag = [
    'pce',
    'inflation',
    'unemployment_rate',
    'revolving_credit',
    'nonrevolving_credit',
    'total_credit'
]

In [10]:
# adding additional lag feature columns, using opposite shift and each month going back months_in_adv variable (so if set at 3, it will have 1, 2 ,3 months back)

def add_lag_features(df, months_lag, features):
    df_copy = df.copy()
    for feature in features:
        for i in range(1, months_lag + 1):
            df_copy[f'{feature}_lag_{i}'] = df_copy[feature].shift(i)
    return df_copy

In [11]:
data_with_lag = add_lag_features(data, months_in_adv, features_to_lag)

In [12]:
# data_with_lag.head()

In [13]:
# dropping rows with nan values (should be only first and last rows depending on the months_in_adv variable)
data_with_lag = data_with_lag.dropna()

In [14]:
data_with_lag.head()

Unnamed: 0,date,pce,inflation,unemployment_rate,revolving_credit,nonrevolving_credit,total_credit,pce_future,pce_lag_1,pce_lag_2,...,nonrevolving_credit_lag_3,nonrevolving_credit_lag_4,nonrevolving_credit_lag_5,nonrevolving_credit_lag_6,total_credit_lag_1,total_credit_lag_2,total_credit_lag_3,total_credit_lag_4,total_credit_lag_5,total_credit_lag_6
6,1959-07-01,317.8,29.15,5.1,0.0,52356.85,52356.85,323.6,318.2,316.1,...,50463.43,50007.73,49513.71,48961.16,51675.44,51007.24,50463.43,50007.73,49513.71,48961.16
7,1959-08-01,320.2,29.18,5.2,0.0,53038.53,53038.53,325.3,317.8,318.2,...,51007.24,50463.43,50007.73,49513.71,52356.85,51675.44,51007.24,50463.43,50007.73,49513.71
8,1959-09-01,324.2,29.25,5.5,0.0,53683.75,53683.75,330.2,320.2,317.8,...,51675.44,51007.24,50463.43,50007.73,53038.53,52356.85,51675.44,51007.24,50463.43,50007.73
9,1959-10-01,322.8,29.35,5.7,0.0,54365.95,54365.95,336.5,324.2,320.2,...,52356.85,51675.44,51007.24,50463.43,53683.75,53038.53,52356.85,51675.44,51007.24,50463.43
10,1959-11-01,322.9,29.35,5.8,0.0,54794.81,54794.81,330.0,322.8,324.2,...,53038.53,52356.85,51675.44,51007.24,54365.95,53683.75,53038.53,52356.85,51675.44,51007.24


In [15]:
data_with_lag.tail()

Unnamed: 0,date,pce,inflation,unemployment_rate,revolving_credit,nonrevolving_credit,total_credit,pce_future,pce_lag_1,pce_lag_2,...,nonrevolving_credit_lag_3,nonrevolving_credit_lag_4,nonrevolving_credit_lag_5,nonrevolving_credit_lag_6,total_credit_lag_1,total_credit_lag_2,total_credit_lag_3,total_credit_lag_4,total_credit_lag_5,total_credit_lag_6
785,2024-06-01,19747.5,313.131,4.1,1349803.77,3713982.9,5063786.67,20408.1,19697.3,19603.3,...,3708478.82,3710302.34,3709765.71,3705570.14,5061886.0,5053175.29,5049502.05,5049279.65,5038739.97,5024351.3
786,2024-07-01,19866.3,313.566,4.2,1357350.28,3724143.76,5081494.04,20389.0,19747.5,19697.3,...,3709435.28,3708478.82,3710302.34,3709765.71,5063786.67,5061886.0,5053175.29,5049502.05,5049279.65,5038739.97
787,2024-08-01,19905.0,314.131,4.2,1357981.88,3731895.86,5089877.75,20469.3,19866.3,19747.5,...,3711153.84,3709435.28,3708478.82,3710302.34,5081494.04,5063786.67,5061886.0,5053175.29,5049502.05,5049279.65
788,2024-09-01,20044.1,314.851,4.1,1360370.46,3733724.31,5094094.77,20621.8,19905.0,19866.3,...,3713982.9,3711153.84,3709435.28,3708478.82,5089877.75,5081494.04,5063786.67,5061886.0,5053175.29,5049502.05
789,2024-10-01,20123.2,315.564,4.1,1369397.34,3735086.52,5104483.86,20669.5,20044.1,19905.0,...,3724143.76,3713982.9,3711153.84,3709435.28,5094094.77,5089877.75,5081494.04,5063786.67,5061886.0,5053175.29


In [16]:
X = data_with_lag.drop(columns=['date', 'pce_future'])
y = data_with_lag['pce_future']

In [17]:
print(X.shape)
print(y.shape)

(784, 42)
(784,)


In [18]:
test_size = 60

In [19]:
min_train_size = 60

In [20]:
total_months = len(data_with_lag)

In [21]:
max_splits = (total_months - min_train_size) // test_size
print(max_splits)

12


In [22]:
n_splits = max_splits

In [23]:
time_series_split = TimeSeriesSplit(
    n_splits = n_splits,
    test_size = test_size
)


In [24]:
mse_scores = []
r2_scores = []
test_results = []

print("*** Linear Regression Model ***")

for i, (train_index, test_index) in enumerate(time_series_split.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_dates_min = data_with_lag.iloc[train_index]['date'].min().strftime('%Y-%m')
    train_dates_max = data_with_lag.iloc[train_index]['date'].max().strftime('%Y-%m')
    test_dates_min = data_with_lag.iloc[test_index]['date'].min().strftime('%Y-%m')
    test_dates_max = data_with_lag.iloc[test_index]['date'].max().strftime('%Y-%m')
    test_dates = data_with_lag.iloc[test_index]['date']

    print(f"\n--- Split {i + 1} ---")
    print(f"  TRAIN: {len(train_index)} months ({train_dates_min} to {train_dates_max})")
    print(f"  TEST: {len(test_index)} months ({test_dates_min} to {test_dates_max})")


    model = LinearRegression() 
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)


    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mse_scores.append(mse)
    r2_scores.append(r2)

    split_tests = pd.DataFrame({
        'date': test_dates,
        'actual_pce': y_test,
        'predicted_pce': y_pred,
        'split': i + 1
    })
    test_results.append(split_tests)


    print("Model Performance:")
    print(f"    Mean Squared Error: {mse : .2f}")
    print(f"    R-squared: {r2 : .2f}")

*** Linear Regression Model ***

--- Split 1 ---
  TRAIN: 64 months (1959-07 to 1964-10)
  TEST: 60 months (1964-11 to 1969-10)
Model Performance:
    Mean Squared Error:  94.39
    R-squared:  0.97

--- Split 2 ---
  TRAIN: 124 months (1959-07 to 1969-10)
  TEST: 60 months (1969-11 to 1974-10)
Model Performance:
    Mean Squared Error:  1059.11
    R-squared:  0.91

--- Split 3 ---
  TRAIN: 184 months (1959-07 to 1974-10)
  TEST: 60 months (1974-11 to 1979-10)
Model Performance:
    Mean Squared Error:  1358.61
    R-squared:  0.97

--- Split 4 ---
  TRAIN: 244 months (1959-07 to 1979-10)
  TEST: 60 months (1979-11 to 1984-10)
Model Performance:
    Mean Squared Error:  4213.82
    R-squared:  0.94

--- Split 5 ---
  TRAIN: 304 months (1959-07 to 1984-10)
  TEST: 60 months (1984-11 to 1989-10)
Model Performance:
    Mean Squared Error:  86307.46
    R-squared:  0.18

--- Split 6 ---
  TRAIN: 364 months (1959-07 to 1989-10)
  TEST: 60 months (1989-11 to 1994-10)
Model Performance:
    

In [25]:
pd.concat(test_results[-2:]).tail(10)

Unnamed: 0,date,actual_pce,predicted_pce,split
780,2024-01-01,19866.3,19418.757064,12
781,2024-02-01,19905.0,19469.602022,12
782,2024-03-01,20044.1,19743.945364,12
783,2024-04-01,20123.2,19694.947657,12
784,2024-05-01,20235.1,19801.432054,12
785,2024-06-01,20408.1,19865.349417,12
786,2024-07-01,20389.0,19995.554452,12
787,2024-08-01,20469.3,20029.278286,12
788,2024-09-01,20621.8,20206.718241,12
789,2024-10-01,20669.5,20252.295143,12


Linear regression had decent results in most of the splits with a high r2 value

In [29]:
mse_scores = []
r2_scores = []
test_results = []

print("*** XGBoost Regression Model ***")

for i, (train_index, test_index) in enumerate(time_series_split.split(X)):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_dates_min = data_with_lag.iloc[train_index]['date'].min().strftime('%Y-%m')
    train_dates_max = data_with_lag.iloc[train_index]['date'].max().strftime('%Y-%m')
    test_dates_min = data_with_lag.iloc[test_index]['date'].min().strftime('%Y-%m')
    test_dates_max = data_with_lag.iloc[test_index]['date'].max().strftime('%Y-%m')

    print(f"\n--- Split {i + 1} ---")
    print(f"  TRAIN: {len(train_index)} months ({train_dates_min} to {train_dates_max})")
    print(f"  TEST: {len(test_index)} months ({test_dates_min} to {test_dates_max})")


    model = xgb.XGBRegressor(
        objective = 'reg:squarederror', 
        n_estimators = 100, 
        learning_rate = 0.1, 
        max_depth = 6, 
        random_state = 42
    ) 


    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)


    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mse_scores.append(mse)
    r2_scores.append(r2)

    split_tests = pd.DataFrame({
        'date': test_dates,
        'actual_pce': y_test.values,
        'predicted_pce': y_pred,
        'split': i + 1
    })
    test_results.append(split_tests)


    print("Model Performance:")
    print(f"    Mean Squared Error: {mse : .2f}")
    print(f"    R-squared: {r2 : .2f}")

*** XGBoost Regression Model ***

--- Split 1 ---
  TRAIN: 64 months (1959-07 to 1964-10)
  TEST: 60 months (1964-11 to 1969-10)
Model Performance:
    Mean Squared Error:  12503.04
    R-squared: -2.52

--- Split 2 ---
  TRAIN: 124 months (1959-07 to 1969-10)
  TEST: 60 months (1969-11 to 1974-10)
Model Performance:
    Mean Squared Error:  39586.77
    R-squared: -2.39

--- Split 3 ---
  TRAIN: 184 months (1959-07 to 1974-10)
  TEST: 60 months (1974-11 to 1979-10)
Model Performance:
    Mean Squared Error:  162397.47
    R-squared: -2.78

--- Split 4 ---
  TRAIN: 244 months (1959-07 to 1979-10)
  TEST: 60 months (1979-11 to 1984-10)
Model Performance:
    Mean Squared Error:  300809.94
    R-squared: -3.02

--- Split 5 ---
  TRAIN: 304 months (1959-07 to 1984-10)
  TEST: 60 months (1984-11 to 1989-10)
Model Performance:
    Mean Squared Error:  397969.21
    R-squared: -2.78

--- Split 6 ---
  TRAIN: 364 months (1959-07 to 1989-10)
  TEST: 60 months (1989-11 to 1994-10)
Model Perform

In [30]:
pd.concat(test_results[-2:]).tail(10)

Unnamed: 0,date,actual_pce,predicted_pce,split
780,2024-01-01,19866.3,13209.560547,12
781,2024-02-01,19905.0,13167.195312,12
782,2024-03-01,20044.1,13218.602539,12
783,2024-04-01,20123.2,13160.587891,12
784,2024-05-01,20235.1,13169.05957,12
785,2024-06-01,20408.1,13160.017578,12
786,2024-07-01,20389.0,13169.05957,12
787,2024-08-01,20469.3,13169.629883,12
788,2024-09-01,20621.8,13177.224609,12
789,2024-10-01,20669.5,13142.394531,12


Tried Xgboost Regressor for 3 and six months out r2 was negative both times and mean squared error was very high. Not going to use this one due to results. 