In [1]:
############
# Packages #
############
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
from pathlib import Path
pio.renderers.default = "plotly_mimetype+notebook"

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler

from src.init_parameters import init_parameters
from src.vectorize import vectorize_data
from src.gibbs_sampler import gibbs_sampler_joint_post

In [2]:
# ---------------------------------------------------------------------------
# Load the raw panel from the current working directory
# ---------------------------------------------------------------------------
root_path = Path.cwd()
data_path = root_path / "Macro1.csv"

# Remove the 'sasdate' column and skip the first data row.
# NOTE(review): the first row is presumably a non-observation row (e.g. the
# per-series transformation codes of a FRED-MD style file) -- confirm.
df = pd.read_csv(data_path).drop(columns='sasdate').iloc[1:, :]

# Target is the INDPRO column; every other column is a predictor.
df_y = df["INDPRO"]
df_x = df.drop(columns='INDPRO').copy()

# ---------------------------------------------------------------------------
# Ordered train / test split
# ---------------------------------------------------------------------------
# shuffle=False keeps the original row order, so the test set is the final
# TEST_SIZE share of the rows.
TEST_SIZE = 0.3
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(
    df_x,
    df_y,
    test_size=TEST_SIZE,
    shuffle=False,
)

# ---------------------------------------------------------------------------
# Preprocessing steps
# ---------------------------------------------------------------------------
# 1) Drop columns with too many NaN values; all other columns pass through.
col_to_drop = ['ACOGNO', 'TWEXMMTH', 'UMCSENTx', 'ANDENOx']
column_transformer = ColumnTransformer(
    transformers=[('drop_columns', 'drop', col_to_drop)],
    remainder='passthrough',
)

# 2) Replace the remaining NaN values with the column median.
imputer = SimpleImputer(strategy='median')

# 3) Standardize the features.
scaler = StandardScaler()


# 4) Convert the NumPy output of the previous steps back to a DataFrame.
def _to_feature_frame(x):
    """Wrap ``x`` in a DataFrame labelled with the predictor names that remain
    once ``col_to_drop`` has been removed from ``df_x``."""
    return pd.DataFrame(x, columns=df_x.columns.drop(col_to_drop))


to_df = FunctionTransformer(_to_feature_frame, validate=False)

pipeline_drop_imputer = Pipeline(
    steps=[
        ('preprocessor', column_transformer),
        ('imputer', imputer),
        ('scaler', scaler),
        ('to_dataframe', to_df),
    ]
)

# Fit on the training rows only, then apply the fitted steps to the test rows.
df_x_train_transform = pipeline_drop_imputer.fit_transform(df_x_train)
df_x_test_transform = pipeline_drop_imputer.transform(df_x_test)

# Append the untransformed target as the last column of each frame. The target
# is attached positionally (as an array) because the transformed frames have a
# fresh default index.
df_train_transform = df_x_train_transform.assign(INDPRO=df_y_train.to_numpy())
df_test_transform = df_x_test_transform.assign(INDPRO=df_y_test.to_numpy())

In [3]:
# Training data as a plain NumPy array (predictor columns followed by INDPRO).
X_train = df_train_transform.to_numpy()

In [4]:
# Display the (rows, columns) shape of the training array.
X_train.shape

(469, 122)

## Init and vectorize

In [99]:
# Dimensions read off the training array: T rows, k columns.
# NOTE(review): X_train still includes the INDPRO target as its last column,
# so k counts it as well -- confirm this is what init_parameters expects.
T, k = X_train.shape

# Scalar settings passed unchanged to init_parameters and vectorize_data.
l = 0
a = b = 1
A = B = 1

# NOTE(review): the recorded run of this cell ended with
#   ValueError: could not broadcast input array from shape (122,) into shape (347,)
# (122,) matches k, and 347 equals T - k for the recorded shape (469, 122);
# this may be a coincidence -- check how the src helpers size their arrays.
dct = init_parameters(T, k, l, a, b, A, B, X_train)
data0 = vectorize_data(dct=dct, T=T, k=k, l=l, a=a, b=b, A=A, B=B)  # Model vectorization

ValueError: could not broadcast input array from shape (122,) into shape (347,)