In [6]:
from datetime import datetime, timedelta, date, time
import pandas as pd
import numpy as np
import fredpy as fpew
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
from scipy import signal
import statsmodels.api as sm

In [20]:
df = pd.read_csv('/Users/wcrossb/Desktop/python/projects/personal/google-trends-LF-fcast/data/trend_data.csv',index_col='date',parse_dates=True)
kw_list = list(df.columns)

In [21]:
#prepare the input series (x) by detrending and standardizing
x = StandardScaler().fit_transform(signal.detrend(df))
#x = StandardScaler().fit_transform(df)


#Calculate the first two principal components & output explained variance
pca = PCA(n_components=2)
df_PC = pca.fit_transform(x)
print(pca.explained_variance_ratio_)


[0.6111988  0.37502643]


In [22]:
#add estimation data and result to a dataframe
df_est = pd.DataFrame(np.concatenate((df_PC, x), axis=1))

# collect and add column names to the estimation data
kw_list_std = [kw +"_std" for kw in kw_list]
colnames = ["PC1","PC2"]
colnames.extend(kw_list_std);
df_est.columns = colnames

#add back to original data, so we have the complete dataset
df_temp = df.reset_index(drop=False)
df_panel = pd.concat([df_temp, df_est.reset_index(drop=True)], axis =1)

df_panel['date'] = pd.to_datetime(df_panel['date'])


In [23]:
fig = px.line(df_panel, x="date", y=kw_list_std, title='title goes here')
fig.show()

In [24]:
fig = px.line(df_panel, x="date", y=["PC1","PC2"], title='title goes here')
fig.show()

In [25]:
import fredpy as fp
fp.api_key='6db754c085da6baa8ed79ed3764df3dc'

#retrieve values for real variables as well as target variable
cpi = fp.series('CPIAUCSL').pc()
u = fp.series('UNRATE').pc()
jobs = fp.series('PAYNSA').pc()
jobs_SA = fp.series('PAYEMS')


#concatenate into to DF and shorten variable names
fred_df = pd.DataFrame({'cpi':cpi.data,
                        #'u': u.data,
                        #'jobs': jobs.data,
                        'jobs_SA':jobs_SA.data})



In [26]:
fred_vars = fred_df.columns
print(fred_vars)
df = pd.merge(fred_df,df_panel, how = "inner", on="date")
fig = px.line(df, x="date", y=fred_vars, title='title goes here')
fig.show()

Index(['cpi', 'jobs_SA'], dtype='object')


In [32]:
period = 12
outlier_TH = 10
lag_length = 6

#df['pct_change_jobs'] = df['jobs_SA']
df['pc_jobs'] = df.jobs_SA.pct_change(periods=period)
df['pc_jobs_L1'] = df['pc_jobs'].shift(1)

df['pc_jobs_D1'] = df['pc_jobs'] - df['pc_jobs_L1']
df['pc_jobs_D1_L1'] = df['pc_jobs_D1'].shift(1)


outlier_2020=1
#outlier removal component
df.loc[df.pc_jobs_D1 > outlier_2020, 'pc_jobs_D1'] = outlier_2020
df.loc[df.pc_jobs_D1 < -outlier_2020, 'pc_jobs_D1'] = outlier_2020
df['pc_jobs_D1_scale'] = df['pc_jobs_D1']*300 +3


#*********************************
df['pct_change_PC1'] = df.PC1.pct_change(periods=period)
df['pct_change_PC2'] = df.PC2.pct_change(periods=period)


#correct the first component
df.loc[df.pct_change_PC1 > outlier_TH, 'pct_change_PC1'] = outlier_TH
df.loc[df.pct_change_PC1 < -outlier_TH, 'pct_change_PC1'] = outlier_TH

#correct the second component
df.loc[df.pct_change_PC2 > outlier_TH, 'pct_change_PC2'] = outlier_TH
df.loc[df.pct_change_PC2 < -outlier_TH, 'pct_change_PC2'] = outlier_TH



df['pct_change_PC1_test'] = df['pct_change_PC1'] +outlier_TH+1
df['pct_change_PC1_test'] = np.log(df['pct_change_PC1_test'])
df['pct_change_PC1_t_1L'] = df['pct_change_PC1_test'].shift(lag_length)


df['pct_change_PC2_test'] = df['pct_change_PC2'] +outlier_TH+1
df['pct_change_PC2_test'] = np.log(df['pct_change_PC2_test'])
df['pct_change_PC2_t_1L'] = df['pct_change_PC2_test'].shift(lag_length)


df['pct_change_PC1_L1'] = df['pct_change_PC1'].shift(1)
df['pct_change_PC1_L2'] = df['pct_change_PC1'].shift(2)
df['pct_change_PC1_L3'] = df['pct_change_PC1'].shift(3)
df['pct_change_PC1_L4'] = df['pct_change_PC1'].shift(4)

df['pct_change_PC2_L1'] = df['pct_change_PC2'].shift(1)



df_reg = df.iloc[2:,:]
start_date = '2005-01-01'
end_date = '2020-02-01'
mask = (df_reg['date'] > start_date) & (df_reg['date'] <= end_date)
df_reg = df_reg.loc[mask]


fig = px.line(df_reg, x="date", y=["pct_change_PC1_test","pct_change_PC2_test","pc_jobs_D1_scale"], title='title goes here')
fig.show()

fig = px.line(df_reg, x="date", y=["pc_jobs_D1","pc_jobs"], title='title goes here')
fig.show()


In [147]:
#basic_spec = ['pc_jobs_D1_L1','pct_change_PC1','pct_change_PC1_L1','pct_change_PC1_L2','pct_change_PC1_L3','pct_change_PC1_L4']
#basic_spec = ['pct_change_lag_jobs','pct_change_PC1','pct_change_PC1_L1', 'pct_change_PC2','pct_change_PC2_L1']
#basic_spec = ['pct_change_lag_jobs','pct_change_PC1','pct_change_PC2']
#basic_spec = ['pct_change_lag_jobs','PC1','PC2']

#basic_spec = ['pc_jobs_D1_L1','pct_change_PC1_test','pct_change_PC1_t_L']
basic_spec = ['pc_jobs_D1_L1','pct_change_PC1_test','pct_change_PC1_t_1L','pct_change_PC2_test','pct_change_PC2_t_1L']

#basic_spec = ['pc_jobs_D1_L1','pct_change_PC1_test']


y = df_reg['pc_jobs_D1']
x = df_reg[basic_spec]
reg = sm.OLS(y, sm.add_constant(x)).fit()

print(reg.summary())
df_reg['residual'] = reg.fittedvalues.values - y

                            OLS Regression Results                            
Dep. Variable:             pc_jobs_D1   R-squared:                       0.198
Model:                            OLS   Adj. R-squared:                  0.175
Method:                 Least Squares   F-statistic:                     8.652
Date:                Wed, 14 Sep 2022   Prob (F-statistic):           2.39e-07
Time:                        23:35:27   Log-Likelihood:                 1017.1
No. Observations:                 181   AIC:                            -2022.
Df Residuals:                     175   BIC:                            -2003.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.0028    

In [36]:
#THIS TRIES RANDOM FORESTS
features = df_reg

# Labels are the values we want to predict
labels = np.array(features['pc_jobs_D1'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('pc_jobs_D1', axis = 1)
# Saving feature names for later use
feature_list = [features.columns]
# Convert to numpy array
features = np.array(features)

In [37]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [38]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)


Training Features Shape: (135, 30)
Training Labels Shape: (135,)
Testing Features Shape: (46, 30)
Testing Labels Shape: (46,)


In [41]:


# The baseline predictions are the historical averages
baseline_preds = test_features[:, feature_list.index('average')]
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))


[Index(['date', 'cpi', 'jobs_SA', 'recession', 'unemployment',
       'unemployment benefits', 'job postings', 'inflation', 'PC1', 'PC2',
       'recession_std', 'unemployment_std', 'unemployment benefits_std',
       'job postings_std', 'inflation_std', 'pc_jobs', 'pc_jobs_L1',
       'pc_jobs_D1_L1', 'pc_jobs_D1_scale', 'pct_change_PC1', 'pct_change_PC2',
       'pct_change_PC1_test', 'pct_change_PC1_t_1L', 'pct_change_PC2_test',
       'pct_change_PC2_t_1L', 'pct_change_PC1_L1', 'pct_change_PC1_L2',
       'pct_change_PC1_L3', 'pct_change_PC1_L4', 'pct_change_PC2_L1'],
      dtype='object')]


AttributeError: 'list' object has no attribute 'average'

In [None]:
basic_spec = ['pct_change_lag_jobs']
y =df_reg['pct_change_jobs']
x = df_reg[basic_spec]
reg = sm.OLS(y, sm.add_constant(x)).fit()

print(reg.summary())

In [None]:
pytrends = TrendReq(hl='en-US', timeout=(10,25), retries=2, backoff_factor=1, requests_args={'verify':True})

kw_list = ["unemployment", "job listings", "loan", "recession"]

pytrends.build_payload(kw_list, cat=0, timeframe='all', geo='US', gprop='')

df = pytrends.interest_over_time()
df = df.drop('isPartial', axis=1)