# Plotly

Trying out Plotly using data from the St. Louis Fed, FRED API.

In [52]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import requests
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [53]:
api_key = '4447961bacaaa3cd54858e4994a7006e'

In [54]:
def fred_req(api_key:str, series_id:str):
    url = f'https://api.stlouisfed.org/fred/series/observations?series_id={series_id}&api_key={api_key}&file_type=json'
    r = requests.get(url)

    df = pd.DataFrame(r.json()['observations'])[['date', 'value']]

    # using dictionary to convert specific columns
    convert_dict = {'value': float} # this is useless here but thought it would be good for reference
    df = df.astype(convert_dict)
    df['date'] = pd.to_datetime(df['date'])

    return df

Firstly, let's just test it and look at a GDP trendline.

In [55]:
gdp_df = fred_req(api_key, 'GNPCA')

In [56]:
px.line(gdp_df, x="date", y="value", title="GDP Over Time", template="simple_white")

## Job Openings

Now I want to explore trends related to job openings. There are two sources available from the Fred API, Indeed and the Bureau of Labor Statistics (BLS).

### Indeed

Comparing all jobs to software development jobs.

In [57]:
software_dev_indeed_jobs_df = fred_req(api_key, 'IHLIDXUSTPSOFTDEVE')
software_dev_indeed_jobs_df['Job Category'] = 'Software Dev.'

In [58]:
all_indeed_jobs_df = fred_req(api_key, 'IHLIDXUS')
all_indeed_jobs_df['Job Category'] = 'All Jobs'

In [59]:
joined_df = pd.concat([software_dev_indeed_jobs_df, all_indeed_jobs_df])

In [60]:
px.line(joined_df, x="date", y="value", title="Inded Jobs Over Time", color="Job Category", template="simple_white")

## BLS Job Openings

For comparison, let's look at the BLS. There is a lot more data from this source.

In [61]:
all_jobs_df = fred_req(api_key, 'JTSJOL').assign(job_category="All Jobs")
info_jobs_df = fred_req(api_key, 'JTU5100JOL').assign(job_category="Info Jobs")
joined_df = pd.concat([all_jobs_df, info_jobs_df])

In [62]:
px.line(joined_df, x="date", y="value", title="BLS Jobs Over Time", color="job_category", template="simple_white")

Obviously hard to compare. But we can standardize the values so that they have the same mean and variance.

### Standardizing Values

In [63]:
merge_df = pd.merge(all_jobs_df, info_jobs_df, on='date')[['date', 'value_x', 'value_y']]\
            .rename(columns={'value_x': 'All Jobs', 'value_y': 'Info Jobs'})
merge_df = merge_df.set_index('date')
ss = StandardScaler()
ss.fit(merge_df)
merge_df = pd.DataFrame(ss.transform(merge_df))
all_jobs_df['value'] = merge_df[0]
info_jobs_df['value'] = merge_df[1]

In [64]:
joined_df = pd.concat([all_jobs_df, info_jobs_df])

In [65]:
px.line(joined_df, x="date", y="value", title="Indeed Jobs Over Time", color="job_category", template="simple_white")

## Forecasting & Visualizing Forecasts

Now I want to make viable forecasts, initially just using time-series methods, and be able to effectively visualize them.

In [66]:
# create lag terms from the target value
def make_lags(ts, lags, lead_time=1):
    return pd.concat(
        {
            f'y_lag_{i}': ts.shift(i)
            for i in range(lead_time, lags + lead_time)
        },
        axis=1)

In [67]:
forecast_df = all_jobs_df.set_index('date')

In [68]:
y = forecast_df['value']
X = make_lags(forecast_df['value'], lags=4).fillna(0)
X0 = X

In [69]:
# create multi steps for the target variable
# i.e., next 8 iterations (assuming the index is sorted by date/time)
def make_multistep_target(ts, steps):
    '''
    Create multiple step target variables. Output is a 
    pandas dataframe of all the steps listed as their
    own columns.

    Parameters:
        - ts (pd.Series): Target Series
        - steps (int): Number of Steps
    
    Returns:
        - pd.DataFrame
    '''
    return pd.concat(
        {f'y_step_{i + 1}': ts.shift(-i)
         for i in range(steps)},
         axis=1
    )

In [70]:
# Eight-week forecast
y = make_multistep_target(y, steps=8).dropna()

# Shifting has created indexes that don't match. Only keep times for
# which we have both targets and features.
y, X = y. align(X, join="inner", axis=0)

In [71]:
# Create splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

model = LinearRegression()
model.fit(X_train, y_train)

y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=y.columns)
y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)

In [72]:
y_pred1_dates = np.array(y_pred.index[:8])
y_pred1 = np.array(y_pred.loc['2017-09-01'])

pred1_df = pd.DataFrame(list(zip(y_pred1_dates, y_pred1)), columns=['date', 'value'])
pred1_df.set_index('date', inplace=True)

In [73]:
import dateutil

In [74]:
view_df = forecast_df.loc['2015-01-01':pd.DataFrame(y_pred1_dates)[0][0] - dateutil.relativedelta.relativedelta(months=1)]
compare_df = forecast_df.loc[pd.DataFrame(y_pred1_dates)[0][0] - dateutil.relativedelta.relativedelta(months=1):pd.DataFrame(y_pred1_dates)[0][7]]

In [75]:
import plotly.graph_objects as go 

def forecast_plot(view_df, pred_df, compare_df):

    # Create traces
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=np.array(view_df.index), y=np.array(view_df.value),
                             mode='lines',
                             name='history'))
    fig.add_trace(go.Scatter(x=np.array(pred_df.index), y=np.array(pred_df.value),
                             mode='lines',
                             name='predictions'))
    fig.add_trace(go.Scatter(x=np.array(compare_df.index), y=np.array(compare_df.value),
                             mode='lines',
                             name='actual'))
    
    # Edit the layout
    fig.update_layout(title='Testing the Jobs Index Forecast',
                      xaxis_title='Month',
                      yaxis_title='Job Index',
                      template='simple_white')
    
    fig.show()

In [76]:
forecast_plot(view_df, pred1_df, compare_df)

### Live Forecast

Now since we can see how it performs, let's look at what the forecast for today is.

In [78]:
X0.tail(1)

Unnamed: 0_level_0,y_lag_1,y_lag_2,y_lag_3,y_lag_4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10-01,1.764691,1.82855,1.577893,1.684325


In [79]:
from datetime import date
from dateutil.relativedelta import relativedelta

def forecast_transform(x_values, row_num: int):

    forecasting_values = model.predict(np.array(X0.iloc[-row_num]).reshape(1, -1))[0]
    forecasting_dates = [X0.iloc[[-row_num]].index[0] + relativedelta(months=i) for i in range(1,9)]
    forecasting_df = pd.DataFrame(list(zip(forecasting_dates, forecasting_values)), columns=['date', 'value'])
    forecasting_df.set_index('date', inplace=True)

    return forecasting_df

In [82]:
all_forecast = pd.DataFrame(columns=['value', 'forecast'])
all_forecast.index.names = ['date']
for i in range(1, 6):
    f_df = forecast_transform(X0, i)
    f_df['forecast'] = i
    all_forecast = pd.concat([all_forecast, f_df])

all_forecast.reset_index(inplace=True)


X does not have valid feature names, but LinearRegression was fitted with feature names


The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names



In [85]:
forecast_start = X0.tail(1).index[0] - relativedelta(months=1)
forecast_start = str(forecast_start.date())
view_df = forecast_df.loc['2015-01-01':]

In [93]:
fig = px.line(all_forecast[all_forecast['date']>=view_df.iloc[[-1]].index[0]],
              x='date',
              y='value',
              color='forecast',
              template='simple_white')

fig.add_trace(go.Scatter(x=np.array(view_df.index), y=np.array(view_df.value),
                         mode="lines",
                         name="history",
                         line_color="black"))

fig.add_vline(x=view_df.iloc[[-1]].index[0], line_width=1, line_dash="dash", line_color="green")
fig.update_layout(title="Forecasting Job Openings", xaxis_title="Date")
fig.update_layout(margin=dict(l=60, r=60, t=40, b=70), paper_bgcolor="floralwhite")

fig.show()