# Predictive modeling notebook

This notebook contains the modeling approach using topic distributions.

## Topic distributions and sentiment

In [87]:
from datetime import datetime

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, kpss
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chisquare

Import data and setup to match

In [3]:
inflation = pd.read_csv("../dataset/inflation_clean.csv")
unemp = pd.read_csv("../dataset/unemp_clean.csv")

In [4]:
inflation.rename(columns={"Unnamed: 0": "date"}, inplace=True)
unemp.rename(columns={"Unnamed: 0": "date"}, inplace=True)

In [5]:
unemp["date"] = [datetime.strptime(date_str, "%YM%m") for date_str in unemp["date"]]
inflation["date"] = [
    datetime.strptime(date_str, "%YM%m") for date_str in inflation["date"]
]

In [6]:
df_topic_q = pd.read_csv('../dataset/topic_q_downsampled.csv', index_col='date')
# df_topic_a = pd.read_csv('../dataset/topic_a_downsampled.csv')
df_sent = pd.read_csv('../dataset/sent_downsampled.csv', index_col='date')

In [109]:
df_sent_topic = pd.merge(df_topic_q.iloc[:,1:], df_sent['questions'], left_on='date', right_on='date')

### Stationarity

In [47]:
# Augmented Dickey-Fuller Test (ADF Test)/unit root test
def adf_test(ts, signif=0.05):
    dftest = adfuller(ts, autolag='AIC')
    adf = pd.Series(dftest[0:4], index=['Test Statistic','p-value','# Lags','# Observations'])
    for key,value in dftest[4].items():
       adf['Critical Value (%s)'%key] = value
    
    p = adf['p-value']
    if p > signif:
        print(f'Series is Non-Stationary')

# KPSS
def kpss_test(ts):
    kpsstest = kpss(ts, regression='c')
    kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic','p-value','Lags Used'])
    if kpss_output['p-value'] > .05:
        print('Stationary')

In [88]:
def optimise_pls_cv(X, y, n_comp):
    # Define PLS object
    pls = PLSRegression(n_components=n_comp)

    # Cross-validation
    y_cv = cross_val_predict(pls, X, y, cv=10)

    # Calculate scores
    r2 = r2_score(y, y_cv)
    mse = mean_squared_error(y, y_cv)
    rpd = y.std()/np.sqrt(mse)
    
    return (y_cv, r2, mse, rpd)

# Prediction for unemployment

Stationarity measures

In [110]:
for col in df_sent_topic.columns:
    print(col)
    adf_test(df_sent_topic[col])

topic_1
Series is Non-Stationary
topic_2
Series is Non-Stationary
topic_3
Series is Non-Stationary
topic_4
Series is Non-Stationary
topic_5
questions
Series is Non-Stationary


In [111]:
for col in df_sent_topic.columns:
    print(col)
    kpss_test(df_sent_topic[col])

topic_1
topic_2
topic_3
topic_4
topic_5
questions


look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.



In [112]:
for col in df_sent_topic.columns:
    df_sent_topic[col] = df_sent_topic[col] - df_sent_topic[col].shift(1)

Match X and y to have same length and for X to correspond to y in next month

In [113]:
X_unemp = df_sent_topic[1:-4]
y_unemp = unemp[8:-2]
y_unemp['binary'] = [1 if x > 0 else 0 for x in y_unemp['Delta']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_unemp['binary'] = [1 if x > 0 else 0 for x in y_unemp['Delta']]


In [114]:
vif_data = pd.DataFrame()
vif_data["feature"] = X_unemp.columns
vif_data["VIF"] = [variance_inflation_factor(X_unemp.values, i) 
                   for i in range(len(X_unemp.columns))]

vif_data

Unnamed: 0,feature,VIF
0,topic_1,1.624407
1,topic_2,5.003451
2,topic_3,2.481857
3,topic_4,4.730326
4,topic_5,1.390541
5,questions,1.320334


In [89]:
pls_results = []
for n_comp in range(2, len(X_unemp.columns)):
    out = optimise_pls_cv(X_unemp, y_unemp['binary'], n_comp=n_comp)
    pls_results.append(out)

In [95]:
mse_list = [result[2] for result in pls_results]
r2_list = [result[1] for result in pls_results]

print(mse_list)
r2_list

[0.2654161670712718, 0.26727855838832215, 0.26751336421568395, 0.2674778116467241, 0.26742859345411585]


[-0.0644354811786172,
 -0.07190448888670864,
 -0.07284616345231365,
 -0.07270358202532123,
 -0.07250619544141501]

In [102]:
pls = PLSRegression(n_components=2)
X_red = pls.fit_transform(X_unemp, y_unemp['Delta'])

In [103]:
X_red

(array([[-2.61830142e+00, -2.77558691e-01],
        [ 2.70233651e+00, -1.09726531e+00],
        [-2.09442992e+00,  3.17392476e-01],
        [-1.68475142e+00,  6.83209874e-02],
        [-3.54784026e-01,  1.11134326e+00],
        [ 1.44637856e+00,  2.36221475e-01],
        [-1.43388937e+00, -1.18282206e+00],
        [ 2.59282404e+00,  1.20964665e+00],
        [-2.45806604e+00,  1.23891780e+00],
        [ 1.58931613e+00, -2.91642448e+00],
        [-1.81980080e-01,  2.09012790e+00],
        [-1.00366156e+00,  5.62461370e-01],
        [-5.82843612e-01, -1.08952948e+00],
        [ 2.28765636e+00, -2.09121206e-01],
        [-1.19432090e-01, -2.57926504e-02],
        [-2.16791600e+00, -8.58107740e-01],
        [ 1.60784471e+00,  5.40659731e-02],
        [-2.47059045e-01,  4.64610340e-01],
        [ 9.91864923e-01,  1.17695412e+00],
        [-1.36311807e+00, -9.09685131e-01],
        [ 1.00415259e+00,  3.40419801e-01],
        [ 6.36295593e-02, -3.49148951e-01],
        [-5.44534104e-02,  4.367

In [72]:
X_train = X_unemp.iloc[:137]
X_test = X_unemp.iloc[137:]
y_train = y_unemp['binary'].iloc[:137]
y_test = y_unemp['binary'].iloc[137:]

## Logistic Regression

In [73]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

In [74]:
# Hyperparameter tuning
grid = {
    "penalty": ['none', "l2"],
    "solver": ['newton-cg'],
    'C': [100, 10, 1.0, 0.1, 0.01]
}

clf_unemp = GridSearchCV(log_mod, grid, verbose=1, n_jobs=2)

clf_unemp.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [75]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.54      0.25      0.34        28
           1       0.54      0.81      0.65        31

    accuracy                           0.54        59
   macro avg       0.54      0.53      0.50        59
weighted avg       0.54      0.54      0.50        59

0.5423728813559322


0.5254237288135594

## Support Vector Classifier

In [76]:
svc_mod = SVC()

svc_mod.fit(X_train, y_train)

In [78]:
# Hyperparameter tuning
# Start with kernel
grid = {
    # "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "kernel": ['sigmoid'],
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

clf_unemp = GridSearchCV(svc_mod, grid, verbose=1, n_jobs=-1)

clf_unemp.fit(X_train, y_train)

clf_unemp.best_estimator_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [79]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.54      0.25      0.34        28
           1       0.54      0.81      0.65        31

    accuracy                           0.54        59
   macro avg       0.54      0.53      0.50        59
weighted avg       0.54      0.54      0.50        59

0.5423728813559322


0.5254237288135594

## Random Forest Classifier

In [80]:
rfc_mod = RandomForestClassifier(random_state=0)

rfc_mod.fit(X_train, y_train)

In [81]:
# Hyperparameter tuning
grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)],
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

clf_unemp = GridSearchCV(rfc_mod, grid, verbose=1, n_jobs=4)

clf_unemp.fit(X_train, y_train)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


  warn(


In [82]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)
print(accuracy_score(y_test, y_pred))

print(results)

y_test.mean()

0.5084745762711864
              precision    recall  f1-score   support

           0       0.46      0.21      0.29        28
           1       0.52      0.77      0.62        31

    accuracy                           0.51        59
   macro avg       0.49      0.49      0.46        59
weighted avg       0.49      0.51      0.47        59



0.5254237288135594

## XGBoost

In [85]:
xgb_mod = xgb.XGBClassifier(
    random_state=0, use_label_encoder=False, eval_metric="logloss",
    tree_method='gpu_hist'
)

xgb_mod.fit(X_train, y_train)

In [86]:
# Hyperparameter tuning
grid = {
    "eta": [0.1, 0.2, 0.3],
    "min_child_weight": [5, 10],
    "gamma": [0, 1.0, 10],
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "max_depth": np.arange(3, 10, 2),
    "scale_pos_weight": [0.5, 1, 2],
    "reg_alpha": [0, 1, 10.0, 100.0],
    "reg_lambda": [0, 1, 10.0, 100.0],
}

clf_unemp = GridSearchCV(xgb_mod, grid, verbose=1, n_jobs=6)

clf_unemp.fit(X_train, y_train)

Fitting 5 folds for each of 86400 candidates, totalling 432000 fits


KeyboardInterrupt: 

In [63]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)
print(accuracy_score(y_test, y_pred))

print(results)

y_test.mean()

              precision    recall  f1-score   support

           0       0.67      0.14      0.24        28
           1       0.56      0.94      0.70        32

    accuracy                           0.57        60
   macro avg       0.61      0.54      0.47        60
weighted avg       0.61      0.57      0.48        60



0.5333333333333333

## Unemployment chi-squared

Below we test whether the accuracy of the model above is beteter during times of high/low volatility.

In [None]:
y_correct = [pred == y_test[i] for i, pred in enumerate(y_pred)]

In [None]:
chisquare()

# Prediction for inflation

In [20]:
X_inflation = df_sent_topic[:-1]
y_inflation = inflation[7:]
y_inflation['binary'] = [1 if x > 0 else 0 for x in y_inflation['Delta']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_inflation['binary'] = [1 if x > 0 else 0 for x in y_inflation['Delta']]


In [24]:
X_train = X_inflation.iloc[:140, 1:]
X_test = X_inflation.iloc[140:, 1:]
y_train = y_inflation['binary'].iloc[:140]
y_test = y_inflation['binary'].iloc[140:]

## Logistic Regression

In [25]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

In [27]:
# Hyperparameter tuning
grid = {
    "penalty": ['none', "l2"],
    "solver": ['newton-cg'],
    'C': [100, 10, 1.0, 0.1, 0.01]
}

clf_inf = RandomizedSearchCV(log_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_inf = GridSearchCV(log_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)
clf_inf.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [28]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.73      1.00      0.85        44

    accuracy                           0.73        60
   macro avg       0.37      0.50      0.42        60
weighted avg       0.54      0.73      0.62        60

0.7333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.7333333333333333

## Support Vector Classifier

In [31]:
svc_mod = SVC()

svc_mod.fit(X_train, y_train)

In [33]:
# Hyperparameter tuning
# Start with kernel
grid = {
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid']
}

clf_inf = GridSearchCV(svc_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)
clf_inf.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [34]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.73      1.00      0.85        44

    accuracy                           0.73        60
   macro avg       0.37      0.50      0.42        60
weighted avg       0.54      0.73      0.62        60

0.7333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.7333333333333333

## Random Forest Classifier

In [35]:
rfc_mod = RandomForestClassifier(random_state=0)

rfc_mod.fit(X_train, y_train)

In [37]:
# Hyperparameter tuning
grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)],
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

clf_inf = GridSearchCV(rfc_mod, grid, verbose=1, n_jobs=4)

clf_inf.fit(X_train, y_train)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


  warn(


In [38]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.73      0.98      0.83        44

    accuracy                           0.72        60
   macro avg       0.36      0.49      0.42        60
weighted avg       0.53      0.72      0.61        60

0.7166666666666667


0.7333333333333333

## XGBoost

In [65]:
xgb_mod = xgb.XGBClassifier(
    random_state=0, use_label_encoder=False, eval_metric="logloss"
)

xgb_mod.fit(X_train, y_train)

In [66]:
# Hyperparameter tuning
grid = {
    "eta": [0.1, 0.2, 0.3],
    "min_child_weight": [5, 10],
    "gamma": [0, 1.0, 10],
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "max_depth": np.arange(3, 10, 2),
    "scale_pos_weight": [0.5, 1, 2],
    "reg_alpha": [0, 1, 10.0, 100.0],
    "reg_lambda": [0, 1, 10.0, 100.0],
}

clf_inf = RandomizedSearchCV(xgb_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_inf = GridSearchCV(xgb_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)

Fitting 5 folds for each of 5000 candidates, totalling 25000 fits


In [67]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.73      1.00      0.85        44

    accuracy                           0.73        60
   macro avg       0.37      0.50      0.42        60
weighted avg       0.54      0.73      0.62        60



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.7333333333333333

## Inflation chi-squared

Below we test whether the accuracy of the model above is beteter during times of high/low volatility.

In [None]:
y_correct = [pred == y_test[i] for i, pred in enumerate(y_pred)]

In [None]:
chisquare()