# Predictive modeling notebook

This notebook contains the modeling approach using topic distributions.

## Topic distributions and sentiment

In [340]:
from datetime import datetime

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller, kpss
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chisquare
import pickle

Import data and setup to match

In [145]:
inflation = pd.read_csv("../dataset/inflation_clean.csv")
unemp = pd.read_csv("../dataset/unemp_clean.csv")

In [146]:
inflation.rename(columns={"Unnamed: 0": "date"}, inplace=True)
unemp.rename(columns={"Unnamed: 0": "date"}, inplace=True)

In [147]:
unemp["date"] = [datetime.strptime(date_str, "%YM%m") for date_str in unemp["date"]]
inflation["date"] = [
    datetime.strptime(date_str, "%YM%m") for date_str in inflation["date"]
]

In [148]:
df_topic_q = pd.read_csv('../dataset/topic_q_downsampled.csv', index_col='date')
df_topic_a = pd.read_csv('../dataset/topic_a_downsampled.csv', index_col='date')
df_sent = pd.read_csv('../dataset/sent_downsampled.csv', index_col='date')

In [312]:
df_sent_topic = pd.merge(df_topic_a.iloc[:,1:], df_sent['answers'], left_on='date', right_on='date')

In [344]:
df_sent_topic = pd.concat([df_topic_q.iloc[:,1:],
                           df_topic_a.iloc[:,1:],
                           df_sent[['questions', 'answers']]],
                           axis=1)

In [345]:
df_sent_topic.columns = ['topic_1_q', 'topic_2_q', 'topic_3_q', 'topic_4_q',
                         'topic_5_q', 'topic_1_a', 'topic_2_a', 'topic_3_a',
                         'topic_4_a', 'topic_5_a', 'sent_q', 'sent_a']

### Stationarity

In [189]:
# Augmented Dickey-Fuller Test (ADF Test)/unit root test
def adf_test(ts, signif=0.05):
    dftest = adfuller(ts, autolag='AIC')
    adf = pd.Series(dftest[0:4], index=['Test Statistic','p-value','# Lags','# Observations'])
    for key,value in dftest[4].items():
       adf['Critical Value (%s)'%key] = value
    
    p = adf['p-value']
    if p > signif:
        print(f'Series is Non-Stationary')

# KPSS
def kpss_test(ts):
    kpsstest = kpss(ts, regression='c')
    kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic','p-value','Lags Used'])
    if kpss_output['p-value'] > .05:
        print('Stationary')

In [190]:
def optimise_pls_cv(X, y, n_comp):
    # Define PLS object
    pls = PLSRegression(n_components=n_comp)

    # Cross-validation
    y_cv = cross_val_predict(pls, X, y, cv=10)

    # Calculate scores
    r2 = r2_score(y, y_cv)
    mse = mean_squared_error(y, y_cv)
    rpd = y.std()/np.sqrt(mse)
    
    return (y_cv, r2, mse, rpd)

# Prediction for unemployment

Stationarity measures

In [195]:
for col in df_sent_topic.columns:
    print(col)
    adf_test(df_sent_topic[col])

topic_1_q
Series is Non-Stationary
topic_2_q
Series is Non-Stationary
topic_3_q
Series is Non-Stationary
topic_4_q
Series is Non-Stationary
topic_5_q
topic_1_a
Series is Non-Stationary
topic_2_a
topic_3_a
Series is Non-Stationary
topic_4_a
Series is Non-Stationary
topic_5_a
Series is Non-Stationary
sent_q
Series is Non-Stationary
sent_a


In [196]:
for col in df_sent_topic.columns:
    print(col)
    kpss_test(df_sent_topic[col])

topic_1_q
topic_2_q
topic_3_q
topic_4_q
topic_5_q
topic_1_a
topic_2_a
topic_3_a
topic_4_a
topic_5_a
sent_q
sent_a


look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.

look-up table. The actual p-value is smaller than the p-value returned.



In [346]:
for col in df_sent_topic.columns:
    df_sent_topic[col] = df_sent_topic[col] - df_sent_topic[col].shift(1)

Match X and y to have same length and for X to correspond to y in next month

In [347]:
X_unemp = df_sent_topic[1:-4]
y_unemp = unemp[8:-2]
y_unemp['binary'] = [1 if x > 0 else 0 for x in y_unemp['Delta']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_unemp['binary'] = [1 if x > 0 else 0 for x in y_unemp['Delta']]


In [348]:
vif_data = pd.DataFrame()
vif_data["feature"] = X_unemp.columns
vif_data["VIF"] = [variance_inflation_factor(X_unemp.values, i) 
                   for i in range(len(X_unemp.columns))]

vif_data

Unnamed: 0,feature,VIF
0,topic_1_q,2.662634
1,topic_2_q,5.977584
2,topic_3_q,4.090792
3,topic_4_q,6.262976
4,topic_5_q,2.005382
5,topic_1_a,5.977798
6,topic_2_a,2.728271
7,topic_3_a,2.509712
8,topic_4_a,10.611913
9,topic_5_a,2.390241


In [200]:
pls_results = []
for n_comp in range(2, len(X_unemp.columns)):
    out = optimise_pls_cv(X_unemp, y_unemp['binary'], n_comp=n_comp)
    pls_results.append(out)

In [201]:
mse_list = [result[2] for result in pls_results]
r2_list = [result[1] for result in pls_results]

print(mse_list)
r2_list

[0.27993267916387193, 0.28671523985824976, 0.28983211060177877, 0.2915458864307816, 0.2897360394514872, 0.2923585508988533, 0.29179669991878115, 0.2923456951232676, 0.29256968950664464, 0.29239055624089516]


[-0.1226530747217145,
 -0.14985412406248266,
 -0.16235414561832462,
 -0.1692271399023808,
 -0.16196885808208883,
 -0.1724862815878847,
 -0.17023301222255927,
 -0.17243472427763296,
 -0.1733330402011961,
 -0.17261463707591873]

In [349]:
X_train = X_unemp.iloc[:137]
X_test = X_unemp.iloc[137:]
y_train = y_unemp['binary'].iloc[:137]
y_test = y_unemp['binary'].iloc[137:]

In [350]:
pls = PLSRegression(n_components=2)
pls.fit(X_train, y_train)

In [351]:
X_train = pls.transform(X_train)
X_test = pls.transform(X_test)

## Logistic Regression

In [227]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

In [229]:
# Hyperparameter tuning
grid = {
    # 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    "solver": ['newton-cg'],
    "penalty": ['none', "l2"],
    'C': [100, 10, 1.0, 0.1, 0.01]
}

clf_unemp = GridSearchCV(log_mod, grid, verbose=1, n_jobs=2)

clf_unemp.fit(X_train, y_train)
clf_unemp.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [230]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.59      0.36      0.44        28
           1       0.57      0.77      0.66        31

    accuracy                           0.58        59
   macro avg       0.58      0.57      0.55        59
weighted avg       0.58      0.58      0.56        59

0.576271186440678


0.5254237288135594

## Support Vector Classifier

In [231]:
svc_mod = SVC()

svc_mod.fit(X_train, y_train)

In [232]:
# Hyperparameter tuning
# Start with kernel
grid = {
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    # "kernel": ['poly'],
    # 'C': [0.1, 1, 10, 100, 1000],
    # 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

clf_unemp = GridSearchCV(svc_mod, grid, verbose=1, n_jobs=-1)

clf_unemp.fit(X_train, y_train)

clf_unemp.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [233]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.60      0.32      0.42        28
           1       0.57      0.81      0.67        31

    accuracy                           0.58        59
   macro avg       0.58      0.56      0.54        59
weighted avg       0.58      0.58      0.55        59

0.576271186440678


0.5254237288135594

## Random Forest Classifier

In [234]:
rfc_mod = RandomForestClassifier(random_state=0)

rfc_mod.fit(X_train, y_train)

In [235]:
# Hyperparameter tuning
grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)],
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

clf_unemp = GridSearchCV(rfc_mod, grid, verbose=1, n_jobs=4)

clf_unemp.fit(X_train, y_train)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


  warn(


In [236]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)
print(accuracy_score(y_test, y_pred))

print(results)

y_test.mean()

0.5084745762711864
              precision    recall  f1-score   support

           0       0.47      0.29      0.36        28
           1       0.52      0.71      0.60        31

    accuracy                           0.51        59
   macro avg       0.50      0.50      0.48        59
weighted avg       0.50      0.51      0.49        59



0.5254237288135594

## XGBoost

In [352]:
xgb_mod = xgb.XGBClassifier(
    random_state=0, use_label_encoder=False, eval_metric="logloss",
    tree_method='gpu_hist', verbosity=2
)

xgb_mod.fit(X_train, y_train)

In [353]:
# Hyperparameter tuning
grid = {
    "eta": [0.1, 0.2, 0.3],
    "min_child_weight": [5, 10],
    "gamma": [0, 1.0, 10],
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "max_depth": np.arange(3, 10, 2),
    "scale_pos_weight": [0.5, 1, 2],
    "reg_alpha": [0, 1, 10.0, 100.0],
    "reg_lambda": [0, 1, 10.0, 100.0],
}

clf_unemp = GridSearchCV(xgb_mod, grid, verbose=2, n_jobs=-1)

model = clf_unemp.fit(X_train, y_train)

with open('../models/xgb_all_unemp.pkl', 'wb') as f:
    pickle.dump(model, f)

Fitting 5 folds for each of 86400 candidates, totalling 432000 fits


In [327]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)
print(accuracy_score(y_test, y_pred))

print(results)

y_test.mean()

0.5932203389830508
              precision    recall  f1-score   support

           0       0.70      0.25      0.37        28
           1       0.57      0.90      0.70        31

    accuracy                           0.59        59
   macro avg       0.64      0.58      0.53        59
weighted avg       0.63      0.59      0.54        59



0.5254237288135594

## Unemployment chi-squared

Below we test whether the accuracy of the model above is beteter during times of high/low volatility.

In [None]:
y_correct = [pred == y_test[i] for i, pred in enumerate(y_pred)]

In [None]:
chisquare()

# Prediction for inflation

In [328]:
X_inflation = df_sent_topic[1:-1]
y_inflation = inflation[8:]
y_inflation['binary'] = [1 if x > 0 else 0 for x in y_inflation['Delta']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_inflation['binary'] = [1 if x > 0 else 0 for x in y_inflation['Delta']]


In [329]:
vif_data = pd.DataFrame()
vif_data["feature"] = X_inflation.columns
vif_data["VIF"] = [variance_inflation_factor(X_inflation.values, i) 
                   for i in range(len(X_inflation.columns))]

vif_data

Unnamed: 0,feature,VIF
0,topic_1_q,2.652063
1,topic_2_q,5.979594
2,topic_3_q,4.112309
3,topic_4_q,6.241309
4,topic_5_q,2.007986
5,topic_1_a,5.980827
6,topic_2_a,2.724439
7,topic_3_a,2.515178
8,topic_4_a,10.63689
9,topic_5_a,2.375798


In [330]:
X_train = X_inflation.iloc[:139, 1:]
X_test = X_inflation.iloc[139:, 1:]
y_train = y_inflation['binary'].iloc[:139]
y_test = y_inflation['binary'].iloc[139:]

In [331]:
pls_results = []
for n_comp in range(2, len(X_inflation.columns)):
    out = optimise_pls_cv(X_inflation, y_inflation['binary'], n_comp=n_comp)
    pls_results.append(out)

mse_list = [result[2] for result in pls_results]
r2_list = [result[1] for result in pls_results]

print(mse_list)
r2_list

[0.24534421611983992, 0.24450759050087392, 0.24268133747499906, 0.24701886715022273, 0.2467190361916243, 0.24764726101532658, 0.24780280194493617, 0.24759626533294432, 0.24794650206469884, 0.248364232275628]


[-0.12452272020390986,
 -0.1206880892853135,
 -0.11231755154484246,
 -0.13219839791851506,
 -0.13082413798894854,
 -0.13507860919767922,
 -0.13579152312747889,
 -0.13484487308448267,
 -0.13645016530834964,
 -0.13836481045684557]

In [332]:
pls = PLSRegression(n_components=4)
pls.fit(X_train, y_train)

In [333]:
X_train = pls.transform(X_train)
X_test = pls.transform(X_test)

## Logistic Regression

In [247]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

In [249]:
# Hyperparameter tuning
grid = {
    # 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    "solver": ['newton-cg'],
    "penalty": ['none', "l2"],
    'C': [100, 10, 1.0, 0.1, 0.01]
}

clf_inf = GridSearchCV(log_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)
clf_inf.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [250]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.73      1.00      0.85        44

    accuracy                           0.73        60
   macro avg       0.37      0.50      0.42        60
weighted avg       0.54      0.73      0.62        60

0.7333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.7333333333333333

## Support Vector Classifier

In [251]:
svc_mod = SVC()

svc_mod.fit(X_train, y_train)

In [252]:
# Hyperparameter tuning
# Start with kernel
grid = {
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    # "kernel": ['poly'],
    # 'C': [0.1, 1, 10, 100, 1000],
    # 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

clf_inf = GridSearchCV(svc_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)
clf_inf.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [253]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.73      1.00      0.85        44

    accuracy                           0.73        60
   macro avg       0.37      0.50      0.42        60
weighted avg       0.54      0.73      0.62        60

0.7333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.7333333333333333

## Random Forest Classifier

In [254]:
rfc_mod = RandomForestClassifier(random_state=0)

rfc_mod.fit(X_train, y_train)

In [255]:
# Hyperparameter tuning
grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)],
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

clf_inf = GridSearchCV(rfc_mod, grid, verbose=1, n_jobs=6)

clf_inf.fit(X_train, y_train)

Fitting 5 folds for each of 2160 candidates, totalling 10800 fits


  warn(


In [256]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.14      0.12      0.13        16
           1       0.70      0.73      0.71        44

    accuracy                           0.57        60
   macro avg       0.42      0.43      0.42        60
weighted avg       0.55      0.57      0.56        60

0.5666666666666667


0.7333333333333333

## XGBoost

In [334]:
xgb_mod = xgb.XGBClassifier(
    random_state=0, use_label_encoder=False, eval_metric="logloss",
    tree_method='gpu_hist'
)

xgb_mod.fit(X_train, y_train)

In [337]:
# Hyperparameter tuning
grid = {
    "eta": [0.1, 0.2, 0.3],
    "min_child_weight": [5, 10],
    "gamma": [0, 1.0, 10],
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "max_depth": np.arange(3, 10, 2),
    "scale_pos_weight": [0.5, 1, 2],
    "reg_alpha": [0, 1, 10.0, 100.0],
    "reg_lambda": [0, 1, 10.0, 100.0],
}

# clf_inf = GridSearchCV(xgb_mod, grid, verbose=1, n_jobs=6)

clf_inf = RandomizedSearchCV(xgb_mod, grid, verbose=1, n_jobs=4,
                             random_state=0, n_iter=1000)

model = clf_inf.fit(X_train, y_train)

with open('../models/xgb_all.pkl', 'wb') as f:
    pickle.dump(model, f)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


In [343]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)
print(accuracy_score(y_test, y_pred))

y_test.mean()

              precision    recall  f1-score   support

           0       0.25      0.06      0.10        16
           1       0.73      0.93      0.82        44

    accuracy                           0.70        60
   macro avg       0.49      0.50      0.46        60
weighted avg       0.60      0.70      0.63        60

0.7


0.7333333333333333

## Inflation chi-squared

Below we test whether the accuracy of the model above is beteter during times of high/low volatility.

In [None]:
y_correct = [pred == y_test[i] for i, pred in enumerate(y_pred)]

In [None]:
chisquare()