# Predictive modeling notebook

This notebook contains the modeling approach using topic distributions and sentiments as well as the deep learning approach using word embeddings.

## Topic distributions and sentiment

In [76]:
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [19]:
inflation = pd.read_csv("../dataset/inflation_clean.csv")
unemp = pd.read_csv("../dataset/unemp_clean.csv")

In [20]:
inflation.rename(columns={"Unnamed: 0": "date"}, inplace=True)
unemp.rename(columns={"Unnamed: 0": "date"}, inplace=True)

In [21]:
unemp["date"] = [datetime.strptime(date_str, "%YM%m") for date_str in unemp["date"]]
inflation["date"] = [
    datetime.strptime(date_str, "%YM%m") for date_str in inflation["date"]
]

## Deep learning approach

In [63]:
import pandas as pd
import xgboost as xgb

In [22]:
df_deep = pd.read_csv("../dataset/emb_down.csv")

Here we match the embeddings df to the respective outputs as we have different time scales for inflation and unemployment measures

In [45]:
unemp_x_deep = df_deep[:197]
kpi_x_deep = df_deep[:199]
unemp_y_deep = unemp[7:204]
kpi_y_deep = inflation[10:]

In [51]:
kpi_y_deep["bin_out"] = [1 if rate > 0 else 0 for rate in kpi_y_deep["Inflation"]]
unemp_y_deep["bin_out"] = [1 if rate > 0 else 0 for rate in unemp_y_deep["Delta"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kpi_y_deep['bin_out'] = [1 if rate > 0 else 0 for rate in kpi_y_deep['Inflation']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unemp_y_deep['bin_out'] = [1 if rate > 0 else 0 for rate in unemp_y_deep['Delta']]


### Prediction for unemployment

In [59]:
X_train = unemp_x_deep.iloc[:137, 1:]
X_test = unemp_x_deep.iloc[137:, 1:]
y_train = unemp_y_deep.iloc[:137]["bin_out"]
y_test = unemp_y_deep.iloc[137:]["bin_out"]

In [64]:
xgb_mod = xgb.XGBClassifier(
    random_state=0, use_label_encoder=False, eval_metric="logloss"
)

xgb_mod.fit(X_train, y_train)

In [86]:
# Hyperparameter tuning
grid = {
    "eta": [0.1, 0.2, 0.3],
    "min_child_weight": [5, 10],
    "gamma": [0, 1.0, 10],
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "max_depth": np.arange(3, 10, 2),
    "scale_pos_weight": [5, 10],
    "reg_alpha": [0, 1, 10.0, 100.0],
    "reg_lambda": [0, 1, 10.0, 100.0],
}

# clf = RandomizedSearchCV(xgb_mod, grid, verbose=1, n_iter=100000, n_jobs=-1)

clf = GridSearchCV(xgb_mod, grid, verbose=1, n_jobs=-1)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 57600 candidates, totalling 288000 fits


In [87]:
y_pred = clf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

              precision    recall  f1-score   support

           0       0.67      0.14      0.24        28
           1       0.56      0.94      0.70        32

    accuracy                           0.57        60
   macro avg       0.61      0.54      0.47        60
weighted avg       0.61      0.57      0.48        60



In [69]:
y_pred = xgb_mod.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

              precision    recall  f1-score   support

           0       0.56      0.18      0.27        28
           1       0.55      0.88      0.67        32

    accuracy                           0.55        60
   macro avg       0.55      0.53      0.47        60
weighted avg       0.55      0.55      0.49        60



0.5333333333333333