# Predictive modeling notebook

This notebook contains the modeling approach using topic distributions.

## Topic distributions and sentiment

In [51]:
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [52]:
inflation = pd.read_csv("../dataset/inflation_clean.csv")
unemp = pd.read_csv("../dataset/unemp_clean.csv")

In [53]:
inflation.rename(columns={"Unnamed: 0": "date"}, inplace=True)
unemp.rename(columns={"Unnamed: 0": "date"}, inplace=True)

In [54]:
unemp["date"] = [datetime.strptime(date_str, "%YM%m") for date_str in unemp["date"]]
inflation["date"] = [
    datetime.strptime(date_str, "%YM%m") for date_str in inflation["date"]
]

# Prediction for unemployment

In [None]:
X_data = pd.read_csv('../dataset/df_sent_topic.csv')

In [59]:
X_train = X_data.iloc[:137, 1:]
X_test = X_data.iloc[137:, 1:]
y_train = unemp_y_deep.iloc[:137]["bin_out"]
y_test = unemp_y_deep.iloc[137:]["bin_out"]

## Logistic Regression

In [None]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

In [None]:
# Hyperparameter tuning
grid = {
    "penalty": ['none', "l1", "l2", 'elasticnet'],
    "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C': [100, 10, 1.0, 0.1, 0.01]
}

clf_unemp = RandomizedSearchCV(log_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_unemp = GridSearchCV(log_mod, grid, verbose=1, n_jobs=-1)

clf_unemp.fit(X_train, y_train)

In [None]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

## Support Vector Classifier

In [None]:
svc_mod = SVC()

svc_mod.fit(X_train, y_train)

In [None]:
# Hyperparameter tuning
# Start with kernel
grid = {
    "kernels": ['linear', 'poly', 'rbf', 'sigmoid']
}

clf_unemp = RandomizedSearchCV(svc_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_unemp = GridSearchCV(xgb_mod, grid, verbose=1, n_jobs=-1)

clf_unemp.fit(X_train, y_train)

In [None]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

## Random Forest Classifier

In [None]:
rfc_mod = RandomForestClassifier()

rfc_mod.fit(X_train, y_train)

In [None]:
# Hyperparameter tuning
grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

clf_unemp = RandomizedSearchCV(rfc_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_unemp = GridSearchCV(rfc_mod, grid, verbose=1, n_jobs=-1)

clf_unemp.fit(X_train, y_train)

In [None]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

## XGBoost

In [60]:
xgb_mod = xgb.XGBClassifier(
    random_state=0, use_label_encoder=False, eval_metric="logloss"
)

xgb_mod.fit(X_train, y_train)

In [62]:
# Hyperparameter tuning
grid = {
    "eta": [0.1, 0.2, 0.3],
    "min_child_weight": [5, 10],
    "gamma": [0, 1.0, 10],
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "max_depth": np.arange(3, 10, 2),
    "scale_pos_weight": [5, 10],
    "reg_alpha": [0, 1, 10.0, 100.0],
    "reg_lambda": [0, 1, 10.0, 100.0],
}

clf_unemp = RandomizedSearchCV(xgb_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_unemp = GridSearchCV(xgb_mod, grid, verbose=1, n_jobs=-1)

clf_unemp.fit(X_train, y_train)

Fitting 5 folds for each of 5000 candidates, totalling 25000 fits


In [63]:
y_pred = clf_unemp.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

              precision    recall  f1-score   support

           0       0.67      0.14      0.24        28
           1       0.56      0.94      0.70        32

    accuracy                           0.57        60
   macro avg       0.61      0.54      0.47        60
weighted avg       0.61      0.57      0.48        60



0.5333333333333333

### Prediction for inflation

In [64]:
X_train = kpi_x_deep.iloc[:139, 1:]
X_test = kpi_x_deep.iloc[139:, 1:]
y_train = kpi_y_deep.iloc[:139]["bin_out"]
y_test = kpi_y_deep.iloc[139:]["bin_out"]

## Logistic Regression

In [None]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

In [None]:
# Hyperparameter tuning
grid = {
    "penalty": ['none', "l1", "l2", 'elasticnet'],
    "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C': [100, 10, 1.0, 0.1, 0.01]
}

clf_inf = RandomizedSearchCV(log_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_inf = GridSearchCV(log_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)

In [None]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

## Support Vector Classifier

In [None]:
svc_mod = SVC()

svc_mod.fit(X_train, y_train)

In [None]:
# Hyperparameter tuning
# Start with kernel
grid = {
    "kernels": ['linear', 'poly', 'rbf', 'sigmoid']
}

clf_inf = RandomizedSearchCV(svc_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_inf = GridSearchCV(svc_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)

In [None]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

## Random Forest Classifier

In [None]:
rfc_mod = RandomForestClassifier()

rfc_mod.fit(X_train, y_train)

In [None]:
# Hyperparameter tuning
grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

clf_inf = RandomizedSearchCV(rfc_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_inf = GridSearchCV(rfc_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)

In [None]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

## XGBoost

In [65]:
xgb_mod = xgb.XGBClassifier(
    random_state=0, use_label_encoder=False, eval_metric="logloss"
)

xgb_mod.fit(X_train, y_train)

In [66]:
# Hyperparameter tuning
grid = {
    "eta": [0.1, 0.2, 0.3],
    "min_child_weight": [5, 10],
    "gamma": [0, 1.0, 10],
    "subsample": np.arange(0.5, 1, 0.1),
    "colsample_bytree": np.arange(0.5, 1, 0.1),
    "max_depth": np.arange(3, 10, 2),
    "scale_pos_weight": [5, 10],
    "reg_alpha": [0, 1, 10.0, 100.0],
    "reg_lambda": [0, 1, 10.0, 100.0],
}

clf_inf = RandomizedSearchCV(xgb_mod, grid, verbose=1, n_iter=5000, n_jobs=-1)

# clf_inf = GridSearchCV(xgb_mod, grid, verbose=1, n_jobs=-1)

clf_inf.fit(X_train, y_train)

Fitting 5 folds for each of 5000 candidates, totalling 25000 fits


In [67]:
y_pred = clf_inf.predict(X_test)

results = classification_report(y_test, y_pred)

print(results)

y_test.mean()

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.73      1.00      0.85        44

    accuracy                           0.73        60
   macro avg       0.37      0.50      0.42        60
weighted avg       0.54      0.73      0.62        60



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.7333333333333333