In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from utils.load_data import load_data
from utils.preprocessing import preprocess_data
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from postprocessing.arima import postprocess_arima
from sklearn.metrics import mean_squared_error

# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the local helper modules are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Data locations and file-name prefix.
TRENDS_FOLDER = 'data/google_trends/'
GDP_FOLDER = 'data/gdp/'
DATA_PREFIX = 'trends_data_by_topic_'

EPS = 1e-15  # forwarded to preprocess_data as `epsilon`
SEED = 42

# Apply the seed: it was previously defined but never used, so the network's
# random weight initialisation differed on every "Restart & Run All".
# torch is seeded first so the cell does not end on an expression that returns
# a value (torch.manual_seed returns a Generator, which Jupyter would display).
torch.manual_seed(SEED)
np.random.seed(SEED)

## Loading and Preprocessing Data

### Loading Google Trends

In [2]:
# Load the raw dataset and preprocess it into the feature frame X, the target y
# and the per-row country labels. EPS is forwarded as `epsilon`.
# NOTE(review): y_mean / y_std are presumably the statistics used to standardise
# y -- confirm against utils.preprocessing.
data = load_data()
X, y, countries, y_mean, y_std = preprocess_data(data=data, epsilon=EPS)
# Show the shapes as the cell output.
X.shape, y.shape

((550, 102), (550,))

In [3]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,date,Expense_average,Research_and_development_average,Capital_expenditure_average,Business_average,Cost_average,Tax_average,Financial_capital_average,Investment_average,Gross_domestic_product_average,...,Artificial_intelligence_average,International_Financial_Reporting_Standards_average,Employment_average,country_Canada,country_Germany,country_Japan,country_Korea,country_Switzerland,country_United Kingdom,country_United States
2,-1.771126,2.782184,-2.41718,-1.69629,1.42637,-1.290762,1.327271,-0.604001,2.530535,-3.273765,...,-0.043561,-1.71091,0.900544,-0.415203,-0.415203,-0.365755,-0.412193,2.404079,-0.415203,-0.415203
1227,-1.771126,-0.446089,2.210188,-1.69629,0.516251,0.63696,0.313333,-0.604001,0.875986,3.790689,...,-0.336456,-1.71091,1.839678,-0.415203,-0.415203,-0.365755,2.421635,-0.415203,-0.415203,-0.415203
492,-1.771126,-1.009755,2.75105,-1.69629,1.718909,-1.037115,-0.773029,1.073548,2.594171,2.276877,...,-0.277877,0.745672,2.102636,-0.415203,-0.415203,-0.365755,-0.412193,-0.415203,2.404079,-0.415203
1472,-1.771126,-0.036149,3.231815,0.589877,1.6539,-0.682008,1.399695,0.082269,1.70326,0.763066,...,-0.453614,-0.598496,1.689417,-0.415203,-0.415203,-0.365755,-0.412193,-0.415203,-0.415203,2.404079
247,-1.771126,1.398639,0.707796,0.793092,1.231345,-1.59514,1.822169,-0.604001,1.957806,0.618893,...,-0.395035,0.65297,0.862979,-0.415203,2.404079,-0.365755,-0.412193,-0.415203,-0.415203,-0.415203


In [4]:
# Country labels returned by preprocess_data (pandas truncates the display).
countries

2          Switzerland
1227             Korea
492     United Kingdom
1472     United States
247            Germany
             ...      
732     United Kingdom
487            Germany
242        Switzerland
1222            Canada
1712     United States
Name: country, Length: 550, dtype: object

## Simple Prediction Model

### We start with a simple regression model

In [5]:
# Fraction (not a percentage) of rows used for training; the split cell takes
# these rows from the start of X and leaves the remainder for validation.
percent_train = 0.85
# Number of training rows, truncated to an integer.
number_train = int(len(X) * percent_train)

In [6]:
# Positional split: the first `number_train` rows are the training set and the
# remaining rows are the validation set. np.split with a single cut index
# returns exactly those two consecutive pieces.
x_train, x_valid = np.split(X.values, [number_train])
y_train, y_valid = np.split(y.values, [number_train])
country_train, country_valid = np.split(countries.values, [number_train])

# No explicit bias column is appended to x_train / x_valid.

In [7]:
# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# falling back to the CPU when neither is available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [8]:
class NeuralNetwork(nn.Module):
    """Fully connected regression network with ReLU activations.

    Args:
        num_features: Size of the last dimension of the input tensor.
        hidden_sizes: Widths of the hidden layers, in order. The default
            ``(100, 20)`` reproduces the previously hard-coded architecture.

    The final layer always has a single output unit, so a ``(N, num_features)``
    input produces a ``(N, 1)`` output.
    """

    def __init__(self, num_features, hidden_sizes=(100, 20)):
        super().__init__()
        # The former ``self.flatten = nn.Flatten()`` was registered but never
        # applied in forward(); it has no parameters, so dropping it leaves the
        # state_dict and the random initialisation unchanged.
        layers = []
        in_dim = num_features
        for width in hidden_sizes:
            layers.append(nn.Linear(in_dim, width))
            layers.append(nn.ReLU())
            in_dim = width
        layers.append(nn.Linear(in_dim, 1))
        # Attribute name kept so existing state_dict keys still match.
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x``."""
        return self.linear_relu_stack(x)
    
def train_nn(x_train, y_train, x_valid, y_valid, num_epochs=1000, learning_rate=1e-3, weight_decay=1e-5):
    """Train a NeuralNetwork with full-batch Adam on MSE and print the validation loss.

    Uses the module-level ``device`` for the model and all tensors.

    Args:
        x_train: 2-D training features; ``x_train.shape[1]`` sets the input width.
        y_train: 1-D training targets (a trailing unit dimension is added here).
        x_valid: 2-D validation features.
        y_valid: 1-D validation targets.
        num_epochs: Number of optimisation steps; each step uses the whole training set.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        The trained model, left in eval mode.
    """
    num_features = x_train.shape[1]
    model = NeuralNetwork(num_features=num_features).to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # as_tensor accepts both arrays and tensors, so calling this function with
    # inputs that are already tensors does not trigger torch.tensor()'s
    # copy-construct warning.
    x_train_t = torch.as_tensor(x_train, dtype=torch.float32).to(device)
    y_train_t = torch.as_tensor(y_train, dtype=torch.float32).to(device).unsqueeze(1)
    x_valid_t = torch.as_tensor(x_valid, dtype=torch.float32).to(device)
    y_valid_t = torch.as_tensor(y_valid, dtype=torch.float32).to(device).unsqueeze(1)

    # Nothing inside the loop switches the mode, so setting train mode once is enough.
    model.train()
    for t in tqdm(range(num_epochs)):
        y_pred = model(x_train_t)
        loss = loss_fn(y_pred, y_train_t)
        # Report the training loss on every 100th epoch.
        if t % 100 == 99:
            print(t, loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation is inference only: disable autograd so no graph is built.
    model.eval()
    with torch.no_grad():
        valid_loss = loss_fn(model(x_valid_t), y_valid_t)
    print(f"Validation loss: {valid_loss.item()}")

    return model

In [9]:
# Train for 10,000 epochs; learning rate and weight decay equal train_nn's defaults.
model = train_nn(x_train, y_train, x_valid, y_valid, num_epochs=10000, learning_rate=1e-3, weight_decay=1e-5)

  0%|          | 0/10000 [00:00<?, ?it/s]

99 0.004879059735685587
199 0.0014902094844728708
299 0.000643925741314888
399 0.00031768871122039855
499 0.0001701113214949146
599 0.00010166172432946041
699 6.786381709389389e-05
799 4.8634381528245285e-05
899 3.548289532773197e-05
999 2.593684257590212e-05
1099 1.9022547348868102e-05
1199 1.3981289157527499e-05
1299 1.0194282367592677e-05
1399 7.351657586696092e-06
1499 5.261676960799377e-06
1599 3.7446257010742556e-06
1699 2.6329369120503543e-06
1799 1.8268976873514475e-06
1899 1.2930614730066736e-06
1999 9.57142674451461e-07
2099 7.248416977745364e-07
2199 6.179837441777636e-07
2299 5.68006669254828e-07
2399 5.44290344350884e-07
2499 4.6197712322282314e-07
2599 3.660356355794647e-07
2699 1.833208216339699e-06
2799 5.497514052876795e-07
2899 2.3288937711640756e-07
2999 1.8870603923915041e-07
3099 6.999989636824466e-07
3199 2.239506500245625e-07
3299 3.672779030239326e-06
3399 4.027601221423538e-07
3499 2.678133341760258e-06
3599 1.502757669413768e-07
3699 4.4364520590534084e-07
379

In [10]:
# Get the predictions.
# as_tensor keeps this cell re-runnable: on a second execution x_valid / x_train
# are already tensors, and torch.tensor() would warn about copy-constructing
# from a tensor.
x_valid = torch.as_tensor(x_valid, dtype=torch.float32).to(device)
x_train = torch.as_tensor(x_train, dtype=torch.float32).to(device)

# Inference only: make sure the model is in eval mode and skip autograd, so the
# outputs can be converted to NumPy directly.
model.eval()
with torch.no_grad():
    y_pred = model(x_valid).cpu().numpy().squeeze()
    y_pred_train = model(x_train).cpu().numpy().squeeze()

In [11]:
# Attach date and country to every prediction, separately for the validation
# and the training rows. The date column is sliced by position with .iloc so it
# lines up with the positional split used for the arrays.
dates = X['date']
y_pred_country = pd.DataFrame({
    'date': dates.iloc[number_train:],
    'country': country_valid,
    'y_pred': y_pred,
    'y_true': y_valid,
})
y_pred_train_country = pd.DataFrame({
    'date': dates.iloc[:number_train],
    'country': country_train,
    'y_pred': y_pred_train,
    'y_true': y_train,
})

In [12]:
# Post-process the network predictions with ARIMA.
# The three values below are handed to postprocess_arima as the model order.
p = 3  # AR
d = 1  # I
q = 3  # MA
adjusted_predictions = postprocess_arima(y_pred_train_country, y_pred_country, p, d, q)

# Separate the adjusted rows by the value of their 'set' column.
set_label = adjusted_predictions['set']
valid_adjusted = adjusted_predictions.loc[set_label == 'validation']
train_adjusted = adjusted_predictions.loc[set_label == 'train']

# Score the adjusted predictions on the validation rows.
mse_valid_adjusted = mean_squared_error(valid_adjusted['y_true'], valid_adjusted['y_pred'])
print(f"Adjusted Validation MSE: {mse_valid_adjusted:.4f}")

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'


Adjusted Validation MSE: 0.3753




In [13]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

In [14]:
# Stack the validation and training rows into single frames
# (raw network predictions and ARIMA-adjusted predictions).
predictions = pd.concat([y_pred_country, y_pred_train_country])
predictions_adjusted = pd.concat([train_adjusted, valid_adjusted])

# Reshape to long format: one row per (date, country) and series type, with the
# series name in "Type" and its value in "Value", which suits seaborn's hue/style.
predictions_melted = pd.melt(
    predictions,
    id_vars=["date", "country"],
    value_vars=["y_pred", "y_true"],
    var_name="Type",
    value_name="Value",
)

# Function to plot data for the selected country
def plot_by_country(selected_country):
    """Plot predicted and true values against date for one country.

    Reads the module-level ``predictions_melted``, ``predictions`` and
    ``percent_train``.

    Args:
        selected_country: Value matched against the "country" column.
    """
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    # Marker for where validation begins: the percent_train quantile of all dates.
    cutoff_date = predictions['date'].quantile(percent_train)

    # Explicit figure/axes instead of the implicit pyplot state machine.
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False,
        ax=ax,
    )
    ax.set_title(f"Prediction vs True Values for {selected_country}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Values")
    # percent_train is a fraction, so format it as a percentage: the previous
    # label rendered 0.85 as "0.85%" instead of "85%".
    ax.axvline(x=cutoff_date, color='red', linestyle='--', label=f'Validation Start ({percent_train:.0%})')
    ax.legend(title="Legend")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Create a dropdown widget for selecting the country.
# A separate name is used on purpose: rebinding `countries` here would replace
# the per-row country Series from the preprocessing cell with an array of unique
# names, and re-running the split cell (which reads `countries.values`) would fail.
country_options = predictions["country"].unique()
dropdown = widgets.Dropdown(
    options=country_options,
    value=country_options[0],
    description='Country:'
)

# Link the dropdown to the plot function. plot_by_country already calls
# plt.show(), so no extra call is needed; the trailing semicolon hides the
# function repr that interact() returns.
interact(plot_by_country, selected_country=dropdown);

interactive(children=(Dropdown(description='Country:', options=('Canada', 'Japan', 'Switzerland', 'Korea', 'Un…