In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from utils.load_data import load_data
from utils.preprocessing import preprocess_data
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torchvision import datasets, transforms

%reload_ext autoreload
%autoreload 2

# Locations of the input data, given relative to the notebook's working directory.
TRENDS_FOLDER = 'data/google_trends/'
GDP_FOLDER = 'data/gdp/'
# NOTE(review): presumably the file-name prefix of the trends files — confirm in utils.load_data.
DATA_PREFIX = 'trends_data_by_topic_'

EPS = 1e-15  # forwarded to preprocess_data as `epsilon`
SEED = 42

# Seed the global PyTorch and NumPy generators so that weight initialisation
# (and anything else drawing from them) is repeatable after a kernel restart.
# Some GPU kernels may still be non-deterministic.
torch.manual_seed(SEED)
np.random.seed(SEED)

## Loading and Preprocessing Data

### Loading Google Trends

In [6]:
# Load the raw dataset and hand it to preprocess_data, which returns the
# features X, the target y and the per-row country labels, plus y_mean / y_std
# (presumably the statistics used to standardise y — confirm in utils.preprocessing).
data = load_data()
X, y, countries, y_mean, y_std = preprocess_data(
    data=data,
    epsilon=EPS,
    past_gdp_lag=2,
)
(X.shape, y.shape)

((535, 103), (535,))

In [7]:
# Preview the first rows of the preprocessed feature matrix.
X.head()

Unnamed: 0,date,Expense_average,Research_and_development_average,Capital_expenditure_average,Business_average,Cost_average,Tax_average,Financial_capital_average,Investment_average,Gross_domestic_product_average,...,International_Financial_Reporting_Standards_average,Employment_average,GDP_lag_2,country_Canada,country_Germany,country_Japan,country_Korea,country_Switzerland,country_United Kingdom,country_United States
2354,-1.777718,0.220595,1.816296,-1.74574,1.506821,-1.67433,0.914534,-0.607975,1.958668,0.415364,...,2.256619,-0.347872,-0.577065,-0.412746,2.400284,-0.365,-0.412746,-0.415839,-0.415839,-0.415839
6652,-1.777718,-0.50186,3.412711,1.565297,1.605609,-0.911725,-0.716546,0.076861,1.181976,1.082468,...,-0.624113,1.712403,0.530527,-0.412746,-0.415839,-0.365,-0.412746,-0.415839,-0.415839,2.400284
3651,-1.777718,-0.450256,2.007866,-1.74574,0.749453,0.410125,0.672893,-0.607975,0.470008,-3.364891,...,-1.757516,1.063798,-0.601859,-0.412746,-0.415839,-0.365,2.418269,-0.415839,-0.415839,-0.415839
5898,-1.777718,-0.192237,-2.525952,-1.74574,1.276318,-1.013405,-0.776957,-0.607975,1.893943,-1.289457,...,2.436074,0.949338,1.135318,-0.412746,-0.415839,-0.365,-0.412746,2.400284,-0.415839,-0.415839
6179,-1.777718,-1.379126,2.518719,-1.74574,1.57268,-1.318448,-0.716546,-0.607975,1.635046,0.341241,...,1.548242,1.788709,-0.678939,-0.412746,-0.415839,-0.365,-0.412746,-0.415839,2.400284,-0.415839


In [8]:
# Country label of each row, as returned by preprocess_data alongside X and y.
countries

2354           Germany
6652     United States
3651             Korea
5898       Switzerland
6179    United Kingdom
             ...      
856             Canada
6413    United Kingdom
2588           Germany
6132       Switzerland
6886     United States
Name: country, Length: 535, dtype: object

## Simple Prediction Model

### We start with a simple feed-forward neural network regressor

In [9]:
# Size of the training split. Despite its name, `percent_train` is a fraction
# in [0, 1]; the first `number_train` rows of X are used for training.
percent_train = 0.9
number_train = int(percent_train * len(X))

In [10]:
# Positional train/validation split: rows before `number_train` go to the
# training set, the remaining rows to the validation set.
x_train, x_valid = np.split(X.values, [number_train])
y_train, y_valid = np.split(y.values, [number_train])
country_train, country_valid = np.split(countries.values, [number_train])

# No explicit bias column is appended to the features: the nn.Linear layers
# of the network below are created with their default bias term.

In [11]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple's
# Metal backend (MPS), then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [12]:
class NeuralNetwork(nn.Module):
    """Small fully connected regressor mapping a feature vector to one scalar.

    Args:
        num_features: Size of the last dimension of the input tensor.
        hidden_sizes: Width of each hidden layer, in order. Every hidden layer
            is followed by a ReLU. The default ``(100, 20)`` yields the stack
            Linear(num_features, 100) -> ReLU -> Linear(100, 20) -> ReLU ->
            Linear(20, 1). An empty sequence gives a single linear layer.
    """

    def __init__(self, num_features, hidden_sizes=(100, 20)):
        super().__init__()
        # No nn.Flatten module is registered: forward() feeds the input
        # straight into the stack, so a flatten layer would be dead weight.
        layers = []
        in_features = num_features
        for width in hidden_sizes:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Single output unit: the network predicts one regression target.
        layers.append(nn.Linear(in_features, 1))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x``; the last dimension of the result has size 1."""
        return self.linear_relu_stack(x)
    
def train_nn(x_train, y_train, x_valid, y_valid, num_epochs=1000, learning_rate=1e-3, weight_decay=1e-5, seed=None):
    """Train a ``NeuralNetwork`` with full-batch Adam and report the validation MSE.

    The model and all tensors are placed on the module-level ``device``.

    Args:
        x_train: 2-D array-like of training features; ``x_train.shape[1]`` is
            used as the number of input features.
        y_train: 1-D array-like of training targets (a column axis is added).
        x_valid: 2-D array-like of validation features.
        y_valid: 1-D array-like of validation targets.
        num_epochs: Number of optimisation steps; each step uses the whole
            training set.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        seed: Optional integer passed to ``torch.manual_seed`` before the model
            is created, so that weight initialisation is repeatable. ``None``
            (the default) leaves the global generator untouched.

    Returns:
        The trained model, left in evaluation mode.
    """
    if seed is not None:
        torch.manual_seed(seed)

    num_features = x_train.shape[1]
    model = NeuralNetwork(num_features=num_features).to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # as_tensor accepts NumPy arrays as well as tensors, so calling this
    # function again with already-converted inputs does not trigger the
    # "copy construct from a tensor" warning that torch.tensor emits.
    x_train_t = torch.as_tensor(x_train, dtype=torch.float32, device=device)
    y_train_t = torch.as_tensor(y_train, dtype=torch.float32, device=device).unsqueeze(1)
    x_valid_t = torch.as_tensor(x_valid, dtype=torch.float32, device=device)
    y_valid_t = torch.as_tensor(y_valid, dtype=torch.float32, device=device).unsqueeze(1)

    # The mode never changes inside the loop, so it is set once up front.
    model.train()
    for t in tqdm(range(num_epochs)):
        optimizer.zero_grad()
        y_pred = model(x_train_t)
        loss = loss_fn(y_pred, y_train_t)
        if t % 100 == 99:
            print(t, loss.item())
        loss.backward()
        optimizer.step()

    # Evaluation only: no autograd graph is needed for the validation pass.
    model.eval()
    with torch.no_grad():
        y_pred = model(x_valid_t)
        loss = loss_fn(y_pred, y_valid_t)
    print(f"Validation loss: {loss.item()}")

    return model

In [13]:
# Train for 10,000 full-batch epochs; train_nn prints the training loss every
# 100 epochs and the validation loss once training has finished.
model = train_nn(x_train, y_train, x_valid, y_valid, num_epochs=10000, learning_rate=1e-3, weight_decay=1e-5)

  0%|          | 0/10000 [00:00<?, ?it/s]

99 0.12613436579704285
199 0.00875523965805769
299 0.0009907023049890995
399 0.00017933895287569612
499 9.896090341499075e-05
599 1.3380757081904449e-05
699 0.00019259970576968044
799 1.5471485426132858e-07
899 1.1883513252541889e-05
999 4.8397605922900766e-08
1099 2.2648846353945373e-08
1199 1.8200131535195396e-06
1299 2.2977317826189392e-08
1399 0.00018966911011375487
1499 2.567400869679659e-08
1599 0.0008868044824339449
1699 1.4337642539885564e-07
1799 2.4146082466813823e-08
1899 1.5018341400718782e-05
1999 5.662259283667481e-08
2099 1.2226807939441642e-06
2199 7.478836050722748e-05
2299 2.9869550388639254e-08
2399 8.751412678975612e-06
2499 1.7090936665908885e-08
2599 9.42268979997607e-06
2699 1.4596847286441061e-08
2799 5.262648846837692e-05
2899 1.0406719752609206e-07
2999 8.872009857441299e-06
3099 3.753273247752986e-08
3199 4.1936120396712795e-05
3299 2.1282005491229938e-08
3399 2.0465801497948632e-08
3499 1.2861983123002574e-06
3599 3.455550654507533e-07
3699 2.264532213303027

In [14]:
# Get the predictions
x_valid = torch.tensor(x_valid, dtype=torch.float32).to(device)
x_train = torch.tensor(x_train, dtype=torch.float32).to(device)
y_pred = model(x_valid).clone().detach().cpu().numpy().squeeze()
y_pred_train = model(x_train).clone().detach().cpu().numpy().squeeze()

In [15]:
# Associate the result by country
y_pred_country = pd.DataFrame({'date': X['date'][number_train:], 'country': country_valid, 'y_pred': y_pred, 'y_true': y_valid})
y_pred_train_country = pd.DataFrame({'date': X['date'][:number_train], 'country': country_train, 'y_pred': y_pred_train, 'y_true': y_train})
y_pred_train_country

Unnamed: 0,date,country,y_pred,y_true
2354,-1.777718,Germany,-0.547862,-0.547880
6652,-1.777718,United States,0.526914,0.526867
3651,-1.777718,Korea,-1.262289,-1.262328
5898,-1.777718,Switzerland,1.110933,1.111026
6179,-1.777718,United Kingdom,-0.544307,-0.544331
...,...,...,...,...
832,1.356203,Canada,0.327094,0.327120
3527,1.356203,Japan,-1.024120,-1.024111
3861,1.356203,Korea,0.294635,0.294633
2564,1.356203,Germany,1.111379,1.111416


In [19]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

In [20]:
# Stack the training rows on top of the validation rows
predictions = pd.concat([y_pred_train_country, y_pred_country], axis=0)

# Long format for plotting: one row per (date, country, Type), where Type is
# either "y_pred" or "y_true" and Value holds the corresponding number
predictions_melted = pd.melt(
    predictions,
    id_vars=["date", "country"],
    value_vars=["y_pred", "y_true"],
    var_name="Type",
    value_name="Value",
)

# Function to plot data for the selected country
def plot_by_country(selected_country):
    """Plot predicted and true values over time for one country.

    Reads the module-level ``predictions_melted`` (long-format values),
    ``predictions`` and ``percent_train``. A dashed vertical line is drawn at
    the ``percent_train`` quantile of all dates to mark where the validation
    rows begin.

    Args:
        selected_country: Value of the ``country`` column to plot.
    """
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    cutoff_date = predictions['date'].quantile(percent_train)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False,
        ax=ax,
    )
    ax.set_title(f"Prediction vs True Values for {selected_country}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Values")
    # `percent_train` is a fraction, so format it as a percentage
    # (0.9 -> "90%") instead of printing the raw number followed by "%".
    ax.axvline(x=cutoff_date, color='red', linestyle='--', label=f'Validation Start ({percent_train:.0%})')
    ax.legend(title="Legend")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Create a dropdown widget for selecting the country.
# A dedicated name is used so the `countries` Series returned by
# preprocess_data is not overwritten with a NumPy array: the split cell calls
# `countries.values` and would fail if it were re-run after this cell.
country_options = predictions["country"].unique()
dropdown = widgets.Dropdown(
    options=country_options,
    value=country_options[0],
    description='Country:'
)

# Use the interact function to link the dropdown with the plot function.
# plot_by_country already calls plt.show(); the trailing semicolon only keeps
# the notebook from echoing interact's return value.
interact(plot_by_country, selected_country=dropdown);

interactive(children=(Dropdown(description='Country:', options=('Germany', 'United States', 'Korea', 'Switzerl…