In [9]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact
import seaborn as sns
from datetime import datetime

from utils.load_data import load_data, load_gt_data
from utils.preprocessing import Preprocessing
from models.MLP import MLP
from models.LinearModels import OLS, RidgeRegression
from models.KalmanFilterMLP import KalmanFilterMLP
import statsmodels.api as sm
import torch
import torch.nn as nn

from tqdm.notebook import tqdm

%reload_ext autoreload
%autoreload 2

TRENDS_FOLDER = 'data/google_trends/'
GDP_FOLDER = 'data/gdp/'
DATA_PREFIX = 'trends_data_by_topic_'

EPS = 1e-15
SEED = 42

## Loading and Preprocessing Data

### Loading Google Trends

In [10]:
TEST_ALL_GT_DATA = False # If set to True, the model will be trained on all available data and predictions will be made for all available GT data

TRAIN_PROPORTION = 0.90 if not TEST_ALL_GT_DATA else 1
PAST_GDPS = [] if not TEST_ALL_GT_DATA else None # e.g. range(1, 3) or [1, 2]
NB_PAST_GT = 1 if not TEST_ALL_GT_DATA else None # The number of past year-to-year Google Trends data to consider
MODE = "pct" # None | "pct" | "diff"
PERIOD = 4  # Year to year prediction

In [11]:
data, all_gdps, all_gts = load_data()

print(f"Data shape: {data.shape}")

preprocessor = Preprocessing(data=data, epsilon=EPS, mode=MODE, past_GDP_lags=PAST_GDPS, diff_period=PERIOD, all_GDPs=all_gdps, all_GTs=all_gts, nb_past_GT=NB_PAST_GT)
X_train, y_train, X_valid, y_valid = preprocessor.preprocess_data(train_pct=TRAIN_PROPORTION)

X_train[preprocessor.country_train == "Switzerland"].head()

Data shape: (550, 97)
X_train shape : (469, 101)
X_valid shape : (53, 101)
y_train shape : (469,)
y_valid shape : (53,)


Unnamed: 0,date,Expense_average,Research_and_development_average,Capital_expenditure_average,Business_average,Cost_average,Tax_average,Financial_capital_average,Investment_average,Gross_domestic_product_average,...,Semiconductor_average,Artificial_intelligence_average,International_Financial_Reporting_Standards_average,Employment_average,country_Germany,country_Japan,country_Korea,country_Switzerland,country_United Kingdom,country_United States
0,0,53.0,32.0,0.0,83.0,29.0,64.0,0.0,86.0,32.0,...,51.0,8.0,48.2,54.0,False,False,False,True,False,False
11,92,54.0,0.0,0.0,73.0,30.0,33.0,0.0,82.0,31.0,...,74.0,7.0,11.0,63.0,False,False,False,True,False,False
14,184,43.0,0.0,0.0,76.0,28.0,29.0,0.0,88.0,31.0,...,58.0,8.0,57.8,63.0,False,False,False,True,False,False
22,275,43.0,26.0,0.0,68.0,24.0,30.8,0.0,66.0,30.0,...,45.0,8.0,66.0,40.0,False,False,False,True,False,False
28,365,45.0,16.0,0.0,77.0,29.0,54.0,0.0,75.0,31.0,...,40.0,9.0,60.8,55.0,False,False,False,True,False,False


In [None]:
# Goal: Add the Year-over-Year Log-Difference of Search Volume Indices to the features

# 1. Get the Year-over-Year Log-Difference of Search Volume Indices
all_gts
search_terms = [col for col in all_gts.columns if col.endswith('_average')]
# Apply log
all_gts.groupby("country").apply()= np.log(all_gts.loc[all_gts["country"] == country, search_terms] + 1)

# 2. Add the previous year's difference to the features
for term in search_terms:
    all_gts.loc[all_gts["country"] == country, f"{term}_diff"] = all_gts.loc[all_gts["country"] == country, term].diff(3)

all_gts


Unnamed: 0,date,Expense_average,Research_and_development_average,Capital_expenditure_average,Business_average,Cost_average,Tax_average,Financial_capital_average,Investment_average,Gross_domestic_product_average,...,Agile_software_development_average,Subsidy_average,Sustainability_average,Open_innovation_average,Industrial_park_average,Semiconductor_average,Artificial_intelligence_average,International_Financial_Reporting_Standards_average,Employment_average,country
0,2004-01-01,0.000000,0.000000,0.000000,0.008250,0.008242,0.008245,0.000000,0.008249,0.008244,...,0.000000,0.008226,0.008246,0.000000,0.000000,0.008248,0.008233,0.008250,0.008248,Switzerland
1,2004-02-01,0.008248,0.000000,0.000000,0.008250,0.008243,0.008248,0.000000,0.008249,0.008244,...,0.000000,0.008227,0.008245,0.000000,0.000000,0.008248,0.008229,0.008249,0.008248,Switzerland
2,2004-03-01,0.008250,0.000000,0.000000,0.008250,0.008242,0.008248,0.000000,0.008250,0.000000,...,0.000000,0.000000,0.008245,0.000000,0.000000,0.008250,0.008233,0.000000,0.008248,Switzerland
3,2004-04-01,0.000000,0.000000,0.000000,0.008250,0.008242,0.008245,0.000000,0.008249,0.008246,...,0.000000,0.000000,0.008246,0.000000,0.000000,0.008249,0.008230,0.008250,0.008248,Switzerland
4,2004-05-01,0.008245,0.000000,0.000000,0.008249,0.008242,0.008244,0.000000,0.008249,0.008244,...,0.000000,0.000000,0.008245,0.000000,0.000000,0.008248,0.008229,0.000000,0.008247,Switzerland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1710,2024-01-01,0.008249,0.008246,0.008248,0.008249,0.008250,0.008249,0.008205,0.008249,0.008248,...,0.008249,0.008250,0.008249,0.008240,0.008247,0.008240,0.008249,0.008241,0.008248,United States
1711,2024-02-01,0.008250,0.008247,0.008250,0.008249,0.008249,0.008250,0.008213,0.008249,0.008250,...,0.008249,0.008250,0.008250,0.008242,0.008247,0.008242,0.008250,0.008242,0.008245,United States
1712,2024-03-01,0.008250,0.008246,0.008249,0.008249,0.008250,0.008249,0.008213,0.008249,0.008249,...,0.008249,0.008249,0.008249,0.008245,0.008247,0.008242,0.008250,0.008241,0.008248,United States
1713,2024-04-01,0.008250,0.008247,0.008250,0.008249,0.008250,0.008249,0.008225,0.008249,0.008250,...,0.008249,0.008249,0.008250,0.008244,0.008248,0.008242,0.008250,0.008241,0.008249,United States


## Simple Prediction Model

### We start with a simple regression model

In [13]:
x_train = X_train.values
x_valid = X_valid.values
y_train = y_train.values
y_valid = y_valid.values
country_train = preprocessor.country_train
country_valid = preprocessor.country_valid

In [14]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using mps device


In [15]:
class NeuralNetwork(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(num_features, 100),
            nn.LayerNorm(100),
            nn.ReLU(),
            nn.Linear(100, 20),
            nn.LayerNorm(20),
            nn.ReLU(),
            nn.Linear(20, 1)
        )

    def forward(self, x):
        return self.linear_relu_stack(x)
    
def train_nn(x_train, y_train, x_valid, y_valid, num_epochs=1000, learning_rate=1e-3, weight_decay=1e-4):
    num_features = x_train.shape[1]
    model = NeuralNetwork(num_features=num_features).to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    model.train()
    
    x_train_t = torch.tensor(x_train, dtype=torch.float32).to(device)
    y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device).unsqueeze(1)
    x_valid_t = torch.tensor(x_valid, dtype=torch.float32).to(device)
    y_valid_t = torch.tensor(y_valid, dtype=torch.float32).to(device).unsqueeze(1)

    training_loss = []
    validation_loss = []
    
    for t in tqdm(range(num_epochs)):
        model.train()
        y_pred = model(x_train_t)
        loss_train = loss_fn(y_pred, y_train_t)

        model.eval()
        loss_valid = loss_fn(model(x_valid_t), y_valid_t)
        model.train()

        training_loss.append(loss_train.item())
        validation_loss.append(loss_valid.item())

        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()
        
    model.eval()
    y_pred = model(x_valid_t)
    loss = loss_fn(y_pred, y_valid_t)
    print(f"Validation loss: {loss.item()}")
    
    return model, training_loss, validation_loss

In [16]:
model, training_loss, validation_loss = train_nn(x_train, y_train, x_valid, y_valid, num_epochs=100, learning_rate=1e-3, weight_decay=1e-2)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
plt.figure(figsize=(10, 3))
plt.plot(training_loss, label="Training loss")
plt.plot(validation_loss, label="Validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# Get the predictions
x_valid = torch.tensor(x_valid, dtype=torch.float32).to(device)
x_train = torch.tensor(x_train, dtype=torch.float32).to(device)
y_pred = model(x_valid).clone().detach().cpu().numpy().squeeze()
y_pred_train = model(x_train).clone().detach().cpu().numpy().squeeze()

In [None]:
# Associate the result by country
y_pred_country = pd.DataFrame({'date': X_valid['date'], 'country': country_valid, 'y_pred': y_pred, 'y_true': y_valid})
y_pred_train_country = pd.DataFrame({'date': X_train['date'], 'country': country_train, 'y_pred': y_pred_train, 'y_true': y_train})
y_pred_train_country

In [None]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

In [None]:
# Put together the train and the validation set
predictions = pd.concat([y_pred_train_country, y_pred_country])

# Melting the dataframe for better plotting
predictions_melted = predictions.melt(
    id_vars=["date", "country"], value_vars=["y_pred", "y_true"], 
    var_name="Type", value_name="Value"
)

# Function to plot data for the selected country
def plot_by_country(selected_country):
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    cutoff_date = predictions['date'].quantile(TRAIN_PROPORTION)
    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False
    )
    plt.title(f"Prediction vs True Values for {selected_country}")
    plt.xlabel("Date")
    plt.ylabel("Values")
    plt.axvline(x=cutoff_date, color='red', linestyle='--', label=f'Validation Start ({TRAIN_PROPORTION}%)')
    plt.legend(title="Legend")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Create a dropdown widget for selecting the country
countries = predictions["country"].unique()
dropdown = widgets.Dropdown(
    options=countries,
    value=countries[0],
    description='Country:'
)

# Use the interact function to link the dropdown with the plot function
interact(plot_by_country, selected_country=dropdown)

plt.show()