In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm import tqdm
from utils.load_data import load_data
from utils.preprocessing import preprocess_data
from sklearn.model_selection import train_test_split

%reload_ext autoreload
%autoreload 2

TRENDS_FOLDER = 'data/google_trends/'
GDP_FOLDER = 'data/gdp/'
DATA_PREFIX = 'trends_data_by_topic_'

EPS = 1e-15
SEED = 42

## Loading and Preprocessing Data

### Loading Google Trends

In [2]:
data = load_data()
X, y, countries, y_mean, y_std = preprocess_data(data=data, epsilon=EPS)
X.shape, y.shape

((550, 102), (550,))

In [3]:
X.head()

Unnamed: 0,date,Expense_average,Research_and_development_average,Capital_expenditure_average,Business_average,Cost_average,Tax_average,Financial_capital_average,Investment_average,Gross_domestic_product_average,...,Artificial_intelligence_average,International_Financial_Reporting_Standards_average,Employment_average,country_Canada,country_Germany,country_Japan,country_Korea,country_Switzerland,country_United Kingdom,country_United States
616,-1.771126,-1.060998,2.871241,-1.69629,1.582391,-0.732737,-0.290201,-0.604001,-0.078562,0.186376,...,-0.395035,-1.71091,1.689417,2.404079,-0.415203,-0.365755,-0.412193,-0.415203,-0.415203,-0.415203
6646,-1.771126,-0.036149,3.231815,0.589877,1.6539,-0.682008,1.399695,0.082269,1.70326,0.763066,...,-0.453614,-0.598496,1.689417,-0.415203,-0.415203,-0.365755,-0.412193,-0.415203,-0.415203,2.404079
6173,-1.771126,-1.009755,2.75105,-1.69629,1.718909,-1.037115,-0.773029,1.073548,2.594171,2.276877,...,-0.277877,0.745672,2.102636,-0.415203,-0.415203,-0.365755,-0.412193,-0.415203,2.404079,-0.415203
3645,-1.771126,-0.446089,2.210188,-1.69629,0.516251,0.63696,0.313333,-0.604001,0.875986,3.790689,...,-0.336456,-1.71091,1.839678,-0.415203,-0.415203,-0.365755,2.421635,-0.415203,-0.415203,-0.415203
5892,-1.771126,2.782184,-2.41718,-1.69629,1.42637,-1.290762,1.327271,-0.604001,2.530535,-3.273765,...,-0.043561,-1.71091,0.900544,-0.415203,-0.415203,-0.365755,-0.412193,2.404079,-0.415203,-0.415203


In [13]:
countries

616             Canada
6646     United States
6173    United Kingdom
3645             Korea
5892       Switzerland
             ...      
856             Canada
6413    United Kingdom
2588           Germany
6132       Switzerland
6886     United States
Name: country, Length: 550, dtype: object

## Simple Prediction Model

### We start with a simple regression model

In [6]:
X

Unnamed: 0,date,Expense_average,Research_and_development_average,Capital_expenditure_average,Business_average,Cost_average,Tax_average,Financial_capital_average,Investment_average,Gross_domestic_product_average,...,Artificial_intelligence_average,International_Financial_Reporting_Standards_average,Employment_average,country_Canada,country_Germany,country_Japan,country_Korea,country_Switzerland,country_United Kingdom,country_United States
616,-1.771126,-1.060998,2.871241,-1.696290,1.582391,-0.732737,-0.290201,-0.604001,-0.078562,0.186376,...,-0.395035,-1.710910,1.689417,2.404079,-0.415203,-0.365755,-0.412193,-0.415203,-0.415203,-0.415203
6646,-1.771126,-0.036149,3.231815,0.589877,1.653900,-0.682008,1.399695,0.082269,1.703260,0.763066,...,-0.453614,-0.598496,1.689417,-0.415203,-0.415203,-0.365755,-0.412193,-0.415203,-0.415203,2.404079
6173,-1.771126,-1.009755,2.751050,-1.696290,1.718909,-1.037115,-0.773029,1.073548,2.594171,2.276877,...,-0.277877,0.745672,2.102636,-0.415203,-0.415203,-0.365755,-0.412193,-0.415203,2.404079,-0.415203
3645,-1.771126,-0.446089,2.210188,-1.696290,0.516251,0.636960,0.313333,-0.604001,0.875986,3.790689,...,-0.336456,-1.710910,1.839678,-0.415203,-0.415203,-0.365755,2.421635,-0.415203,-0.415203,-0.415203
5892,-1.771126,2.782184,-2.417180,-1.696290,1.426370,-1.290762,1.327271,-0.604001,2.530535,-3.273765,...,-0.043561,-1.710910,0.900544,-0.415203,-0.415203,-0.365755,-0.412193,2.404079,-0.415203,-0.415203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,1.715275,2.116032,-0.361907,1.555148,1.114329,2.006658,3.149945,-0.222740,1.130532,1.700187,...,5.111383,0.421217,0.862979,2.404079,-0.415203,-0.365755,-0.412193,-0.415203,-0.415203,-0.415203
6413,1.715275,-0.138634,1.248657,0.132644,0.776285,1.093526,2.727471,-0.451496,1.575987,3.574430,...,4.759909,-0.274042,1.802113,-0.415203,-0.415203,-0.365755,-0.412193,-0.415203,2.404079,-0.415203
2588,1.715275,0.783730,-0.313831,1.605952,0.256216,2.311035,2.425703,-0.604001,1.639624,0.979325,...,4.818488,-0.459444,-0.865028,-0.415203,2.404079,-0.365755,-0.412193,-0.415203,-0.415203,-0.415203
6132,1.715275,1.808578,-1.395553,1.656756,0.841293,2.361764,3.270651,-0.604001,2.530535,1.988532,...,5.111383,-0.181340,1.464025,-0.415203,-0.415203,-0.365755,-0.412193,2.404079,-0.415203,-0.415203


In [38]:
number_train = int(len(X) * 0.8)

In [39]:
len(X) * 0.85

467.5

In [40]:
number_train

440

In [41]:
x_train = X.values[:number_train, :]
y_train = y.values[:number_train]
x_valid = X.values[number_train:, :]
y_valid = y.values[number_train:]
country_train = countries.values[:number_train]
country_valid = countries.values[number_train:]

# Add bias term
x_train = np.hstack([x_train, np.ones((len(x_train), 1))])
x_valid = np.hstack([x_valid, np.ones((len(x_valid), 1))])

In [42]:
weights = np.linalg.lstsq(x_train, y_train, rcond=1e-15)[0]
y_pred = weights @ x_valid.T
y_pred_train = weights @ x_train.T
np.linalg.norm(y_pred - y_valid) / y_valid.size

0.0615546538918305

In [43]:
# Associate the result by country
y_pred_country = pd.DataFrame({'date': X['date'][number_train:], 'country': country_valid, 'y_pred': y_pred, 'y_true': y_valid})
y_pred_train_country = pd.DataFrame({'date': X['date'][:number_train], 'country': country_train, 'y_pred': y_pred_train, 'y_true': y_train})
y_pred_train_country

Unnamed: 0,date,country,y_pred,y_true
616,-1.771126,Canada,-1.134109,-1.110615
6646,-1.771126,United States,-0.485772,-0.539413
6173,-1.771126,United Kingdom,-1.162421,-1.228650
3645,-1.771126,Korea,-1.808700,-1.837559
5892,-1.771126,Switzerland,-0.651031,-0.636078
...,...,...,...,...
2540,1.017995,Germany,0.900025,0.874226
808,1.017995,Canada,0.324388,0.135444
6084,1.017995,Switzerland,1.849809,1.924083
6365,1.017995,United Kingdom,0.325570,0.247836


In [44]:
import seaborn as sns

In [None]:
# Put together the train and the validation set
predictions = pd.concat([y_pred_train_country, y_pred_country])

# Melting the dataframe for better plotting
predictions_melted = predictions.melt(
    id_vars=["date", "country"], value_vars=["y_pred", "y_true"], 
    var_name="Type", value_name="Value"
)

import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

# Function to plot data for the selected country
def plot_by_country(selected_country):
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    cutoff_date = predictions['date'].quantile(0.85)
    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False
    )
    plt.title(f"Prediction vs True Values for {selected_country}")
    plt.xlabel("Date")
    plt.ylabel("Values")
    plt.axvline(x=cutoff_date, color='red', linestyle='--', label='Validation Start (85%)')
    plt.legend(title="Legend")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Create a dropdown widget for selecting the country
countries = predictions["country"].unique()
dropdown = widgets.Dropdown(
    options=countries,
    value=countries[0],
    description='Country:'
)

# Use the interact function to link the dropdown with the plot function
interact(plot_by_country, selected_country=dropdown)


interactive(children=(Dropdown(description='Country:', options=('Canada', 'United States', 'United Kingdom', '…

<function __main__.plot_by_country(selected_country)>

## Trying Gaussian Process

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the kernel with hyperparameter bounds
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + WhiteKernel()

# Create the GaussianProcessRegressor
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)

# Define parameter grid for cross-validation
param_grid = {
    "kernel__k1__k1__constant_value": [0.1, 1, 10, 100],
    "kernel__k1__k1__constant_value_bounds": [(1e-3, 1e3)],
    "kernel__k1__k2__length_scale": [0.1, 1, 10, 100],
    "kernel__k1__k2__length_scale_bounds": [(1e-3, 1e3)],
    "alpha": [1e-2, 1e-1, 1],
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(gp, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Best parameters
print("Best parameters found:")
print(grid_search.best_params_)

Best parameters found:
{'alpha': 0.01, 'kernel__k1__k1__constant_value': 0.1, 'kernel__k1__k1__constant_value_bounds': (0.001, 1000.0), 'kernel__k1__k2__length_scale': 0.1, 'kernel__k1__k2__length_scale_bounds': (0.001, 1000.0)}




ValueError: too many values to unpack (expected 2)

In [56]:
gp.set_params(**grid_search.best_params_)
gp.fit(x_train, y_train)

# Predict on the validation set
y_pred, sigma = gp.predict(x_valid, return_std=True)
y_pred_train, sigma_train = gp.predict(x_train, return_std=True)



In [63]:
# Calculate Mean Squared Error
mse_train = mean_squared_error(y_train, y_pred_train)
mse_valid = mean_squared_error(y_valid, y_pred)
print(f"Training MSE: {mse_train:.4f}")
print(f"Validation MSE: {mse_valid:.4f}")

Training MSE: 0.0030
Validation MSE: 0.3402


In [70]:
# Associate the result by country
y_pred_country = pd.DataFrame({'date': X['date'][number_train:], 'country': country_valid, 'y_pred': y_pred, 'y_true': y_valid, 'y_std': sigma})
y_pred_train_country = pd.DataFrame({'date': X['date'][:number_train], 'country': country_train, 'y_pred': y_pred_train, 'y_true': y_train, 'y_std': sigma_train})
y_pred_train_country

# Put together the train and the validation set
predictions = pd.concat([y_pred_train_country, y_pred_country])

# Melting the dataframe for better plotting
predictions_melted = predictions.melt(
    id_vars=["date", "country"], value_vars=["y_pred", "y_true"], 
    var_name="Type", value_name="Value"
)


In [74]:
# Function to plot data with confidence intervals for the selected country
def plot_by_country_with_confidence(selected_country):
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    cutoff_date = predictions['date'].quantile(0.85)


    unmelted_data = predictions[(predictions["country"] == selected_country)]
    
    plt.figure(figsize=(12, 6))
    
    # Plot predictions and true values
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False
    )
    
    plt.fill_between(
        unmelted_data["date"],
        unmelted_data["y_pred"] - 1.96 * unmelted_data["y_std"],
        unmelted_data["y_pred"] + 1.96 * unmelted_data["y_std"],
        color="red", alpha=0.2, label="Confidence Interval"
    )
    
    # Add a vertical line to indicate where validation starts
    plt.axvline(x=cutoff_date, color='red', linestyle='--', label='Validation Start (85%)')
    
    # Enhancing the plot
    plt.title(f"Prediction vs True Values with Confidence Intervals for {selected_country}")
    plt.xlabel("Date")
    plt.ylabel("Values")
    plt.legend(title="Legend")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Create a dropdown widget for selecting the country
countries = predictions["country"].unique()
dropdown = widgets.Dropdown(
    options=countries,
    value=countries[0],
    description='Country:'
)

# Use the interact function to link the dropdown with the updated plot function
interact(plot_by_country_with_confidence, selected_country=dropdown)
plt.show()


interactive(children=(Dropdown(description='Country:', options=('Canada', 'United States', 'United Kingdom', '…