In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact
import seaborn as sns
from datetime import datetime

from utils.load_data import load_data, load_gt_data
from utils.preprocessing import Preprocessing
from models.MLP import MLP
from models.LinearModels import OLS, RidgeRegression
from models.KalmanFilterMLP import KalmanFilterMLP
import statsmodels.api as sm

%reload_ext autoreload
%autoreload 2

# Relative locations / filename prefix of the raw input files.
TRENDS_FOLDER = 'data/google_trends/'
GDP_FOLDER = 'data/gdp/'
DATA_PREFIX = 'trends_data_by_topic_'

# NOTE: SEED and EPS are intentionally not assigned here. The configuration
# cell that follows is their single source of truth; this cell used to assign
# different values (EPS = 1e-15, SEED = 42) that were overwritten before any
# code read them, which made the effective settings misleading.

## Loading and Preprocessing Data

In [None]:
# --- Experiment configuration -------------------------------------------------
SEED = 0
EPS = 1e-6  # forwarded to Preprocessing as `epsilon`

# If set to True, the model will be trained on all available data and
# predictions will be made for all available GT data
TEST_ALL_GT_DATA = False

# Train on everything when testing on all GT data; otherwise keep the first
# 75% for training and leave the rest for validation.
TRAIN_PROPORTION = 1 if TEST_ALL_GT_DATA else 0.75

# Forwarded to Preprocessing as `past_GDP_lags`, e.g. range(1, 3) or [1, 2]
PAST_GDPS = None if TEST_ALL_GT_DATA else []

MODE = "pct"  # None | "pct" | "diff"
PERIOD = 4  # Year to year prediction

In [None]:
# load_data() yields the modelling table plus a second object that is handed
# to Preprocessing as `all_GDPs`.
data, all_gdps = load_data()
print(f"Data shape: {data.shape}")

# Configure the preprocessing step from the notebook-level settings.
preprocessor = Preprocessing(
    data=data,
    epsilon=EPS,
    mode=MODE,
    past_GDP_lags=PAST_GDPS,
    diff_period=PERIOD,
    all_GDPs=all_gdps,
)

# Split into training / validation features and targets.
X_train, y_train, X_valid, y_valid = preprocessor.preprocess_data(train_pct=TRAIN_PROPORTION)

# Preview the first training rows belonging to Switzerland.
is_switzerland = preprocessor.country_train == "Switzerland"
X_train[is_switzerland].head()

## Simple Prediction Model

### Using OLS

In [None]:
# Extract the calendar month of every observation so it can be used as a
# categorical (seasonality) feature.
month_train = preprocessor.dates_train.apply(lambda x: x.month)
month_valid = preprocessor.dates_valid.apply(lambda x: x.month)

# One-hot encode the month. `assign` returns a new frame, so the frames
# returned by the preprocessor are not mutated in place.
X_train = pd.get_dummies(X_train.assign(month=month_train), columns=["month"], dtype=float)
X_valid = pd.get_dummies(X_valid.assign(month=month_valid), columns=["month"], dtype=float)

# The two splits are encoded independently, so a month that occurs in only one
# of them would give the design matrices different columns. Force the
# validation frame onto the training layout: missing dummies become 0 and
# dummies unseen during training are dropped.
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0.0)

X_train.head()

In [None]:
# Fit a statsmodels OLS regression on the training features, with a constant
# column added for the intercept, then print the regression report.
design_matrix = sm.add_constant(X_train)
ols_spec = sm.OLS(y_train, design_matrix)
fitted_model = ols_spec.fit()
print(fitted_model.summary())

In [None]:
# List every regressor whose p-value is below the 5% significance level.
significant_mask = fitted_model.pvalues < 0.05
significant_features = fitted_model.pvalues[significant_mask].index
for feature in significant_features:
    print(f"Relevant feature: {feature}")

There is strong multicollinearity between the features, so we cannot draw reliable conclusions from the OLS coefficients or from the statistical significance of individual features.

### We start with a simple regression model

In [None]:
# Append a trailing column of ones to each design matrix so the linear model
# can learn an intercept (bias) term.
x_train = np.column_stack([X_train.to_numpy(), np.ones(len(X_train))])
x_valid = np.column_stack([X_valid.to_numpy(), np.ones(len(X_valid))])

In [None]:
# Solve the least-squares problem x_train @ weights ~= y_train.
weights = np.linalg.lstsq(x_train, y_train, rcond=1e-15)[0]

# In-sample and out-of-sample predictions of the linear model.
y_pred = x_valid @ weights
y_pred_train = x_train @ weights

# Report the mean squared error on both splits. The previous expression,
# ||residual||_2 / n, equals RMSE / sqrt(n): it shrinks with the sample size
# and is not comparable with the MSE reported for the other models.
print(f"Training MSE: {mean_squared_error(y_train, y_pred_train):.4f}")
print(f"Validation MSE: {mean_squared_error(y_valid, y_pred):.4f}")

In [None]:
# Attach date and country labels to the validation predictions ...
y_pred_country = pd.DataFrame(
    {
        'date': X_valid['date'],
        'country': preprocessor.country_valid,
        'y_pred': y_pred,
        'y_true': y_valid,
    }
)

# ... and do the same for the training predictions.
y_pred_train_country = pd.DataFrame(
    {
        'date': X_train['date'],
        'country': preprocessor.country_train,
        'y_pred': y_pred_train,
        'y_true': y_train,
    }
)

y_pred_train_country

In [None]:
import seaborn as sns

In [None]:
# Stack the training rows on top of the validation rows.
predictions = pd.concat([y_pred_train_country, y_pred_country])

# Reshape to long format (one row per date / country / series type), which is
# the layout the plotting code expects.
predictions_melted = pd.melt(
    predictions,
    id_vars=["date", "country"],
    value_vars=["y_pred", "y_true"],
    var_name="Type",
    value_name="Value",
)

import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

def plot_by_country(selected_country):
    """Plot predicted versus true values over time for one country.

    Reads the notebook-level ``predictions`` and ``predictions_melted`` frames
    and marks, with a red dashed vertical line, the ``TRAIN_PROPORTION``
    quantile of all prediction dates (used as the start of validation).

    Parameters
    ----------
    selected_country
        Value matched against the ``country`` column of ``predictions_melted``.
    """
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    cutoff_date = predictions['date'].quantile(TRAIN_PROPORTION)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False,
        ax=ax,
    )
    ax.set_title(f"Prediction vs True Values for {selected_country}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Values")
    # TRAIN_PROPORTION is a fraction (e.g. 0.75); format it as a percentage so
    # the legend reads "75%" rather than "0.75%".
    ax.axvline(
        x=cutoff_date, color='red', linestyle='--',
        label=f'Validation Start ({TRAIN_PROPORTION:.0%})',
    )
    ax.legend(title="Legend")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Country selector: one entry per distinct country, first one pre-selected.
countries = predictions["country"].unique()
dropdown = widgets.Dropdown(description='Country:', options=countries, value=countries[0])

# Redraw the plot every time a different country is picked.
interact(plot_by_country, selected_country=dropdown)


## Trying Gaussian Process

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, WhiteKernel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the kernel with hyperparameter bounds:
# (constant * RBF) models the signal, WhiteKernel models observation noise.
kernel = C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-3, 1e3)) + WhiteKernel()

# Create the GaussianProcessRegressor. The optimizer is restarted from random
# initial points, so seed it to make the fitted model reproducible.
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=SEED)

# Define parameter grid for cross-validation (initial kernel values and the
# `alpha` value added to the kernel diagonal).
param_grid = {
    "kernel__k1__k1__constant_value": [0.1, 1, 10, 100],
    "kernel__k1__k1__constant_value_bounds": [(1e-3, 1e3)],
    "kernel__k1__k2__length_scale": [0.1, 1, 10, 100],
    "kernel__k1__k2__length_scale_bounds": [(1e-3, 1e3)],
    "alpha": [1e-2, 1e-1, 1],
}

# Perform Grid Search with 5-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(gp, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Best parameters
print("Best parameters found:")
print(grid_search.best_params_)

In [None]:
# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so reuse that fitted model instead of paying for a
# second, identical optimisation (which could also converge to a different
# optimum than the one that was selected).
gp = grid_search.best_estimator_

# Predict on the validation set (and on the training set for reference),
# keeping the predictive standard deviation for the confidence bands.
y_pred, sigma = gp.predict(x_valid, return_std=True)
y_pred_train, sigma_train = gp.predict(x_train, return_std=True)

In [None]:
# Mean squared error of the current predictions on each split.
mse_train, mse_valid = (
    mean_squared_error(truth, estimate)
    for truth, estimate in ((y_train, y_pred_train), (y_valid, y_pred))
)
print(f"Training MSE: {mse_train:.4f}")
print(f"Validation MSE: {mse_valid:.4f}")

In [None]:
# Associate the result by country, keeping the predictive standard deviation
# so that confidence bands can be drawn later.
y_pred_country = pd.DataFrame({
    'date': X_valid['date'],
    'country': preprocessor.country_valid,
    'y_pred': y_pred,
    'y_true': y_valid,
    'y_std': sigma,
})
y_pred_train_country = pd.DataFrame({
    'date': X_train['date'],
    'country': preprocessor.country_train,
    'y_pred': y_pred_train,
    'y_true': y_train,
    'y_std': sigma_train,
})

# Put together the train and the validation set
predictions = pd.concat([y_pred_train_country, y_pred_country])

# Melting the dataframe for better plotting
predictions_melted = predictions.melt(
    id_vars=["date", "country"], value_vars=["y_pred", "y_true"],
    var_name="Type", value_name="Value"
)


In [None]:
def plot_by_country_with_confidence(selected_country):
    """Plot predicted versus true values with an uncertainty band for one country.

    Reads the notebook-level ``predictions`` (which must contain a ``y_std``
    column) and ``predictions_melted`` frames. The shaded band spans
    ``y_pred +/- 1.96 * y_std`` and a red dashed vertical line marks the
    ``TRAIN_PROPORTION`` quantile of all prediction dates.

    Parameters
    ----------
    selected_country
        Value matched against the ``country`` column of the prediction frames.
    """
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    cutoff_date = predictions['date'].quantile(TRAIN_PROPORTION)

    # fill_between joins the points in row order, so sort by date to avoid a
    # self-intersecting band if the rows are not already chronological.
    unmelted_data = predictions[predictions["country"] == selected_country].sort_values("date")

    fig, ax = plt.subplots(figsize=(12, 6))

    # Plot predictions and true values
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False,
        ax=ax,
    )

    # Shade the interval y_pred +/- 1.96 standard deviations
    ax.fill_between(
        unmelted_data["date"],
        unmelted_data["y_pred"] - 1.96 * unmelted_data["y_std"],
        unmelted_data["y_pred"] + 1.96 * unmelted_data["y_std"],
        color="red", alpha=0.2, label="Confidence Interval"
    )

    # Add a vertical line to indicate where validation starts.
    # TRAIN_PROPORTION is a fraction (e.g. 0.75); format it as a percentage so
    # the legend reads "75%" rather than "0.75%".
    ax.axvline(
        x=cutoff_date, color='red', linestyle='--',
        label=f'Validation Start ({TRAIN_PROPORTION:.0%})',
    )

    # Enhancing the plot
    ax.set_title(f"Prediction vs True Values with Confidence Intervals for {selected_country}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Values")
    ax.legend(title="Legend")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Country selector: one entry per distinct country, first one pre-selected.
countries = predictions["country"].unique()
dropdown = widgets.Dropdown(description='Country:', options=countries, value=countries[0])

# Redraw the confidence-interval plot every time a different country is picked.
interact(plot_by_country_with_confidence, selected_country=dropdown)
plt.show()
