In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact
import seaborn as sns
from datetime import datetime

from utils.load_data import load_data, load_gt_data
from utils.preprocessing import Preprocessing
from models.MLP import MLP
from models.LinearModels import OLS, RidgeRegression
from models.KalmanFilterMLP import KalmanFilterMLP
import statsmodels.api as sm
import torch
import torch.nn as nn

from tqdm.notebook import tqdm

%reload_ext autoreload
%autoreload 2

# Data locations and file-name prefix for the Google Trends / GDP inputs.
# NOTE(review): these three names are not referenced in the cells visible
# here (load_data() is called without arguments) — confirm they are still needed.
TRENDS_FOLDER = 'data/google_trends/'
GDP_FOLDER = 'data/gdp/'
DATA_PREFIX = 'trends_data_by_topic_'

# Small constant: passed to Preprocessing as `epsilon` and added to the
# denominator of the MAPE metric.
EPS = 1e-15
# Seed value, presumably meant for the random number generators — confirm where it is applied.
SEED = 42

## Loading and Preprocessing Data

### Loading Google Trends

In [None]:
# When True, the model is trained on all available data and predictions are
# made for all available GT data (no hold-out split).
TEST_ALL_GT_DATA = False

if TEST_ALL_GT_DATA:
    TRAIN_PROPORTION = 1
    PAST_GDPS = None
else:
    TRAIN_PROPORTION = 0.94
    PAST_GDPS = []  # e.g. range(1, 3) or [1, 2]

MODE = "pct"  # None | "pct" | "diff"
PERIOD = 4  # Year to year prediction

In [None]:
# load_data() (from utils.load_data) returns three objects. Judging only by how
# they are used below: `data` is the table that gets merged on
# ("country", "date"), `all_gdps` is handed to Preprocessing as `all_GDPs`, and
# `all_gts` holds the Google Trends series (columns ending in '_average').
data, all_gdps, all_gts = load_data()

In [None]:
def get_gt_diff_logs(all_gts, max_quarters=8, rows_per_quarter=3):
    """Turn raw Google Trends series into per-country lagged log features.

    Every column whose name ends in ``_average`` is treated as a search term.
    Each one is transformed with ``log(x + 1)``; then, for every horizon
    ``k = 1..max_quarters``, a column ``q{k}-<term>`` is added that holds
    ``x - groupby("country").diff(n)`` with ``n = rows_per_quarter * k``.
    The un-lagged log columns are dropped from the result.

    NOTE(review): ``x - diff(n)`` is algebraically the value ``n`` rows earlier
    (a lag), not a difference. If a log-difference was intended, ``diff(n)``
    should be used directly. The expression is left as-is so the features fed
    to the model do not change.

    Args:
        all_gts: DataFrame with ``country`` and ``date`` columns plus the
            ``*_average`` series. It is copied, not modified.
        max_quarters: number of horizons to generate. The default of 8 matches
            the original ``range(1, 4 * 2 + 1)``.
        rows_per_quarter: row offset per horizon step. The default of 3
            presumably reflects monthly rows — TODO confirm. ``diff`` works on
            row order, so rows are assumed sorted by date within each country.

    Returns:
        A new DataFrame with the non-search-term columns of ``all_gts``
        (``date`` parsed to datetime) followed by the ``q{k}-`` columns. Rows
        without enough history within their country contain NaN.
    """
    processed_gts = all_gts.copy()
    search_terms = [col for col in all_gts.columns if col.endswith('_average')]

    processed_gts['date'] = pd.to_datetime(processed_gts['date'])
    processed_gts[search_terms] = np.log(processed_gts[search_terms] + 1)

    logs = processed_gts[search_terms]
    logs_by_country = processed_gts.groupby("country")[search_terms]

    # Build every horizon first and concatenate once instead of growing the
    # frame inside the loop.
    lagged_frames = [
        (logs - logs_by_country.diff(rows_per_quarter * nb_quarters)).add_prefix(f'q{nb_quarters}-')
        for nb_quarters in range(1, max_quarters + 1)
    ]
    processed_gts = pd.concat([processed_gts, *lagged_frames], axis=1)

    return processed_gts.drop(columns=search_terms)


# Kept at module level in case later cells rely on the list of raw series names.
search_terms = [col for col in all_gts.columns if col.endswith('_average')]

# Single source of truth: the features are produced by the function above
# rather than by a second, hand-copied version of the same pipeline.
processed_gts = get_gt_diff_logs(all_gts)



In [None]:
# Size of the modelling table before the Google Trends features are attached
print(f"Data shape: {data.shape}")

# The join keys must be datetimes on both sides
data['date'] = pd.to_datetime(data['date'])

# Drop Trends rows containing any missing value (e.g. early rows for which the
# lagged columns cannot be computed yet)
processed_gts = processed_gts.dropna()

# Left join: every row of `data` is kept, Trends columns are NaN where no match exists
data_merged = data.merge(processed_gts, on=["country", "date"], how="left")

# Dates are stored as 'YYYY-MM-DD' strings in the merged table
data_merged['date'] = pd.to_datetime(data_merged['date']).dt.strftime('%Y-%m-%d')

print(f"Data merged shape: {data_merged.shape}")

In [None]:
# Date at the TRAIN_PROPORTION quantile of data['date']; a later plotting cell
# draws it as the train/validation boundary.
splitting_date = data['date'].quantile(TRAIN_PROPORTION)
splitting_date

In [None]:
# Configure the project preprocessor with the merged table and the run settings
# defined above (EPS, MODE, PAST_GDPS, PERIOD) plus the raw GDP / Trends frames.
preprocessor = Preprocessing(data=data_merged, epsilon=EPS, mode=MODE, past_GDP_lags=PAST_GDPS, diff_period=PERIOD, all_GDPs=all_gdps, all_GTs=all_gts)
# Split into training and validation features/targets; the split size is driven by TRAIN_PROPORTION.
X_train, y_train, X_valid, y_valid = preprocessor.preprocess_data(train_pct=TRAIN_PROPORTION)

# Sanity check: first training rows for Switzerland
# (preprocessor.country_train is used as a row-aligned mask over X_train).
X_train[preprocessor.country_train == "Switzerland"].head()

## Simple Prediction Model

### We start with a simple neural-network regression model

In [None]:
# Convert the pandas features/targets to NumPy arrays for the PyTorch model.
# np.asarray (rather than `.values`) keeps this cell idempotent: `y_train` and
# `y_valid` are rebound to arrays here, so a second run with `.values` would
# raise AttributeError, whereas np.asarray on an array is a no-op.
x_train = np.asarray(X_train)
x_valid = np.asarray(X_valid)
y_train = np.asarray(y_train)
y_valid = np.asarray(y_valid)

# Country label of each training / validation row, as exposed by the preprocessor.
country_train = preprocessor.country_train
country_valid = preprocessor.country_valid

In [None]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device} device")

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected regressor: num_features -> 300 -> 20 -> 1 with ReLU in between."""

    def __init__(self, num_features):
        """Create the layers.

        Args:
            num_features: size of the last dimension of the input tensor.
        """
        super().__init__()
        # Registered but not applied in forward(); kept so the module's attributes stay the same.
        self.flatten = nn.Flatten()

        widths = [num_features, 300, 20]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Single linear output unit, no activation (regression target).
        layers.append(nn.Linear(widths[-1], 1))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the layer stack and return the result (last dimension is 1)."""
        stack = self.linear_relu_stack
        return stack(x)
    
def train_nn(x_train, y_train, x_valid, y_valid, num_epochs=1000, learning_rate=1e-3, weight_decay=1e-3):
    """Train a NeuralNetwork with full-batch Adam on a mean-squared-error loss.

    The model is created on the module-level `device`. Each epoch is one
    gradient step on the whole training set; both losses are recorded *before*
    that epoch's optimizer step.

    Args:
        x_train: 2-D array-like of training features, shape (n_train, n_features).
        y_train: training targets, one value per row; accepted as shape
            (n_train,) or (n_train, 1).
        x_valid: validation features with the same number of columns as `x_train`.
        y_valid: validation targets, same conventions as `y_train`.
        num_epochs: number of full-batch gradient steps.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay (L2 penalty).

    Returns:
        Tuple `(model, training_loss, validation_loss)`: the trained model in
        eval mode and two lists of per-epoch loss floats.
    """
    num_features = x_train.shape[1]
    model = NeuralNetwork(num_features=num_features).to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    x_train_t = torch.tensor(x_train, dtype=torch.float32).to(device)
    x_valid_t = torch.tensor(x_valid, dtype=torch.float32).to(device)
    # reshape(-1, 1) yields an (n, 1) column for both (n,) and (n, 1) inputs.
    # unsqueeze(1) would turn an (n, 1) target into (n, 1, 1), which MSELoss
    # silently broadcasts against the (n, 1) predictions.
    y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device).reshape(-1, 1)
    y_valid_t = torch.tensor(y_valid, dtype=torch.float32).to(device).reshape(-1, 1)

    training_loss = []
    validation_loss = []

    for _ in tqdm(range(num_epochs)):
        model.train()
        loss_train = loss_fn(model(x_train_t), y_train_t)

        # Validation is monitoring only: no autograd graph is needed.
        model.eval()
        with torch.no_grad():
            loss_valid = loss_fn(model(x_valid_t), y_valid_t)
        model.train()

        training_loss.append(loss_train.item())
        validation_loss.append(loss_valid.item())

        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()

    # Final validation loss after the last update.
    model.eval()
    with torch.no_grad():
        loss = loss_fn(model(x_valid_t), y_valid_t)
    print(f"Validation loss: {loss.item()}")

    return model, training_loss, validation_loss

In [None]:
# Seed PyTorch before the model is built so the weight initialisation — and
# therefore the reported losses — are the same on every Restart & Run All.
torch.manual_seed(SEED)
model, training_loss, validation_loss = train_nn(x_train, y_train, x_valid, y_valid, num_epochs=25, learning_rate=1e-3, weight_decay=1e-4)

In [None]:
from sklearn.metrics import r2_score

def mape_score(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (not multiplied by 100).

    The module-level EPS is added to `y_true` in the denominator.
    """
    relative_error = (y_true - y_pred) / (y_true + EPS)
    return np.mean(np.abs(relative_error))

# Validation predictions as a flat NumPy array; inference needs no autograd graph.
with torch.no_grad():
    preds_valid = model(torch.tensor(x_valid, dtype=torch.float32).to(device)).detach().cpu().numpy().squeeze()

# Expanding-window metrics over the first i validation rows. The upper bound is
# n + 1 so that the last window covers the whole validation set (range(2, n)
# stopped one row short and never scored the full set).
window_sizes = list(range(2, y_valid.shape[0] + 1))
scores = []
mape_scores = []
for i in window_sizes:
    scores.append(r2_score(y_valid[:i].squeeze(), preds_valid[:i]))
    mape_scores.append(mape_score(y_valid[:i].squeeze(), preds_valid[:i]))

# Two stacked panels sharing the x axis, each labelled so the figure stands alone.
fig, (ax_r2, ax_mape) = plt.subplots(2, 1, sharex=True)
ax_r2.plot(window_sizes, scores)
ax_r2.set_title("Expanding-window validation metrics")
ax_r2.set_ylabel(r"$R^2$")
ax_r2.grid()

ax_mape.plot(window_sizes, mape_scores)
ax_mape.set_ylabel("MAPE")
ax_mape.set_xlabel("Number of validation rows included")
ax_mape.grid()
ax_mape.set_ylim(0, 4)
plt.show()

In [None]:
# Overlay validation predictions and ground truth, in row order
for _series, _name in ((preds_valid, "Predictions"), (y_valid, "True values")):
    plt.plot(_series, label=_name)
plt.legend()

In [None]:
import scipy.stats

# Least-squares line of the validation predictions (y) against the true values (x)
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
    x=y_valid.squeeze(),
    y=preds_valid.squeeze(),
)

In [None]:
# Show the fit; r_value is squared so the third entry is the R² of the linear fit.
slope, intercept, r_value**2, p_value, std_err

In [None]:
# Training-set predictions; inference needs no autograd graph.
with torch.no_grad():
    preds = model(torch.tensor(x_train, dtype=torch.float32).to(device)).detach().cpu().numpy()

# Expanding-window R² over the first i training rows, starting at 8 rows.
# The upper bound is n + 1 so the last window covers the whole training set
# (range(8, n) stopped one row short).
scores = []
for i in range(8, y_train.shape[0] + 1):
    scores.append(r2_score(y_train[:i], preds[:i]))

plt.plot(range(8, 8 + len(scores)), scores)
plt.title("Expanding-window training R²")
plt.xlabel("Number of training rows included")
plt.ylabel(r"$R^2$")
plt.show()

In [None]:
# Learning curves: per-epoch loss on the training and validation sets
plt.figure(figsize=(10, 3))
for _history, _name in ((training_loss, "Training loss"), (validation_loss, "Validation loss")):
    plt.plot(_history, label=_name)
plt.gca().set(xlabel="Epoch", ylabel="Loss")
plt.legend()
plt.show()

In [None]:
# Get the predictions.
# torch.as_tensor keeps this cell re-runnable: `x_valid` / `x_train` are rebound
# to tensors here, and on a second run as_tensor reuses the existing tensor
# instead of copy-constructing from it (which torch.tensor does, with a warning).
x_valid = torch.as_tensor(x_valid, dtype=torch.float32).to(device)
x_train = torch.as_tensor(x_train, dtype=torch.float32).to(device)

# Pure inference: skip building the autograd graph.
with torch.no_grad():
    y_pred = model(x_valid).detach().cpu().numpy().squeeze()
    y_pred_train = model(x_train).detach().cpu().numpy().squeeze()

In [None]:
# One row per observation: date, country, model prediction and ground truth
y_pred_country = pd.DataFrame(dict(
    date=preprocessor.dates_valid,
    country=country_valid,
    y_pred=y_pred,
    y_true=y_valid,
))
y_pred_train_country = pd.DataFrame(dict(
    date=preprocessor.dates_train,
    country=country_train,
    y_pred=y_pred_train,
    y_true=y_train,
))
y_pred_train_country

In [None]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

In [None]:
# Stack the training and validation predictions into a single frame
predictions = pd.concat([y_pred_train_country, y_pred_country])

# Long format (one row per date / country / series type) for seaborn's hue and style
predictions_melted = pd.melt(
    predictions,
    id_vars=["date", "country"],
    value_vars=["y_pred", "y_true"],
    var_name="Type",
    value_name="Value",
)

# Function to plot data for the selected country
def plot_by_country(selected_country):
    """Plot predicted vs. true values over time for one country (seaborn version).

    Reads the module-level `predictions_melted`, `predictions` and
    `TRAIN_PROPORTION`. A red dashed line marks the TRAIN_PROPORTION quantile of
    the prediction dates. A later cell defines another function with this same
    name, which replaces this one once that cell has run.

    Args:
        selected_country: value matched against the "country" column.
    """
    country_rows = predictions_melted[predictions_melted["country"] == selected_country]
    # NOTE(review): the dates may arrive as 'YYYY-MM-DD' strings (the merged
    # table formats them with strftime). Coercing to datetime is a no-op for
    # datetimes and makes both the quantile and the time axis well defined.
    filtered_data = country_rows.assign(date=pd.to_datetime(country_rows["date"]))
    cutoff_date = pd.to_datetime(predictions['date']).quantile(TRAIN_PROPORTION)

    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False, errorbar=None
    )
    plt.title(f"Prediction vs True Values for {selected_country}")
    plt.xlabel("Date")
    plt.ylabel("Values")
    # TRAIN_PROPORTION is a fraction (e.g. 0.94): format it as a percentage
    # instead of printing "0.94%".
    plt.axvline(x=cutoff_date, color='red', linestyle='--', label=f'Validation Start ({TRAIN_PROPORTION:.0%})')
    plt.legend(title="Legend")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Country selector, pre-set to the first country found in the predictions
countries = predictions["country"].unique()
dropdown = widgets.Dropdown(
    description='Country:',
    options=countries,
    value=countries[0],
)

# Redraw the plot whenever the selection changes
interact(plot_by_country, selected_country=dropdown)

plt.show()

In [None]:
# Training rows first, then validation rows, in one frame
predictions = pd.concat((y_pred_train_country, y_pred_country))

def plot_by_country(selected_country):
    """Plot predicted vs. true values over time for one country (matplotlib version).

    Reads the module-level `predictions`, `splitting_date` and
    `TRAIN_PROPORTION`. A red dashed line is drawn at `splitting_date`. This
    definition replaces the seaborn-based function of the same name defined in
    an earlier cell.

    Args:
        selected_country: value matched against the "country" column.
    """
    country_rows = predictions[predictions["country"] == selected_country]
    # NOTE(review): the dates may arrive as 'YYYY-MM-DD' strings (the merged
    # table formats them with strftime). Coercing to datetime is a no-op for
    # datetimes and puts the series and the cutoff line on the same time axis.
    filtered_data = country_rows.assign(date=pd.to_datetime(country_rows["date"]))
    filtered_data = filtered_data.sort_values(by='date')
    cutoff_date = pd.to_datetime(splitting_date)
    plt.figure(figsize=(12, 6))

    # Plot the true values
    plt.plot(filtered_data['date'], filtered_data['y_true'], label='True Values', color='blue')

    # Plot the predicted mean
    plt.plot(filtered_data['date'], filtered_data['y_pred'], label='Predicted Mean', color='orange')

    # Add the prediction interval
    # TODO: disabled — `predictions` has no 'y_pred_lower' / 'y_pred_upper' columns yet.
    # plt.fill_between(
    #     filtered_data['date'],
    #     filtered_data['y_pred_lower'],
    #     filtered_data['y_pred_upper'],
    #     color='orange', alpha=0.2, label='95% Prediction Interval'
    # )

    plt.title(f"Prediction vs True Values for {selected_country}")
    plt.xlabel("Date")
    plt.ylabel("Values")
    # TRAIN_PROPORTION is a fraction (e.g. 0.94): format it as a percentage
    # instead of printing "0.94%".
    plt.axvline(x=cutoff_date, color='red', linestyle='--', label=f'Validation Start ({TRAIN_PROPORTION:.0%})')
    plt.legend(title="Legend")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# Country selector, pre-set to the first country found in the predictions
countries = predictions["country"].unique()
dropdown = widgets.Dropdown(
    description='Country:',
    options=countries,
    value=countries[0],
)

# Redraw the plot whenever the selection changes
interact(plot_by_country, selected_country=dropdown)

plt.show()
