In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from utils.load_data import load_data
from utils.preprocessing import preprocess_data
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torchvision import datasets, transforms

%reload_ext autoreload
%autoreload 2

# Locations / naming scheme of the raw inputs. They are not referenced in the
# visible cells; presumably consumed by the loaders in `utils` -- TODO confirm.
TRENDS_FOLDER = 'data/google_trends/'
GDP_FOLDER = 'data/gdp/'
DATA_PREFIX = 'trends_data_by_topic_'

# Small tolerance: passed to preprocess_data as `epsilon` and used by
# spectral_norm as the threshold below which a weight matrix is not rescaled.
EPS = 1e-15

# Seed every RNG the notebook relies on so that weight initialisation (and
# therefore every reported loss) is reproducible under Restart & Run All.
SEED = 42
torch.manual_seed(SEED);
np.random.seed(SEED)

## Loading and Preprocessing Data

### Loading Google Trends

In [None]:
# Load the raw data through the project helper (utils.load_data).
data = load_data()
# preprocess_data returns the feature table X, the target y, the per-row
# country labels and two statistics of y (presumably the mean/std used to
# normalise the target -- TODO confirm against utils.preprocessing).
X, y, countries, y_mean, y_std = preprocess_data(data=data, epsilon=EPS)
# Quick sanity check of the resulting dimensions.
X.shape, y.shape

In [None]:
# Preview the first rows of the feature table.
X.head()

In [None]:
# Display the country labels returned by preprocess_data.
countries

## Simple Prediction Model

### We start with a simple regression model

In [None]:
# Fraction of the rows (taken from the top of X) that goes into the training set.
percent_train = 0.9
# Row index at which the training slice ends and the validation slice begins.
number_train = int(percent_train * len(X))

In [None]:
# Build lagged inputs: each row holds the current features, the previous
# row's target (previous GDP) and the previous row's features. The first row
# has no predecessor, so it is dropped from both inputs and targets.
# NOTE(review): none of the visible later cells read `x_values` / `y_values`
# (the split below slices X.values / y.values directly) -- confirm whether
# the lagged inputs are meant to feed the model.
x_values = X.values
y_values = y.values

current_features = x_values[1:]
previous_features = x_values[:-1]
previous_target = y_values[:-1][:, np.newaxis]

x_values = np.concatenate([current_features, previous_target, previous_features], axis=1)
y_values = y_values[1:]

In [None]:
# Split features, targets and country labels at the same row index:
# the first `number_train` rows are used for training, the rest for validation.
features, targets, country_labels = X.values, y.values, countries.values

x_train, x_valid = features[:number_train, :], features[number_train:, :]
y_train, y_valid = targets[:number_train], targets[number_train:]
country_train, country_valid = country_labels[:number_train], country_labels[number_train:]

In [None]:
# Choose the compute backend: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
def spectral_norm(module, lip):
    """Rescale the weight of an ``nn.Linear`` in place so its spectral norm is ``lip``.

    Modules that are not ``nn.Linear`` are left untouched, as are linear layers
    whose largest singular value does not exceed ``EPS`` (avoids dividing by ~0).
    """
    if not isinstance(module, nn.Linear):
        return

    with torch.no_grad():
        # ord=2 on a matrix yields its largest singular value.
        largest_singular_value = torch.linalg.norm(module.weight, ord=2)
        if largest_singular_value > EPS:
            module.weight.mul_(lip / largest_singular_value)


def enforce_lipschitz(model, lip):
    """Apply ``spectral_norm`` with target ``lip`` to every sub-module of ``model``."""
    def _rescale(submodule):
        spectral_norm(submodule, lip)

    model.apply(_rescale)

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected regressor: num_features -> 100 -> 20 -> 1 with ReLU in between."""

    def __init__(self, num_features):
        super().__init__()
        # Kept as an attribute of the module; forward() does not call it.
        self.flatten = nn.Flatten()

        hidden_wide, hidden_narrow = 100, 20
        layers = [
            nn.Linear(num_features, hidden_wide),
            nn.ReLU(),
            nn.Linear(hidden_wide, hidden_narrow),
            nn.ReLU(),
            nn.Linear(hidden_narrow, 1),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return one scalar prediction per input row (last dimension of size 1)."""
        stack = self.linear_relu_stack
        return stack(x)
    
def train_nn(x_train, y_train, x_valid, y_valid, num_epochs=1000, learning_rate=1e-3, weight_decay=1e-5, lipschitz=None):
    """Train a ``NeuralNetwork`` with full-batch Adam and score it on a validation set.

    Args:
        x_train, x_valid: 2-D array-likes (NumPy arrays or tensors) of features;
            the number of columns of ``x_train`` fixes the model's input size.
        y_train, y_valid: 1-D array-likes of targets; a trailing dimension is
            added so they line up with the model's ``(n, 1)`` output.
        num_epochs: number of full-batch gradient steps.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        lipschitz: if not ``None``, ``enforce_lipschitz(model, lipschitz)`` is
            applied after every optimizer step.

    Returns:
        ``(model, validation_loss, y_pred)`` where ``validation_loss`` is the
        MSE on the validation set as a Python float and ``y_pred`` is the
        ``(n_valid, 1)`` tensor of validation predictions (on ``device``,
        computed without autograd tracking).
    """
    num_features = x_train.shape[1]
    model = NeuralNetwork(num_features=num_features).to(device)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # as_tensor accepts both NumPy arrays and tensors, so the function keeps
    # working (without a copy-construct warning) when it is re-run after the
    # inputs have already been converted to tensors elsewhere in the notebook.
    x_train_t = torch.as_tensor(x_train, dtype=torch.float32).to(device)
    y_train_t = torch.as_tensor(y_train, dtype=torch.float32).to(device).unsqueeze(1)
    x_valid_t = torch.as_tensor(x_valid, dtype=torch.float32).to(device)
    y_valid_t = torch.as_tensor(y_valid, dtype=torch.float32).to(device).unsqueeze(1)

    model.train()
    for t in tqdm(range(num_epochs)):
        y_pred = model(x_train_t)
        loss = loss_fn(y_pred, y_train_t)
        # Report the training loss on every 500th epoch.
        if t % 500 == 499:
            print(t, loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Re-impose the spectral-norm target after the weights have moved.
        if lipschitz is not None:
            enforce_lipschitz(model, lipschitz)

    # Evaluation only: no autograd graph is needed, so the returned
    # predictions do not keep intermediate activations alive.
    model.eval()
    with torch.no_grad():
        y_pred = model(x_valid_t)
        valid_loss = loss_fn(y_pred, y_valid_t).item()
    print(f"Validation loss: {valid_loss}")

    return model, valid_loss, y_pred

In [None]:
def smoothness_measure(sequence, real_sequence):
    """Compare the step-to-step changes of a predicted sequence with the real one.

    Takes the first differences of both (squeezed) sequences, subtracts them and
    returns the Euclidean norm of that gap divided by the number of steps, as a
    0-d NumPy array.
    """
    with torch.no_grad():
        predicted_steps = torch.diff(sequence.squeeze())
        actual_steps = torch.diff(real_sequence.squeeze())
        step_gap = predicted_steps - actual_steps
        score = torch.linalg.norm(step_gap) / step_gap.shape[0]
    return score.detach().clone().cpu().numpy()

In [None]:
# Validation targets as an (n_valid, 1) tensor, the layout smoothness_measure
# is compared against below.
y_valid_t = torch.tensor(y_valid, dtype=torch.float32).to(device).unsqueeze(1)

# Train one model per Lipschitz target and record its validation loss,
# smoothness score and the fitted model itself.
parameter_space = np.linspace(1e-1, 1, 30)
losses = []
for lip_target in parameter_space:
    model, loss, y_pred = train_nn(
        x_train, y_train, x_valid, y_valid,
        num_epochs=1000, learning_rate=1e-3, weight_decay=1e-5, lipschitz=lip_target,
    )
    smoothness = smoothness_measure(y_pred, y_valid_t)

    print(f"Lipschitz: {lip_target}, Loss: {loss}, Smoothness: {smoothness}")
    run_record = dict(lip=lip_target, loss=loss, model=model, smoothness=smoothness)
    losses.append(run_record)

In [None]:
# Loss and smoothness against the Lipschitz target, each on its own y axis.
lip_values = [run['lip'] for run in losses]
loss_values = [run['loss'] for run in losses]
smoothness_values = [run['smoothness'] for run in losses]

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.set_xlabel('Lipschitz')

curves = (
    (ax1, 'Loss', loss_values, 'tab:red'),
    (ax2, 'Smoothness', smoothness_values, 'tab:blue'),
)
for axis, label, values, color in curves:
    axis.set_ylabel(label, color=color)
    axis.plot(lip_values, values, color=color)
    axis.tick_params(axis='y', labelcolor=color)

fig.tight_layout()
plt.show()

In [None]:
# Show the grid of Lipschitz targets used in the sweep (np.linspace(0.1, 1, 30)).
parameter_space

In [None]:
# Locate the runs with the lowest smoothness score and the lowest validation loss.
smoothness_per_run = [run['smoothness'] for run in losses]
loss_per_run = [run['loss'] for run in losses]

smallest_smoothness = np.argmin(smoothness_per_run)
smallest_loss = np.argmin(loss_per_run)

smoothest_run = losses[smallest_smoothness]
best_loss_run = losses[smallest_loss]
print(f"Smallest smoothness: {smoothest_run}")
print(f"Smallest loss: {best_loss_run}")

# Continue with the model that achieved the lowest validation loss.
model = best_loss_run['model']

In [None]:
# Predict with the `model` chosen in the selection cell (lowest validation
# loss). It is deliberately not replaced by a hard-coded entry of `losses`,
# which would silently discard that selection.
model.eval()

# as_tensor keeps this cell re-runnable: it accepts the NumPy arrays from the
# split cell as well as tensors left over from a previous run of this cell.
x_valid = torch.as_tensor(x_valid, dtype=torch.float32).to(device)
x_train = torch.as_tensor(x_train, dtype=torch.float32).to(device)

# Inference only, so no autograd graph is built.
with torch.no_grad():
    y_pred = model(x_valid).cpu().numpy().squeeze()
    y_pred_train = model(x_train).cpu().numpy().squeeze()

In [None]:
# Attach date and country to every prediction, separately for the validation
# rows (from `number_train` on) and the training rows (before `number_train`).
dates = X['date']

valid_columns = {
    'date': dates[number_train:],
    'country': country_valid,
    'y_pred': y_pred,
    'y_true': y_valid,
}
train_columns = {
    'date': dates[:number_train],
    'country': country_train,
    'y_pred': y_pred_train,
    'y_true': y_train,
}

y_pred_country = pd.DataFrame(valid_columns)
y_pred_train_country = pd.DataFrame(train_columns)
y_pred_train_country

In [None]:
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

In [None]:
# Stack the training rows on top of the validation rows.
predictions = pd.concat([y_pred_train_country, y_pred_country])

# Long format (one row per date/country/series) so that predicted and true
# values can be drawn as two hues of the same line plot.
predictions_melted = pd.melt(
    predictions,
    id_vars=["date", "country"],
    value_vars=["y_pred", "y_true"],
    var_name="Type",
    value_name="Value",
)



# Function to plot data for the selected country
def plot_by_country(selected_country):
    """Plot predicted vs. true values over time for one country.

    Reads the notebook-level ``predictions_melted`` (long-format data),
    ``predictions`` and ``percent_train``. A dashed red vertical line is drawn
    at the ``percent_train`` quantile of all dates to mark where the
    validation rows begin.

    Args:
        selected_country: value of the ``country`` column to plot.
    """
    filtered_data = predictions_melted[predictions_melted["country"] == selected_country]
    cutoff_date = predictions['date'].quantile(percent_train)
    plt.figure(figsize=(12, 6))
    sns.lineplot(
        data=filtered_data,
        x="date", y="Value", hue="Type", style="Type", markers=True, dashes=False
    )
    plt.title(f"Prediction vs True Values for {selected_country}")
    plt.xlabel("Date")
    plt.ylabel("Values")
    # percent_train is a fraction (e.g. 0.9); format it as a percentage so the
    # legend reads "90%" rather than "0.9%".
    plt.axvline(x=cutoff_date, color='red', linestyle='--', label=f'Validation Start ({percent_train:.0%})')
    plt.legend(title="Legend")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Create a dropdown widget for selecting the country.
# A dedicated name is used for the options so the `countries` Series returned
# by preprocess_data is not overwritten; the train/validation split cell reads
# `countries.values` and would fail on re-run if `countries` became an array.
country_options = predictions["country"].unique().tolist()
dropdown = widgets.Dropdown(
    options=country_options,
    value=country_options[0],
    description='Country:'
)

# Use the interact function to link the dropdown with the plot function.
# plot_by_country calls plt.show() itself; the trailing semicolon only
# suppresses the repr of interact's return value.
interact(plot_by_country, selected_country=dropdown);