In [None]:
%load_ext autoreload
%autoreload 
%cd ..

In [None]:
from finn_deals.scraping.finn import FinnAPI

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from dataclasses import dataclass

In [None]:
# Client object from finn_deals.scraping.finn, built with its default constructor arguments.
api = FinnAPI()

In [None]:

# Run the sharded search for the term "gitar"; the call returns the DataFrame
# that the rest of the notebook works on.
df = api.search_dataframe_sharded(query="gitar")

# Parse the ISO-8601 timestamps. `utc=True` normalises every value to a
# tz-aware UTC datetime, so the subtraction below cannot fail on tz-naive
# input and mixed UTC offsets do not degrade the column to object dtype.
df["timestamp"] = pd.to_datetime(df["timestamp"], format="ISO8601", utc=True)

# Age of each row's timestamp relative to the moment this cell is executed.
df["time_since_posted"] = datetime.now(timezone.utc) - df["timestamp"]

# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
from finn_deals.modeling.utils import WordPieceTokenizer, Log1pMinMaxScaler
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from typing import List, Iterable, Dict

def pad_token_ids(sequences, pad_id):
    """Stack variable-length token-id sequences into one padded tensor.

    Args:
        sequences: Iterable of token-id sequences (assumed 1-D, one per title).
        pad_id: Id written into the positions past the end of shorter sequences.

    Returns:
        A ``torch.long`` tensor with one row per sequence, padded on the right
        to the length of the longest sequence in ``sequences``.
    """
    return pad_sequence(
        [torch.tensor(seq, dtype=torch.long) for seq in sequences],
        batch_first=True,
        padding_value=pad_id,
    )


tokenizer = WordPieceTokenizer()

# Word list the tokenizer is fitted on: lower-case every title, collapse each
# run of characters outside a-z / 0-9 into one space, then split on whitespace.
# NOTE(review): the character class also removes non-ASCII letters such as
# æ, ø and å — confirm this is intended for the listing titles.
vocab = " ".join(
    df["title"]
    .str.lower()
    .str.replace(r"[^a-z0-9]+", " ", regex=True)
    .str.strip()
    .tolist()
).split()

tokenizer.fit(vocab)

# Token-id sequences for the raw titles (one sequence per row).
X = tokenizer(df["title"])

# NOTE(review): tokenizer and scaler are both fitted on the full data set
# before the split below, so test rows influence the fitted statistics —
# confirm this is acceptable for the reported metrics.
scaler = Log1pMinMaxScaler()
y = scaler.fit_transform(df["price_amount"].values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and test are padded independently, so their widths may differ.
X_train = pad_token_ids(X_train, tokenizer.pad_token_id)
X_test = pad_token_ids(X_test, tokenizer.pad_token_id)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Word frequencies of the cleaned titles, most common first.
print(pd.Series(vocab).value_counts())

In [None]:
# Sanity check: dimensions of the padded test tensor.
X_test.shape

In [None]:
from finn_deals.modeling.nn import SentimentModel, SentimentModelConfig
# Model hyper-parameters. The vocabulary size and the padding index are read
# from the fitted tokenizer; the remaining values are fixed choices.
cfg = SentimentModelConfig(
    vocab_size=len(tokenizer),
    padding_idx=tokenizer.pad_token_id,
    embedding_dim=32,
    positional_encoding_type="sinusoidal",
    seq_max_len=100,
    mlp_hidden_layers=[64],
)

# Build the network from the configuration and echo the settings.
model = SentimentModel(cfg)
print(cfg)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the weight initialisation so re-running this cell starts from the same
# parameters.
torch.manual_seed(42)

# Pass `cfg` explicitly: building the model without it would ignore the
# tokenizer-derived vocabulary size and padding index set in the config cell.
model = SentimentModel(cfg).to(device)

X_train = X_train.to(device)
y_train = y_train.to(device).float()

X_test = X_test.to(device)
y_test = y_test.to(device).float()

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_iterations = 5000  # upper bound on full-batch optimisation steps
patience = 50          # steps without a better held-out loss before stopping
best_val_loss = float("inf")
# Initialised here so a run that never records an improvement (e.g. NaN loss)
# cannot hit a NameError or pick up a stale value from an earlier execution.
best_state_dict = None
steps_without_improvement = 0

# NOTE(review): early stopping selects the checkpoint on the same split that is
# later reported as "test", so the final metrics are optimistic — consider a
# separate validation split.
# NOTE(review): nn.MSELoss broadcasts when prediction and target shapes differ
# (e.g. (N, 1) vs (N,)) — confirm the model output has the same shape as y.
for it in range(1, num_iterations + 1):

    # ---- train step (one full-batch update) ----
    model.train()
    optimizer.zero_grad()
    pred = model(X_train)
    loss = criterion(pred, y_train)
    loss.backward()
    optimizer.step()

    # ---- held-out loss ----
    model.eval()
    with torch.no_grad():
        test_pred = model(X_test)
        test_loss = criterion(test_pred, y_test)

    # ---- early stopping ----
    current_loss = test_loss.item()
    if current_loss < best_val_loss:
        best_val_loss = current_loss
        steps_without_improvement = 0
        # Clone the tensors: state_dict() returns references that the optimizer
        # keeps updating in place.
        best_state_dict = {k: v.clone() for k, v in model.state_dict().items()}
    else:
        steps_without_improvement += 1

    # ---- logging ----
    if it % 10 == 0:
        rmse = math.sqrt(loss.item())
        test_rmse = math.sqrt(current_loss)
        print(
            f"Iter {it:05d} | Train RMSE: {rmse:.4f} | Test RMSE: {test_rmse:.4f}"
        )

    if steps_without_improvement >= patience:
        print(f"Early stopping at iteration {it}")
        break

# Restore the best checkpoint, if one was recorded.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
else:
    print("No improving test loss was recorded; keeping the final weights.")
print(f"Best Test RMSE: {math.sqrt(best_val_loss):.4f}")

# Final predictions on the held-out split as a NumPy array; no autograd graph
# is needed for inference.
model.eval()
with torch.no_grad():
    pred = model(X_test).cpu().numpy()


In [None]:

# Residuals (actual - predicted) on the held-out split: first in the scaled
# space the model was trained in, then after mapping both sides back through
# `scaler.untransform`.
y_true_scaled = y_test.cpu().detach().numpy()
residual_plots = [
    (
        y_true_scaled - pred,
        "Residuals in scaled target space",
        "actual - predicted (scaled)",
    ),
    (
        scaler.untransform(y_true_scaled) - scaler.untransform(pred),
        "Residuals after scaler.untransform",
        "actual - predicted (untransformed)",
    ),
]

for residuals, title, xlabel in residual_plots:
    fig, ax = plt.subplots()
    ax.hist(residuals, bins=50)
    # Log-scaled counts keep sparsely populated tail bins visible.
    ax.set_yscale("log")
    ax.set(title=title, xlabel=xlabel, ylabel="count (log scale)")
    plt.show()



In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Held-out targets and predictions as NumPy arrays, both in the scaled space
# and after mapping back through `scaler.untransform`.
y_true_scaled = y_test.cpu().detach().numpy()
y_true_nominal = scaler.untransform(y_true_scaled)
pred_nominal = scaler.untransform(pred)

# Coefficient of determination in both spaces, printed as percentages.
r2_nominal = r2_score(y_true_nominal, pred_nominal)
print(f"Nominal R²: {r2_nominal:.2%}")
r2_scaled = r2_score(y_true_scaled, pred)
print(f"Log1p R²: {r2_scaled:.2%}")

# Mean absolute error on the untransformed values.
mae_nominal = mean_absolute_error(y_true_nominal, pred_nominal)
print(f"Nominal MAE: {mae_nominal:,.2f}")