In [10]:
%pip install --quiet sqlalchemy psycopg2-binary pandas tqdm pyarrow ipywidgets scikit-learn
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [11]:
from sqlalchemy import create_engine
import pandas as pd
from collections import Counter
import unicodedata
import math
import os
import numpy as np
from sqlalchemy import text
from tqdm import tqdm
from collections import Counter
import unicodedata
import re
import torch
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm.notebook import tqdm
from collections import Counter
import os
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
# TODO Do not duplicate code for this class
class CBOW(nn.Module):
    """Continuous bag-of-words network.

    Looks up an embedding for every input index, averages the embeddings along
    dim 1, and projects the average onto ``vocab_size`` output logits.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of rows in the embedding table and number of output logits.
            embedding_dim: width of each embedding vector.
        """
        super().__init__()
        # TODO Do we have to do max norm here?
        # max_norm=1.0 makes the lookup renormalise any accessed row whose norm exceeds 1.
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            max_norm=1.0,
        )
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return logits for ``inputs``.

        With a 2-D ``(batch, context)`` index tensor the looked-up embeddings are
        ``(batch, context, embedding_dim)``; they are averaged over dim 1 and the
        result is ``(batch, vocab_size)``.
        """
        pooled = self.embeddings(inputs).mean(dim=1)
        return self.linear(pooled)

In [None]:
# Restore the pretrained CBOW checkpoint on CPU. The file is read for three keys:
# 'word2idx', 'idx2word' and 'model_state_dict'.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source (consider weights_only=True where the installed PyTorch supports it).
checkpoint = torch.load("cbow_final_with_vocab.pt", map_location=torch.device('cpu'))

word2idx = checkpoint['word2idx']
idx2word = checkpoint['idx2word']

vocab_size = len(word2idx)
# Take the embedding width from the saved weights instead of hard-coding it, so the
# model always matches the checkpoint. CBOW stores its table as `embeddings`, hence
# the 'embeddings.weight' key with shape (vocab_size, embedding_dim).
embedding_dim = int(checkpoint['model_state_dict']['embeddings.weight'].shape[1])
model = CBOW(vocab_size, embedding_dim)

model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [2]:
# Database connection settings come from the environment so that no secret is stored
# in the notebook (or in its saved outputs / version history).
# NOTE(review): credentials that were previously committed in this cell should be
# treated as leaked and rotated.
_required_db_env = ("HN_DB_USER", "HN_DB_PASSWORD", "HN_DB_HOST", "HN_DB_NAME")
_missing_db_env = [name for name in _required_db_env if not os.environ.get(name)]
if _missing_db_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_db_env)
    )

user = os.environ["HN_DB_USER"]
password = os.environ["HN_DB_PASSWORD"]
host = os.environ["HN_DB_HOST"]
port = os.environ.get("HN_DB_PORT", "5432")  # same default port the notebook used before
db = os.environ["HN_DB_NAME"]

engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")

In [3]:
# Pull every story row that is not flagged dead; only the listed columns are fetched.
query = (
    "SELECT title, url, by, time, score "
    "FROM hacker_news.items "
    "WHERE type='story' AND dead IS NULL"
)
df = pd.read_sql(sql=query, con=engine)

In [None]:
# Write the query result to a local Parquet file so it can be reloaded without the database.
df.to_parquet("hackernews_stories.parquet")

In [None]:
# Load the stories from the local Parquet file; this rebinds `df`.
df = pd.read_parquet("hackernews_stories.parquet")

In [6]:
# Keep only rows with a non-negative score and renumber the index from 0.
df = df.loc[df['score'] >= 0].reset_index(drop=True)

In [7]:
# Regression target: log(score + 1). np.log1p is applied to the whole column at once
# instead of calling a Python lambda once per row.
df['score_log'] = np.log1p(df['score'])

In [None]:
class TitleWordIndicesDataset(Dataset):
    """Dataset of ``(word_indices, score)`` pairs built from story titles.

    Each sample is a tuple of a non-empty ``list[int]`` of vocabulary indices and a
    0-dim float32 tensor holding the row's ``score_log`` value. A row is skipped when
    its title is not a string, its ``score_log`` is missing, or none of its tokens is
    present in ``word2idx``.
    """

    def __init__(self, dataframe, word2idx):
        """Tokenise every title once, up front.

        Args:
            dataframe: frame with a ``title`` column and a ``score_log`` column.
            word2idx: mapping from lowercase token to vocabulary index.
        """
        self.samples = []
        # Walk the two needed columns directly. ``iterrows()`` would materialise a
        # Series for every row, which is needlessly slow on millions of titles.
        rows = zip(dataframe['title'], dataframe['score_log'])
        for title, score_val in tqdm(rows, total=len(dataframe), desc="Processing titles"):
            if not isinstance(title, str) or pd.isna(score_val):
                continue
            indices = self._encode_title(title, word2idx)
            if indices:
                score = torch.tensor(score_val, dtype=torch.float32)
                self.samples.append((indices, score))

    @staticmethod
    def _encode_title(title, word2idx):
        """Lowercase ``title``, split on whitespace, and keep the indices of known tokens."""
        return [word2idx[w] for w in title.lower().split() if w in word2idx]

    def __len__(self):
        """Number of retained samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(word_indices, score)`` tuple stored at position ``idx``."""
        return self.samples[idx]

In [9]:
# Chronological ordering, so the held-out rows are the most recent ones.
df_sorted = df.sort_values(by='time')

# The first 80% of the rows (by time) go to training, the rest to testing.
split_idx = int(len(df_sorted) * 0.8)

# Cut at the boundary and give each part a fresh 0-based index.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in (df_sorted.iloc[:split_idx], df_sorted.iloc[split_idx:])
)

# Tokenise both splits against the pretrained vocabulary (train first, then test).
title_indices_train, title_indices_test = (
    TitleWordIndicesDataset(frame, word2idx) for frame in (df_train, df_test)
)

Processing titles:   0%|          | 0/3256354 [00:00<?, ?it/s]

Processing titles:   0%|          | 0/814089 [00:00<?, ?it/s]

In [12]:
# Mean of the log-transformed score over the whole frame (train and test rows together).
# NOTE(review): the evaluation cell rebinds `average_score` to the training-split mean.
average_score = df['score_log'].mean()
print(f"Average score: {average_score}")

Average score: 1.5576210926494947


In [13]:
class TitleRegressionNN(nn.Module):
    """Regress one number from a title: mean word embedding -> hidden layer -> scalar.

    The embedding layer is supplied by the caller and stored as a submodule, so its
    weights are part of ``parameters()`` of this model.
    """

    def __init__(self, embedding_layer, embedding_dim, hidden_dim=512):
        """
        Args:
            embedding_layer: module mapping index tensors to embeddings of width
                ``embedding_dim`` (e.g. the pretrained CBOW embedding table).
            embedding_dim: width of the vectors produced by ``embedding_layer``.
            hidden_dim: width of the hidden fully-connected layer.
        """
        super().__init__()
        # Use the pretrained embedding weights from the CBOW model
        self.embeddings = embedding_layer
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)  # Output is a single numeric value

    def forward(self, input_indices, lengths=None):
        """Predict one value per sequence.

        Args:
            input_indices: ``(batch_size, seq_len)`` tensor of vocabulary indices.
            lengths: optional true (unpadded) length of each sequence, as a tensor or
                sequence of ints of size ``batch_size``. When given, only the first
                ``lengths[b]`` positions of row ``b`` contribute to the average, so
                padding does not dilute the title representation. When omitted, every
                position is averaged (the original behaviour).

        Returns:
            Tensor of shape ``(batch_size,)``.
        """
        embeds = self.embeddings(input_indices)  # (batch_size, seq_len, embedding_dim)
        if lengths is None:
            avg_embeds = embeds.mean(dim=1)      # (batch_size, embedding_dim)
        else:
            lengths = torch.as_tensor(lengths, device=embeds.device)
            positions = torch.arange(embeds.size(1), device=embeds.device)
            # keep[b, t] is 1 for real tokens and 0 for padding positions.
            keep = (positions.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(-1).to(embeds.dtype)
            # clamp guards against division by zero for an all-padding row.
            counts = lengths.clamp(min=1).unsqueeze(1).to(embeds.dtype)
            avg_embeds = (embeds * keep).sum(dim=1) / counts
        x = self.fc1(avg_embeds)
        x = self.relu(x)
        x = self.fc2(x)
        return x.squeeze(-1)  # (batch_size,)

# Build the regressor on top of the CBOW embedding layer. The layer object itself is
# passed in (not a copy), so `model.embeddings` and `model_reg.embeddings` share weights.
model_reg = TitleRegressionNN(model.embeddings, embedding_dim)

In [14]:
# Pick the compute device in priority order: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print("Device:", device)

Device: cuda


In [15]:
# Training settings
batch_size = 128  # number of titles per optimisation step
num_epochs = 3  # number of full passes over the training set
learning_rate = 1e-3  # learning rate given to the optimiser created later in this cell

# Move model to device
# (rebinds the same name; parameters now live on `device`)
model_reg = model_reg.to(device)

# Prepare DataLoader with padding for variable-length sequences
def collate_fn(batch, pad_idx=0):
    """Collate ``(indices, score)`` samples into a padded index matrix and a score vector.

    Args:
        batch: non-empty list of ``(indices, score)`` tuples. ``indices`` is any
            sequence of ints (list, tuple or 1-D tensor); ``score`` is a float or a
            0-dim tensor.
        pad_idx: index written into positions past the end of shorter sequences.

    Returns:
        ``(padded_indices, scores)`` where ``padded_indices`` is a long tensor of
        shape ``(len(batch), max_len)`` and ``scores`` is a float32 tensor of shape
        ``(len(batch),)``.

    Raises:
        ValueError: if ``batch`` is empty.

    NOTE(review): the default pad value 0 is also a valid vocabulary index, so a
    consumer that averages over all positions treats padding as real tokens.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")
    # batch is a list of (indices, score) tuples
    indices_list, scores_list = zip(*batch)
    max_len = max(len(seq) for seq in indices_list)
    # Start from an all-padding matrix and copy each sequence into its row prefix.
    padded_indices = torch.full((len(indices_list), max_len), pad_idx, dtype=torch.long)
    for row, seq in enumerate(indices_list):
        if len(seq):
            padded_indices[row, :len(seq)] = torch.as_tensor(seq, dtype=torch.long)
    # float() accepts both plain numbers and 0-dim tensors.
    scores = torch.tensor([float(s) for s in scores_list], dtype=torch.float32)
    return padded_indices, scores

# Shuffled mini-batches over the training titles; collate_fn pads each batch to a common length.
train_loader = DataLoader(
    title_indices_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

# Log-score targets of the training frame as one tensor.
# NOTE(review): not used by the loop below; each batch carries its own targets.
train_scores = torch.tensor(df_train['score_log'].values, dtype=torch.float32)

# Mean-squared-error objective, optimised with Adam over all parameters of model_reg.
criterion = nn.MSELoss()
optimizer = optim.Adam(model_reg.parameters(), lr=learning_rate)

model_reg.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (batch_indices, batch_scores) in enumerate(train_loader):
        batch_indices = batch_indices.to(device)
        batch_scores = batch_scores.to(device)

        # Standard step: clear gradients, forward pass, backward pass, parameter update.
        optimizer.zero_grad()
        outputs = model_reg(batch_indices)
        loss = criterion(outputs, batch_scores)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 100 != 0:
            continue

        # Every 100 steps, report the mean loss of that window and start a new one.
        avg_loss = running_loss / 100
        print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {avg_loss:.4f}")
        running_loss = 0.0

Epoch [1/3], Step [100/25252], Loss: 3853.3601
Epoch [1/3], Step [200/25252], Loss: 3462.7609
Epoch [1/3], Step [300/25252], Loss: 3211.3832
Epoch [1/3], Step [400/25252], Loss: 3677.0475
Epoch [1/3], Step [500/25252], Loss: 3591.5478
Epoch [1/3], Step [600/25252], Loss: 3307.8446
Epoch [1/3], Step [700/25252], Loss: 4321.3422
Epoch [1/3], Step [800/25252], Loss: 3307.4646
Epoch [1/3], Step [900/25252], Loss: 3335.7904
Epoch [1/3], Step [1000/25252], Loss: 3269.4956
Epoch [1/3], Step [1100/25252], Loss: 3195.2012
Epoch [1/3], Step [1200/25252], Loss: 3826.9996
Epoch [1/3], Step [1300/25252], Loss: 3981.0746
Epoch [1/3], Step [1400/25252], Loss: 4227.4546
Epoch [1/3], Step [1500/25252], Loss: 3346.5518
Epoch [1/3], Step [1600/25252], Loss: 3162.9413
Epoch [1/3], Step [1700/25252], Loss: 3678.6170
Epoch [1/3], Step [1800/25252], Loss: 3114.2157
Epoch [1/3], Step [1900/25252], Loss: 3955.8900
Epoch [1/3], Step [2000/25252], Loss: 6528.0529
Epoch [1/3], Step [2100/25252], Loss: 3312.9487
E

In [16]:
# Unshuffled loader over the held-out titles.
test_loader = DataLoader(
    title_indices_test,
    batch_size=256,
    shuffle=False,
    collate_fn=collate_fn,
)

model_reg.eval()
all_preds = []
all_targets = []

# Inference only: no gradients are tracked while collecting per-batch predictions.
with torch.no_grad():
    for batch_indices, batch_scores in tqdm(test_loader, desc="Evaluating"):
        batch_indices = batch_indices.to(device)
        outputs = model_reg(batch_indices).cpu().numpy()
        all_preds.append(outputs)
        all_targets.append(batch_scores.numpy())

# Join the per-batch arrays into one flat array each.
all_preds = np.concatenate(all_preds)
all_targets = np.concatenate(all_targets)

# Baseline: a constant prediction equal to the mean log-score of the training frame.
average_score = df_train['score_log'].mean()
baseline_preds = np.full_like(all_targets, average_score)

# Each metric is evaluated twice: first for the model, then for the baseline.
mse_model, mse_baseline = (
    mean_squared_error(all_targets, preds) for preds in (all_preds, baseline_preds)
)
r2_model, r2_baseline = (
    r2_score(all_targets, preds) for preds in (all_preds, baseline_preds)
)
mae_model, mae_baseline = (
    mean_absolute_error(all_targets, preds) for preds in (all_preds, baseline_preds)
)

print(f"Model MSE: {mse_model:.4f}, Baseline MSE: {mse_baseline:.4f}")
print(f"Model MAE: {mae_model:.4f}, Baseline MAE: {mae_baseline:.4f}")
print(f"Model R2: {r2_model:.4f}, Baseline R2: {r2_baseline:.4f}")

Evaluating:   0%|          | 0/3147 [00:00<?, ?it/s]

Model MSE: 5592.7798, Baseline MSE: 5998.0386
Model MAE: 25.6618, Baseline MAE: 19.1959
Model R2: 0.0084, Baseline R2: -0.0635
