In [9]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import random

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shuxu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:

train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
test_df = pd.read_csv("test.csv")


In [11]:

all_sentences = train_df['query'].tolist() + val_df['query'].tolist() + test_df['query'].tolist()
tokenized = [word_tokenize(sent.lower()) for sent in all_sentences]

w2v_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)


In [12]:

vocab = {"<PAD>": 0, "<UNK>": 1}
for sent in tokenized:
    for word in sent:
        if word not in vocab:
            vocab[word] = len(vocab)

embedding_matrix = np.zeros((len(vocab), 100))
for word, idx in vocab.items():
    if word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(100,))


In [13]:

class TextDataset(Dataset):
    def __init__(self, queries, labels, vocab, max_len=20):
        self.data = []
        for q, label in zip(queries, labels):
            tokens = word_tokenize(q.lower())
            indexed = [vocab.get(word, vocab["<UNK>"]) for word in tokens][:max_len]
            if len(indexed) < max_len:
                indexed += [vocab["<PAD>"]] * (max_len - len(indexed))
            self.data.append((torch.tensor(indexed), torch.tensor(label, dtype=torch.float)))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


In [19]:

import torch.nn as nn
import torch

class LSTMRegressor(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=64, num_layers=2, dropout=0.2, bidirectional=False):
        super(LSTMRegressor, self).__init__()
        vocab_size, embed_dim = embedding_matrix.shape

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
        self.embedding.weight.requires_grad = True  # Enable fine-tuning

        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional
        )

        lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(lstm_output_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        out = hn[-1]
        return self.fc(out).squeeze(1)


In [20]:

class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        return torch.sqrt(self.mse(yhat, y))


In [23]:

batch_size = 32
epochs = 100

train_dataset = TextDataset(train_df['query'], train_df['carb'], vocab)
val_dataset = TextDataset(val_df['query'], val_df['carb'], vocab)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

model = LSTMRegressor(embedding_matrix, num_layers = 5).to(device)
criterion = RMSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(x_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Train RMSE: {total_loss / len(train_loader):.4f}")

    # Validation RMSE
    model.eval()
    val_preds, val_targets = [], []
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val, y_val = x_val.to(device), y_val.to(device)
            preds = model(x_val)
            val_preds.append(preds.cpu())
            val_targets.append(y_val.cpu())

    val_preds = torch.cat(val_preds)
    val_targets = torch.cat(val_targets)
    val_rmse = torch.sqrt(nn.MSELoss()(val_preds, val_targets))
    print(f"           Val RMSE: {val_rmse.item():.4f}")


Epoch 1, Train RMSE: 35.0429
           Val RMSE: 39.7085
Epoch 2, Train RMSE: 34.3569
           Val RMSE: 39.5235
Epoch 3, Train RMSE: 34.5364
           Val RMSE: 39.5185
Epoch 4, Train RMSE: 34.3208
           Val RMSE: 39.4944
Epoch 5, Train RMSE: 34.4210
           Val RMSE: 39.4568
Epoch 6, Train RMSE: 34.4950
           Val RMSE: 39.4665
Epoch 7, Train RMSE: 34.6271
           Val RMSE: 39.4815
Epoch 8, Train RMSE: 34.7914
           Val RMSE: 38.8438
Epoch 9, Train RMSE: 32.3220
           Val RMSE: 37.9220
Epoch 10, Train RMSE: 32.2038
           Val RMSE: 37.6684
Epoch 11, Train RMSE: 31.2227
           Val RMSE: 36.1114
Epoch 12, Train RMSE: 30.1810
           Val RMSE: 35.9601
Epoch 13, Train RMSE: 30.1940
           Val RMSE: 36.1214
Epoch 14, Train RMSE: 29.5998
           Val RMSE: 36.0142
Epoch 15, Train RMSE: 30.2201
           Val RMSE: 36.0208
Epoch 16, Train RMSE: 32.0304
           Val RMSE: 37.5401
Epoch 17, Train RMSE: 32.3447
           Val RMSE: 36.3528
Epoch 