In [1]:
from torch import Tensor
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from utils import plot_results


def accuracy(model, dataset: Dataset) -> float:
    """
    copied from csc413 lab 1
    Compute the accuracy of `model` over the `dataset`.
    We will take the **most probable class**
    as the class predicted by the model.

    Parameters:
        `model` - A torch.nn model. We will only be passing `nn.Linear` models.
                  However, to make your code more generally useful, do not access
                  `model.weight` and `model.bias` parameters directly. These
                  class attributes may not exist for other kinds of models.
        `dataset` - A list of 2-tuples of the form (x, t), where `x` is a PyTorch
                  tensor of shape [1, 28, 28] representing an MNIST image,
                  and `t` is the corresponding target label

    Returns: a floating-point value between 0 and 1.
    """
    total = 0
    distance = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    for i in range(500):
        data = dataset[i]

        input_ids = data['input_ids'].unsqueeze(0).to(device)
        attention_mask = data['attention_mask'].unsqueeze(0).to(device)
        categorical_features = data['categorical_features'].unsqueeze(0).to(device)
        label = data['labels']

        output = model(input_ids, attention_mask, categorical_features)
        output = output.item()

        distance += float(abs(label-output))
        total += 1

    return distance / total

class GPT1Dataset(Dataset):
    def __init__(self, input_ids: list[Tensor],
                 attention_mask: list[Tensor],
                 categorical_features: list[Tensor],
                 labels: list[float]):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.categorical_features = categorical_features
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'categorical_features': self.categorical_features[idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

def _collate_batch(batch):
    """Custom collate function for handling batches of data where all input tensors are of the same length."""

    # Separate and stack the data directly since all tensors are already of the same length
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    categorical_features = torch.stack([item['categorical_features'] for item in batch]).float()
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.float)

    return input_ids, attention_mask, categorical_features, labels

def train_model(model,
                train_data: GPT1Dataset,
                val_data: GPT1Dataset,
                learning_rate=0.01,
                batch_size=100,
                num_epochs=10,
                plot_every=50,
                plot=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=_collate_batch)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    iters, losses, train_mae, val_mae = [], [], [], []
    iter_count = 0

    for epoch in range(num_epochs):
        model.train()
        for input_ids, attention_mask, categorical_features, label in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            categorical_features = categorical_features.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, categorical_features)
            outputs = outputs.squeeze()
            loss = criterion(outputs, label.float())
            loss.backward()
            optimizer.step()

            if (iter_count + 1) % plot_every == 0:
                iters.append(iter_count)
                losses.append(float(loss))
                train_mae.append(accuracy(model, train_data))
                val_mae.append(accuracy(model, val_data))
                print(
                    f"Iter {iter_count + 1}: Loss: {losses[-1]} Train mae {train_mae[-1]}, Validation mae {val_mae[-1]}")
            iter_count += 1

    if plot:
        plot_results(iters, losses, train_mae, val_mae)

In [2]:
from utils import read_csv_data, clean_location

data = read_csv_data("../data/processed_job_postings_large.csv", 
                     ["industry", "work_type", "location", "formatted_experience_level",
                      "name", "cleaned_title", "cleaned_description"],
                     "standardized_annual_salary")
data = clean_location(data, 2)
import random
random.seed(42)
random.shuffle(data)

train_data = data[:20000]
val_data = data[25000:30000]
test_data = data[30000:]

In [3]:
from utils import build_column_vocabulary

vocab_industry = build_column_vocabulary(train_data, 0)
vocab_type = build_column_vocabulary(train_data, 1)
vocab_state = build_column_vocabulary(train_data, 2)
vocab_level = build_column_vocabulary(train_data, 3)

In [4]:
from utils import convert_to_one_hot
import torch

train_cat_features = convert_to_one_hot(train_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

val_cat_features = convert_to_one_hot(val_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

# Convert Lists to Tensors
train_cat_features = torch.stack(train_cat_features)
val_cat_features = torch.stack(val_cat_features)

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right" 
a = tokenizer('hello this is a test',
         truncation=True,
         padding='max_length',
         max_length=512,
         return_tensors="pt")
a

{'input_ids': tensor([[ 101, 7592, 2023, 2003, 1037, 3231,  102,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,  100,
          100,

In [6]:
descriptions = [item[0][6] for item in train_data]
input_ids, attention_mask = [], []
for description in descriptions:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids)
attention_mask = torch.stack(attention_mask)

labels = [float(target) for _, target in train_data]

train_dataset = GPT1Dataset(input_ids, attention_mask, train_cat_features, labels)

In [7]:
train_dataset[0]

{'input_ids': tensor([  101,  3353,  2622, 10489,  2024,  3625,  2005,  4041,  1998, 19642,
          3934,  2000,  5676,  2027,  2024,  2949,  1999,  1037, 23259,  4827,
          1998,  2306,  5166,  2551,  4077,  1996,  2622,  3208,  2000,  2933,
          1998, 24414,  2622,  4219,  7374, 26178,  8080,  5082,  1998,  2562,
         22859,  6727,  1996,  2972,  2126, 20464, 11638,  4751,  8162,  7396,
          2003,  2028,  1997,  1996,  2327,  4396,  2236, 16728,  1999,  1996,
          5519,  3655,  2027,  2024,  2559,  2005,  2019,  5281,  3353,  2622,
          3208,  2000,  3693,  2037,  3652,  2136,  2007,  2019, 11757,  7965,
         13117,  2005, 16798,  2549,  1998,  3458,  2023, 10887,  1999,  9897,
          2005,  2037,  4800,  7011,  4328,  2135,  1998,  3293,  2810,  2136,
          6155, 23235,  3258,  9006, 10814,  4570,  4475, 19744,  2015,  1998,
         21009,  3465, 10035,  2890,  8584,  2015,  2622,  6378,  9254,  1998,
          3488,  2000,  5646,  2051,  4

In [8]:
descriptions = [item[0][6] for item in val_data]
input_ids, attention_mask = [], []
for description in descriptions:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids)
attention_mask = torch.stack(attention_mask)

labels = [float(target) for _, target in val_data]

val_dataset = GPT1Dataset(input_ids, attention_mask, val_cat_features, labels)

In [9]:
train_dataset[0]["input_ids"].shape

torch.Size([512])

In [10]:
from bert import Bert

model = Bert(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level), hidden_size=200)

train_model(model, train_data=train_dataset, val_data=train_dataset, batch_size=20)

Iter 50: Loss: 4200276736.0 Train mae 70497.21056687502, Validation mae 70414.8842465625
Iter 100: Loss: 1380723072.0 Train mae 40022.00003312499, Validation mae 39641.79265031249
Iter 150: Loss: 2900397568.0 Train mae 39036.33565031249, Validation mae 39285.11362687499
Iter 200: Loss: 1590459008.0 Train mae 40567.50217374999, Validation mae 40274.77172062499
Iter 250: Loss: 1934695424.0 Train mae 39656.12401749999, Validation mae 39328.94261124999
Iter 300: Loss: 1514535936.0 Train mae 39675.96834562499, Validation mae 39341.75668937499
Iter 350: Loss: 5450913792.0 Train mae 40652.181517499994, Validation mae 40678.81754874999


KeyboardInterrupt: 

In [25]:
torch.cuda.empty_cache()

In [12]:
model.fc1.weight.shape

torch.Size([200, 1075])