In [21]:
from utils import read_csv_data, clean_location

data = read_csv_data("../data/processed_job_postings_large.csv", 
                     ["industry", "work_type", "location", "formatted_experience_level",
                      "name", "cleaned_title", "cleaned_description"],
                     "standardized_annual_salary")
data = clean_location(data, 2)
import random
random.seed(42)
random.shuffle(data)

train_data = data[:20000]
val_data = data[20000:30000]
test_data = data[30000:]

In [22]:
from utils import build_column_vocabulary

vocab_industry = build_column_vocabulary(train_data, 0)
vocab_type = build_column_vocabulary(train_data, 1)
vocab_state = build_column_vocabulary(train_data, 2)
vocab_level = build_column_vocabulary(train_data, 3)

In [23]:
from utils import convert_to_one_hot
import torch

train_cat_features = convert_to_one_hot(train_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

val_cat_features = convert_to_one_hot(val_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

# Convert Lists to Tensors
train_cat_features = torch.stack(train_cat_features)
val_cat_features = torch.stack(val_cat_features)

In [24]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-gpt")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right" 
a = tokenizer('hello this is a test',
         truncation=True,
         padding='max_length',
         max_length=512,
         return_tensors="pt")

In [25]:
from gpt1 import GPT1Dataset

descriptions = [item[0][6] for item in train_data]
input_ids, attention_mask = [], []
for description in descriptions:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids)
attention_mask = torch.stack(attention_mask)

labels = [float(target) for _, target in train_data]

train_dataset = GPT1Dataset(input_ids, attention_mask, train_cat_features, labels)

In [26]:
descriptions = [item[0][6] for item in val_data]
input_ids, attention_mask = [], []
for description in descriptions:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids)
attention_mask = torch.stack(attention_mask)

labels = [float(target) for _, target in val_data]

val_dataset = GPT1Dataset(input_ids, attention_mask, val_cat_features, labels)

In [27]:
train_dataset[0]["input_ids"].shape

torch.Size([512])

In [28]:
from gpt1 import GPT1

model = GPT1(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level))

In [29]:
from torch.utils.data import Dataset
def accuracy(model, dataset: Dataset) -> float:
    """
    copied from csc413 lab 1
    Compute the accuracy of `model` over the `dataset`.
    We will take the **most probable class**
    as the class predicted by the model.

    Parameters:
        `model` - A torch.nn model. We will only be passing `nn.Linear` models.
                  However, to make your code more generally useful, do not access
                  `model.weight` and `model.bias` parameters directly. These
                  class attributes may not exist for other kinds of models.
        `dataset` - A list of 2-tuples of the form (x, t), where `x` is a PyTorch
                  tensor of shape [1, 28, 28] representing an MNIST image,
                  and `t` is the corresponding target label

    Returns: a floating-point value between 0 and 1.
    """
    total = 0
    distance = 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    for i in range(500):
        data = dataset[i]

        input_ids = data['input_ids'].unsqueeze(0).to(device)
        attention_mask = data['attention_mask'].unsqueeze(0).to(device)
        categorical_features = data['categorical_features'].unsqueeze(0).to(device)
        label = data['labels']

        output = model(input_ids, attention_mask, categorical_features)
        output = output.item()

        distance += float(abs(label-output))
        total += 1

    return distance / total

In [30]:
from torch import Tensor
class GPT1Dataset(Dataset):
    def __init__(self, input_ids: list[Tensor],
                 attention_mask: list[Tensor],
                 categorical_features: list[Tensor],
                 labels: list[float]):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.categorical_features = categorical_features
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'categorical_features': self.categorical_features[idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

In [31]:
from torch.utils.data import DataLoader
from torch import nn, optim
from utils import plot_results

def _collate_batch(batch):
    """Custom collate function for handling batches of data where all input tensors are of the same length."""

    # Separate and stack the data directly since all tensors are already of the same length
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    categorical_features = torch.stack([item['categorical_features'] for item in batch]).float()
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.float)

    return input_ids, attention_mask, categorical_features, labels

def train_model(model,
                train_data: GPT1Dataset,
                val_data: GPT1Dataset,
                learning_rate=0.01,
                batch_size=100,
                num_epochs=10,
                plot_every=50,
                plot=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=_collate_batch)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    iters, losses, train_mae, val_mae = [], [], [], []
    iter_count = 0

    for epoch in range(num_epochs):
        model.train()
        for input_ids, attention_mask, categorical_features, label in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            categorical_features = categorical_features.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, categorical_features)
            outputs = outputs.squeeze()
            loss = criterion(outputs, label.float())
            loss.backward()
            optimizer.step()

            if (iter_count + 1) % plot_every == 0:
                iters.append(iter_count)
                losses.append(float(loss))
                train_mae.append(accuracy(model, train_data))
                val_mae.append(accuracy(model, val_data))
                print(
                    f"Iter {iter_count + 1}: Loss: {losses[-1]} Train mae {train_mae[-1]}, Validation mae {val_mae[-1]}")
            iter_count += 1
            print(iter_count)

    if plot:
        plot_results(iters, losses, train_mae, val_mae)

In [33]:
train_model(model, train_data=train_dataset, val_data=train_dataset, batch_size=20)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Iter 50: Loss: 2019149696.0 Train mae 74645.27841206246, Validation mae 74666.86270273436
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


KeyboardInterrupt: 