In [4]:
from utils import read_csv_data, clean_location

data = read_csv_data("../data/processed_job_postings_large_noNA.csv", 
                     ["industry", "work_type", "location", "formatted_experience_level",
                      "name", "cleaned_title", "cleaned_description", "title_emb"],
                     "salary_level")
data = clean_location(data, 2)
import random
random.seed(42)
random.shuffle(data)

train_data = data[:10000]
val_data = data[10000:13000]
test_data = data[13000:]

In [5]:
from utils import build_column_vocabulary

vocab_industry = build_column_vocabulary(train_data, 0)
vocab_type = build_column_vocabulary(train_data, 1)
vocab_state = build_column_vocabulary(train_data, 2)
vocab_level = build_column_vocabulary(train_data, 3)

In [6]:
import torch
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

targets = [data[i][1] for i in range(len(data))]

labels = ['10K-'] + [f"{i}K - {i + 10}K" for i in range(10, 150, 10)] + ['160K+']
label_to_int = {label: i for i, label in enumerate(labels)}

targets = list(map(lambda x: label_to_int[x], targets))

targets = torch.tensor(targets)

train_targets = targets[:10000]
val_targets = targets[10000:13000]
test_targets = targets[13000:]

In [7]:
torch.cuda.empty_cache()

In [8]:
from utils import convert_to_one_hot
import torch

train_cat_features = convert_to_one_hot(train_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

val_cat_features = convert_to_one_hot(val_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

test_cat_features = convert_to_one_hot(test_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert Lists to Tensors
train_cat_features = torch.stack(train_cat_features).to(device)
val_cat_features = torch.stack(val_cat_features).to(device)
test_cat_features = torch.stack(test_cat_features).to(device)

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right" 
a = tokenizer('hello this is a test',
         truncation=True,
         padding='max_length',
         max_length=512,
         return_tensors="pt")

In [10]:
from gpt1 import GPT1Dataset

semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in train_data]
input_ids_train, attention_mask_train = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids_train.append(encoding['input_ids'][0])
    attention_mask_train.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids_train = torch.stack(input_ids_train).to(device)
attention_mask_train = torch.stack(attention_mask_train).to(device)

labels = [target for target in train_targets]

train_dataset = GPT1Dataset(input_ids_train, attention_mask_train, train_cat_features, labels)

In [11]:
semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in val_data]
input_ids_val, attention_mask_val = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids_val.append(encoding['input_ids'][0])
    attention_mask_val.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids_val = torch.stack(input_ids_val).to(device)
attention_mask_val = torch.stack(attention_mask_val).to(device)

labels = [target for target in val_targets]

val_dataset = GPT1Dataset(input_ids_val, attention_mask_val, val_cat_features, labels)

In [12]:
semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in test_data]
input_ids_test, attention_mask_test = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids_test.append(encoding['input_ids'][0])
    attention_mask_test.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids_test = torch.stack(input_ids_test).to(device)
attention_mask_test = torch.stack(attention_mask_test).to(device)

labels = [target for target in test_targets]

test_dataset = GPT1Dataset(input_ids_test, attention_mask_test, test_cat_features, labels)

In [15]:
from gpt1 import GPT1, train_classifier

labels = ['10K-'] + [f"{i}K - {i + 10}K" for i in range(10, 150, 10)] + ['160K+']

model = GPT1(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level), output_size=len(labels), hidden_size=1000)
model.to(device)

# Note that the accuracy is calculated only on 1500 samples from both train and validation sets, not the entire dataset
# There are test on entire train/val/test set later
train_classifier(model, train_data=train_dataset, val_data=val_dataset, learning_rate=0.01, batch_size=20, num_epochs=45)

Iter 50: Loss: 2.6228525638580322 Train Acc: 0.12866666666666668, Validation Acc: 0.146
Iter 100: Loss: 2.6336631774902344 Train Acc: 0.136, Validation Acc: 0.16266666666666665
Iter 150: Loss: 2.746643543243408 Train Acc: 0.13866666666666666, Validation Acc: 0.156
Iter 200: Loss: 2.5612542629241943 Train Acc: 0.13933333333333334, Validation Acc: 0.158
Iter 250: Loss: 2.3961730003356934 Train Acc: 0.14066666666666666, Validation Acc: 0.15866666666666668
Iter 300: Loss: 2.7208762168884277 Train Acc: 0.16666666666666666, Validation Acc: 0.17133333333333334
Iter 350: Loss: 2.548516035079956 Train Acc: 0.182, Validation Acc: 0.17333333333333334
Iter 400: Loss: 2.681497097015381 Train Acc: 0.13866666666666666, Validation Acc: 0.158
Iter 450: Loss: 2.5745849609375 Train Acc: 0.16266666666666665, Validation Acc: 0.18333333333333332
Iter 500: Loss: 2.702854633331299 Train Acc: 0.19733333333333333, Validation Acc: 0.18733333333333332
Iter 550: Loss: 2.5043535232543945 Train Acc: 0.20733333333333

KeyboardInterrupt: 

In [28]:
# Copied code to calculate accuracy
from torch.utils.data import DataLoader

def _collate_batch(batch):
    """Custom collate function for handling batches of data where all input tensors are of the same length."""

    # Separate and stack the data directly since all tensors are already of the same length
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    categorical_features = torch.stack([item['categorical_features'] for item in batch]).float()
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.float).to(device)

    return input_ids, attention_mask, categorical_features, labels

def calculate_accuracy(model, dataloader: DataLoader) -> float:
    """
    Calculate the accuracy for a model over a given dataloader.

    Args:
        model: The model to evaluate.
        dataloader (DataLoader): The DataLoader containing the dataset.

    Returns:
        float: The accuracy of the model.
    """
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for input_ids, attention_mask, categorical_features, labels in dataloader:
            outputs = model(input_ids, attention_mask, categorical_features)
            outputs = outputs.squeeze()  # Adjust shape if necessary

            predictions = torch.argmax(outputs, dim=1)
            total_correct += torch.sum(predictions == labels).item()
            total_count += labels.size(0)
        return total_correct / total_count

In [29]:
# Accuracy of Base Bert Model without title_emb
train_loader = DataLoader(train_dataset, batch_size=25, shuffle=True, collate_fn=_collate_batch)
print(f"Model train accuracy: {calculate_accuracy(model, train_loader)}")
val_loader = DataLoader(val_dataset, batch_size=25, shuffle=True, collate_fn=_collate_batch)
print(f"Model val accuracy: {calculate_accuracy(model, val_loader)}")
test_loader = DataLoader(test_dataset, batch_size=25, shuffle=True, collate_fn=_collate_batch)
print(f"Model test accuracy: {calculate_accuracy(model, test_loader)}")

Model train accuracy: 0.9809
Model val accuracy: 0.677
Model test accuracy: 0.6742671009771987
