In [1]:
from utils import read_csv_data, clean_location

data = read_csv_data("../data/processed_job_postings_large_noNA.csv", 
                     ["industry", "work_type", "location", "formatted_experience_level",
                      "name", "cleaned_title", "cleaned_description"],
                     "standardized_annual_salary")
data = clean_location(data, 2)
import random
random.seed(42)
random.shuffle(data)

train_data = data[:100]
val_data = data[100:200]
test_data = data[300:400]

In [2]:
from utils import build_column_vocabulary

vocab_industry = build_column_vocabulary(train_data, 0)
vocab_type = build_column_vocabulary(train_data, 1)
vocab_state = build_column_vocabulary(train_data, 2)
vocab_level = build_column_vocabulary(train_data, 3)

In [3]:
from utils import convert_to_one_hot
import torch

train_cat_features = convert_to_one_hot(train_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

val_cat_features = convert_to_one_hot(val_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

test_cat_features = convert_to_one_hot(test_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert Lists to Tensors
train_cat_features = torch.stack(train_cat_features).to(device)
val_cat_features = torch.stack(val_cat_features).to(device)
test_cat_features = torch.stack(test_cat_features).to(device)

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right" 
a = tokenizer('hello this is a test',
         truncation=True,
         padding='max_length',
         max_length=512,
         return_tensors="pt")

In [5]:

from gpt1 import GPT1Dataset

semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in train_data]
input_ids, attention_mask = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids).to(device)
attention_mask = torch.stack(attention_mask).to(device)

labels = [float(target) for _, target in train_data]

train_dataset = GPT1Dataset(input_ids, attention_mask, train_cat_features, labels)

In [6]:
semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in val_data]
input_ids, attention_mask = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids).to(device)
attention_mask = torch.stack(attention_mask).to(device)

labels = [float(target) for _, target in val_data]

val_dataset = GPT1Dataset(input_ids, attention_mask, val_cat_features, labels)

In [7]:
semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in test_data]
input_ids, attention_mask = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids).to(device)
attention_mask = torch.stack(attention_mask).to(device)

labels = [float(target) for _, target in test_data]

test_dataset = GPT1Dataset(input_ids, attention_mask, test_cat_features, labels)

In [8]:
test_dataset[0]

{'input_ids': tensor([  101,  1048, 23644,  2449,  4106,  8325,  3795, 20138,  4270,  4132,
          2055,  2256,  7396,  2256,  7396,  2003,  1037,  2877,  3361, 14262,
         25929, 20138,  4270, 10346, 15007,  2194,  9403,  1999,  7211,  2859,
         17235,  2850,  4160,  3205,  2009,  8704,  2000,  3073,  1996,  2087,
          3722,  5851,  1998, 14057,  5211,  6322,  1999,  1996,  3795,  3361,
          3006,  2007,  3935,  2578,  1998,  3688,  2049,  4138,  1998,  7578,
          5661,  2421,  4800, 10521,  6895, 24759,  2937,  2854, 11817,  7781,
          8430,  4093,  1998,  9968,  4128,  4385,  2023,  3105,  4107,  1037,
          3811,  4012, 22327,  4183,  3726,  9430,  7427,  1998,  7721,  3167,
          2458,  2306,  1996,  2194,  1998,  3458,  1996,  4132,  1999,  1996,
          2925,  2476, 10425,  2055,  1996,  2535, 11532,  1999,  1996, 10346,
         15007,  3068,  1996, 10673,  2000,  2147,  4876,  2007, 16428,  1998,
          3169,  2780,  2024,  3223,  3

In [9]:
from bert import Bert
from gpt1 import train_model
"""remember to use gpu i.e. model.to(device) device = 'cuda' """

model = Bert(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level))
model.to(device)

train_model(model, train_data=train_dataset, val_data=val_dataset, batch_size=15, eval_batch_size=15, num_epochs=500)

Iter 50: Loss: 5676442624.0 Train mae 63569.25125, Validation mae 63248.525
Iter 100: Loss: 2020960128.0 Train mae 34227.25, Validation mae 36363.7453125
Iter 150: Loss: 1107472128.0 Train mae 34081.45, Validation mae 36025.614375
Iter 200: Loss: 1219955328.0 Train mae 33822.9365625, Validation mae 36415.7434375
Iter 250: Loss: 1035797056.0 Train mae 33970.321875, Validation mae 36821.1059375
Iter 300: Loss: 1199235456.0 Train mae 33958.3653125, Validation mae 37485.5965625
Iter 350: Loss: 2171264768.0 Train mae 34122.606875, Validation mae 36125.6946875
Iter 400: Loss: 2090017920.0 Train mae 33099.1, Validation mae 37171.0196875
Iter 450: Loss: 1342404864.0 Train mae 34406.30625, Validation mae 36667.5546875
Iter 500: Loss: 1372660864.0 Train mae 33912.8896875, Validation mae 37077.42375
Iter 550: Loss: 1697932160.0 Train mae 34334.5328125, Validation mae 36716.478125
Iter 600: Loss: 2578798592.0 Train mae 34469.3675, Validation mae 36812.275625
Iter 650: Loss: 2539887872.0 Train mae 

KeyboardInterrupt: 

In [11]:
torch.cuda.empty_cache()

In [10]:
import torch.nn as nn
from transformers import OpenAIGPTModel
class GPT1_3LL(nn.Module):
    def __init__(self, num_categorical_features):
        super(GPT1_3LL, self).__init__()

        self.gpt = OpenAIGPTModel.from_pretrained("openai-gpt")

        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(self.gpt.config.hidden_size + num_categorical_features, 500)
        self.fc2 = nn.Linear(500, 250)
        self.fc3 = nn.Linear(250, 100)
        self.dropout = nn.Dropout(0.1)
        self.output = nn.Linear(100, 1)  # Output layer for salary prediction

    def forward(self, input_ids, attention_mask, categorical_features):
        # Process textual input through GPT
        outputs = self.gpt(input_ids=input_ids, attention_mask=attention_mask)

        text_features = outputs.last_hidden_state[:, -1, :]  # Use the last token's representation

        # Concatenate text features with categorical features
        combined_features = torch.cat((text_features, categorical_features), dim=1)
        combined_features = self.fc1(combined_features)
        x = self.relu(combined_features)

        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.relu(x)

        x = self.dropout(x)
        x = self.output(x)

        return x

In [11]:
model2 = GPT1_3LL(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level))
model2.to(device)

train_model(model2, train_data=train_dataset, val_data=val_dataset, batch_size=20, eval_batch_size=20)

Iter 50: Loss: 2304969216.0 Train mae 43995.17275, Validation mae 133732.32808333333
Iter 100: Loss: 2342991872.0 Train mae 40187.832708333335, Validation mae 129563.01847916667
Iter 150: Loss: 2517573632.0 Train mae 41184.2993125, Validation mae 131211.11129166666
Iter 200: Loss: 1892202112.0 Train mae 46290.084375, Validation mae 136913.785375
Iter 250: Loss: 9334715392.0 Train mae 83573.45725, Validation mae 173041.46491666668
Iter 300: Loss: 6555491840.0 Train mae 61665.673625, Validation mae 151432.96091666666
Iter 350: Loss: 621434112.0 Train mae 39524.65902083334, Validation mae 129395.229
Iter 400: Loss: 2730205952.0 Train mae 40443.1231875, Validation mae 130230.65254166667
Iter 450: Loss: 1720825088.0 Train mae 39561.436604166665, Validation mae 129201.3484375
Iter 500: Loss: 923200960.0 Train mae 39539.576166666666, Validation mae 129019.21333333333
Iter 550: Loss: 4472351232.0 Train mae 41313.34720833333, Validation mae 131163.35727083334
Iter 600: Loss: 7872652800.0 Train 

KeyboardInterrupt: 

In [34]:
from gpt1 import _collate_batch
from torch.utils.data import DataLoader

def calculate_mae(model, dataloader: DataLoader) -> float:
    """
    Calculate the mean absolute error for a model over a given dataloader.

    Args:
        model: The model to evaluate.
        dataloader (DataLoader): The DataLoader containing the dataset.

    Returns:
        float: The mean absolute error of the model.
    """
    total_distance = 0
    total_count = 0

    with torch.no_grad():
        for input_ids, attention_mask, categorical_features, labels in dataloader:
            outputs = model(input_ids, attention_mask, categorical_features)
            outputs = outputs.squeeze()  # Adjust shape if necessary
            print(outputs)
            print(labels)
            distances = torch.abs(labels - outputs)
            total_distance += distances.sum().item()
            total_count += labels.size(0)

    return total_distance / total_count

val_loader = DataLoader(val_dataset, batch_size=20, shuffle=False, collate_fn=_collate_batch)
print(calculate_mae(model2, val_loader))

tensor([97139.6328, 86409.7344, 86849.2969, 71328.0781, 86418.7422, 97598.4609,
        81272.2188, 91716.6562, 75474.0938, 76605.3047, 89349.3281, 82202.1172,
        73608.5000, 94524.7500, 90060.6250, 79302.4531, 67357.6250, 85457.1875,
        83388.2969, 85555.2500], device='cuda:0')
tensor([ 43200.0000,  48000.0000,  32640.0000,  48000.0000,  65416.0000,
        173500.0000,  32640.0000, 121641.6016,  30620.0000,  76800.0000,
         43200.0000, 115000.0000,  66500.0000,  87500.0000, 121450.0000,
         92500.0000, 100000.0000, 330000.0000, 171500.0000, 134400.0000],
       device='cuda:0')
tensor([91454.4531, 82358.5625, 75229.6562, 76788.9844, 80202.4531, 85604.2812,
        92095.2188, 90381.7578, 89064.9766, 89701.7969, 87062.5312, 87415.0000,
        84688.6250, 90101.9688, 95574.8281, 83637.7734, 90904.0000, 86353.2188,
        91553.5781, 80359.5391], device='cuda:0')
tensor([ 58000.0000, 212500.0000,  74730.0000,  80390.5000,  64704.0000,
        175000.0000,  66528.00

KeyboardInterrupt: 

In [32]:
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=False, collate_fn=_collate_batch)
print(calculate_mae(model2, train_loader))

73331.4451921875


In [33]:
test_loader = DataLoader(test_dataset, batch_size=20, shuffle=False, collate_fn=_collate_batch)
print(calculate_mae(model2, test_loader))

38531.93207586786
