In [37]:
from utils import read_csv_data, clean_location

data = read_csv_data("../data/processed_job_postings_large.csv", 
                     ["industry", "work_type", "location", "formatted_experience_level",
                      "name", "cleaned_title", "cleaned_description"],
                     "standardized_annual_salary")
data = clean_location(data, 2)
import random
random.seed(42)
random.shuffle(data)

train_data = data[:100]
val_data = data[100:200]
test_data = data[30000:]

In [38]:
from utils import build_column_vocabulary

vocab_industry = build_column_vocabulary(train_data, 0)
vocab_type = build_column_vocabulary(train_data, 1)
vocab_state = build_column_vocabulary(train_data, 2)
vocab_level = build_column_vocabulary(train_data, 3)

In [39]:
from utils import convert_to_one_hot
import torch

train_cat_features = convert_to_one_hot(train_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

val_cat_features = convert_to_one_hot(val_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

test_cat_features = convert_to_one_hot(test_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert Lists to Tensors
train_cat_features = torch.stack(train_cat_features).to(device)
val_cat_features = torch.stack(val_cat_features).to(device)
test_cat_features = torch.stack(test_cat_features).to(device)

In [40]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-gpt")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right" 
a = tokenizer('hello this is a test',
         truncation=True,
         padding='max_length',
         max_length=512,
         return_tensors="pt")

In [41]:

from gpt1 import GPT1Dataset

semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in train_data]
input_ids, attention_mask = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids).to(device)
attention_mask = torch.stack(attention_mask).to(device)

labels = [float(target) for _, target in train_data]

train_dataset = GPT1Dataset(input_ids, attention_mask, train_cat_features, labels)

In [42]:
semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in val_data]
input_ids, attention_mask = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids).to(device)
attention_mask = torch.stack(attention_mask).to(device)

labels = [float(target) for _, target in val_data]

val_dataset = GPT1Dataset(input_ids, attention_mask, val_cat_features, labels)

In [43]:
semantic_items = [item[0][4] + ' ' + item[0][5] + ' ' + item[0][6] for item in test_data]
input_ids, attention_mask = [], []
for description in semantic_items:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids).to(device)
attention_mask = torch.stack(attention_mask).to(device)

labels = [float(target) for _, target in test_data]

test_dataset = GPT1Dataset(input_ids, attention_mask, test_cat_features, labels)

In [44]:
test_dataset[0]

{'input_ids': tensor([ 1035,   268,  7471, 22429, 17397,  2192,  9189,  1890,  7613, 17397,
          2192,   960,   495,  5230,  6123, 30880,  1874,  3594,   500,  2759,
          1801,  1755,   896,   541, 39955,   720,  6307,   485,  5513,   292,
          1048,   485,   284,    24,    20,  3594,   498,  7049, 26459,   485,
           750,    20,  2596,  6144, 13244, 27277, 14814,  1490,  2088, 18773,
         22136, 22720, 19231,  3388,   641,  7127,   531, 17397,  2192,  9189,
           485,  8052,   481,  3866,   488, 12649, 10940,   498,   622, 17397,
          5480,   616,  6118, 18993,   246,  7087,   498, 22754,   488, 20860,
         12993,  9232,   504,  7739,   649,  3669, 10760,  3987,   488, 10051,
           481, 17397,  2192,  9189,  9862,   246, 10875,  6118,   500, 15306,
         29599,   556, 22746,   488,  7174, 11866,  4640, 17397, 16015,   488,
         37973,   622,  9425, 39375,   488,  3186, 14447,  4101, 14234,   488,
         16370,  1801, 12289,   488, 18

In [47]:
from gpt1 import GPT1
from gpt1 import train_model
"""remember to use gpu i.e. model.to(device) device = 'cuda' """

model = GPT1(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level))
model.to(device)

train_model(model, train_data=train_dataset, val_data=val_dataset, batch_size=20, eval_batch_size=20, num_epochs=500)


Iter 50: Loss: 8868547584.0 Train mae 81166.2375, Validation mae 85536.165
Iter 100: Loss: 6232518144.0 Train mae 45299.144375, Validation mae 50172.601875
Iter 150: Loss: 1249763200.0 Train mae 35345.2175, Validation mae 44598.883125
Iter 200: Loss: 1925807744.0 Train mae 36404.234375, Validation mae 45565.256875


KeyboardInterrupt: 

In [36]:
torch.cuda.empty_cache()

In [10]:
import torch.nn as nn
from transformers import OpenAIGPTModel
class GPT1_3LL(nn.Module):
    def __init__(self, num_categorical_features):
        super(GPT1_3LL, self).__init__()

        self.gpt = OpenAIGPTModel.from_pretrained("openai-gpt")

        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(self.gpt.config.hidden_size + num_categorical_features, 500)
        self.fc2 = nn.Linear(500, 250)
        self.fc3 = nn.Linear(250, 100)
        self.dropout = nn.Dropout(0.1)
        self.output = nn.Linear(100, 1)  # Output layer for salary prediction

    def forward(self, input_ids, attention_mask, categorical_features):
        # Process textual input through GPT
        outputs = self.gpt(input_ids=input_ids, attention_mask=attention_mask)

        text_features = outputs.last_hidden_state[:, -1, :]  # Use the last token's representation

        # Concatenate text features with categorical features
        combined_features = torch.cat((text_features, categorical_features), dim=1)
        combined_features = self.fc1(combined_features)
        x = self.relu(combined_features)

        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.relu(x)

        x = self.dropout(x)
        x = self.output(x)

        return x

In [11]:
model2 = GPT1_3LL(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level))
model2.to(device)

train_model(model2, train_data=train_dataset, val_data=val_dataset, batch_size=20, eval_batch_size=20)

Iter 50: Loss: 2304969216.0 Train mae 43995.17275, Validation mae 133732.32808333333
Iter 100: Loss: 2342991872.0 Train mae 40187.832708333335, Validation mae 129563.01847916667
Iter 150: Loss: 2517573632.0 Train mae 41184.2993125, Validation mae 131211.11129166666
Iter 200: Loss: 1892202112.0 Train mae 46290.084375, Validation mae 136913.785375
Iter 250: Loss: 9334715392.0 Train mae 83573.45725, Validation mae 173041.46491666668
Iter 300: Loss: 6555491840.0 Train mae 61665.673625, Validation mae 151432.96091666666
Iter 350: Loss: 621434112.0 Train mae 39524.65902083334, Validation mae 129395.229
Iter 400: Loss: 2730205952.0 Train mae 40443.1231875, Validation mae 130230.65254166667
Iter 450: Loss: 1720825088.0 Train mae 39561.436604166665, Validation mae 129201.3484375
Iter 500: Loss: 923200960.0 Train mae 39539.576166666666, Validation mae 129019.21333333333
Iter 550: Loss: 4472351232.0 Train mae 41313.34720833333, Validation mae 131163.35727083334
Iter 600: Loss: 7872652800.0 Train 

KeyboardInterrupt: 

In [35]:
from gpt1 import _collate_batch
from torch.utils.data import DataLoader

def calculate_mae(model, dataloader: DataLoader) -> float:
    """
    Calculate the mean absolute error for a model over a given dataloader.

    Args:
        model: The model to evaluate.
        dataloader (DataLoader): The DataLoader containing the dataset.

    Returns:
        float: The mean absolute error of the model.
    """
    total_distance = 0
    total_count = 0

    with torch.no_grad():
        for input_ids, attention_mask, categorical_features, labels in dataloader:
            outputs = model(input_ids, attention_mask, categorical_features)
            outputs = outputs.squeeze()  # Adjust shape if necessary
            distances = torch.abs(labels - outputs)
            total_distance += distances.sum().item()
            total_count += labels.size(0)

    return total_distance / total_count

val_loader = DataLoader(val_dataset, batch_size=20, shuffle=False, collate_fn=_collate_batch)
print(calculate_mae(model2, val_loader))

86231.99638125


In [32]:
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=False, collate_fn=_collate_batch)
print(calculate_mae(model2, train_loader))

73331.4451921875


In [33]:
test_loader = DataLoader(test_dataset, batch_size=20, shuffle=False, collate_fn=_collate_batch)
print(calculate_mae(model2, test_loader))

38531.93207586786
