In [1]:
from utils import read_csv_data, clean_location

import random

# Feature columns requested from the CSV. Later cells address them by
# position (0-3 for the categorical vocabularies, 6 for the description text),
# so the order here appears to matter — TODO confirm against read_csv_data.
FEATURE_COLUMNS = [
    "industry",
    "work_type",
    "location",
    "formatted_experience_level",
    "name",
    "cleaned_title",
    "cleaned_description",
]
TARGET_COLUMN = "standardized_annual_salary"

data = read_csv_data("../data/processed_job_postings_large.csv", FEATURE_COLUMNS, TARGET_COLUMN)
# Column 2 is "location"; presumably this reduces it to a coarser value
# (e.g. a state) — verify in utils.clean_location.
data = clean_location(data, 2)

# Seed the global RNG so the in-place shuffle, and therefore every split
# below, comes out the same on each run.
random.seed(42)
random.shuffle(data)

# First 1000 rows train, next 1000 validate, everything from row 30000 on is
# held out for testing.
# NOTE(review): rows 2000-29999 are not used by any split here, and test_data
# is empty if the file has fewer than 30000 rows — confirm this is intended.
train_data, val_data, test_data = data[:1000], data[1000:2000], data[30000:]

In [2]:
from utils import build_column_vocabulary

# Build one vocabulary per categorical column (indices 0..3) from the
# training split only, so validation/test values never influence it.
vocab_industry, vocab_type, vocab_state, vocab_level = (
    build_column_vocabulary(train_data, column_index)
    for column_index in range(4)
)

In [3]:
from utils import convert_to_one_hot
import torch

# (column index, vocabulary) pairs shared by both splits; the vocabularies
# come from the training data, and the same spec is applied to validation.
categorical_columns = (
    (0, vocab_industry),
    (1, vocab_type),
    (2, vocab_state),
    (3, vocab_level),
)

# Each call receives its own fresh list of pairs.
train_one_hot = convert_to_one_hot(train_data, list(categorical_columns))
val_one_hot = convert_to_one_hot(val_data, list(categorical_columns))

# convert_to_one_hot returns a sequence of per-example tensors (torch.stack
# requires that); stack them along a new leading axis into one tensor per split.
train_cat_features = torch.stack(train_one_hot)
val_cat_features = torch.stack(val_one_hot)

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "openai-gpt" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai-gpt")

# Pad on the right and reuse the unknown token as the padding token
# (presumably because this checkpoint defines no pad token — TODO confirm).
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.unk_token

# Smoke test: encode a short sentence with the same options the dataset
# cells use (truncate/pad to 512 tokens, PyTorch tensors).
smoke_test_options = {
    "truncation": True,
    "padding": 'max_length',
    "max_length": 512,
    "return_tensors": "pt",
}
a = tokenizer('hello this is a test', **smoke_test_options)

In [5]:

from gpt1 import GPT1Dataset

# Each example is a (features, target) pair; feature index 6 is the
# description text fed to the tokenizer.
descriptions = [features[6] for features, _ in train_data]

# Tokenize every description to a fixed length of 512 tokens.
encodings = [
    tokenizer(text,
              truncation=True,
              padding='max_length',
              max_length=512,
              return_tensors="pt")
    for text in descriptions
]

# With return_tensors="pt" each field carries a leading batch axis for the
# single input string; take row 0 to get the per-example tensor.
input_ids = [encoded['input_ids'][0] for encoded in encodings]
attention_mask = [encoded['attention_mask'][0] for encoded in encodings]

# Regression targets as plain floats.
labels = [float(target) for _, target in train_data]

train_dataset = GPT1Dataset(input_ids, attention_mask, train_cat_features, labels)

In [6]:
# Build the validation dataset exactly the way the training dataset is built:
# tokenize each description (feature index 6) to a fixed 512 tokens and pair it
# with the one-hot categorical features and the float target.
descriptions = [item[0][6] for item in val_data]
input_ids, attention_mask = [], []
for description in descriptions:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # Index row 0 instead of calling .squeeze(): this matches the training
    # cell and removes only the leading batch axis, whereas .squeeze() drops
    # every size-1 axis.
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Regression targets as plain floats.
labels = [float(target) for _, target in val_data]

val_dataset = GPT1Dataset(input_ids, attention_mask, val_cat_features, labels)

In [7]:
# Display the first training example as a quick sanity check of what
# GPT1Dataset returns for a single index.
train_dataset[0]

{'input_ids': tensor([ 6303,  5253, 23263,   640,  5154,   562,  4387,   488, 33243, 12810,
           485,  8052,   600,   640,  7826,   500,   246, 32968,  6425,   488,
          2029, 14121,  1996,  7404,   481,  5253,  6844,   485,  2236,   488,
         12561,   746,  5253,  9535,  6672,  8153,  2694,  8017,  6112,   488,
          1178, 40443, 20369,  5655,   481,  2179,  9269,  6875,  2906,   555,
          9514,  6875,   544,   566,   498,   481,  1495, 26789,  3304, 31375,
           500,   481,  6391,  7876,   600,   640,  1081,   562,   531,  5007,
          6303,  5253,  6844,   485,  3351,   754,  3170,  2855,   556,   531,
          6636,  6271, 39135,   745,   562,  4804, 10361,   488,  2429,   616,
          7391,   500, 35627,   562,   754,  6589,  1463,   488, 10998,  8426,
         18298, 19231,  3388,  1510,  1383, 12602, 14152, 28530,   488,  6190,
           519,  2102,  5142,  7730,   517,  1801, 22537,  5253, 11675, 16680,
           488,  3675,   485,  9945,   

In [8]:
from gpt1 import GPT1

# Sum of the four vocabulary sizes; presumably the width of the concatenated
# one-hot vector the model expects — TODO confirm against GPT1.__init__.
num_categorical_features = sum(
    len(vocab)
    for vocab in (vocab_type, vocab_industry, vocab_state, vocab_level)
)
model = GPT1(num_categorical_features)

In [None]:
# NOTE(review): earlier cells import from `gpt1` directly, while this one uses
# `src.models.gpt1` — confirm both paths resolve to the same module.
from src.models.gpt1 import train_model

# Train on the tokenized datasets. Validation must use `val_dataset` (the
# GPT1Dataset built above), not `val_data`, which is the raw list of
# (features, target) rows and has not been tokenized or one-hot encoded.
train_model(model, train_data=train_dataset, val_data=val_dataset)