In [1]:
from utils import read_csv_data, clean_location

data = read_csv_data("../data/processed_job_postings_large.csv", 
                     ["industry", "work_type", "location", "formatted_experience_level",
                      "name", "cleaned_title", "cleaned_description"],
                     "standardized_annual_salary")
data = clean_location(data, 2)
import random
random.seed(42)
random.shuffle(data)

train_data = data[:20000]
val_data = data[20000:30000]
test_data = data[30000:]

In [2]:
from utils import build_column_vocabulary

vocab_industry = build_column_vocabulary(train_data, 0)
vocab_type = build_column_vocabulary(train_data, 1)
vocab_state = build_column_vocabulary(train_data, 2)
vocab_level = build_column_vocabulary(train_data, 3)

In [3]:
from utils import convert_to_one_hot
import torch

train_cat_features = convert_to_one_hot(train_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

val_cat_features = convert_to_one_hot(val_data, 
                                  [(0, vocab_industry),
                                   (1, vocab_type),
                                   (2, vocab_state),
                                   (3, vocab_level)])

# Convert Lists to Tensors
train_cat_features = torch.stack(train_cat_features)
val_cat_features = torch.stack(val_cat_features)

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-gpt")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right" 
a = tokenizer('hello this is a test',
         truncation=True,
         padding='max_length',
         max_length=512,
         return_tensors="pt")

In [5]:

from gpt1 import GPT1Dataset
from torch.utils.data import Dataset

descriptions = [item[0][6] for item in train_data]
input_ids, attention_mask = [], []
for description in descriptions:
    encoding = tokenizer(description,
                         truncation=True,
                         padding='max_length',
                         max_length=512,
                         return_tensors="pt")
    # input_ids.append(encoding['input_ids'].squeeze())
    # attention_mask.append(encoding['attention_mask'].squeeze())
    input_ids.append(encoding['input_ids'][0])
    attention_mask.append(encoding['attention_mask'][0])

# Convert Lists to Tensors
input_ids = torch.stack(input_ids)
attention_mask = torch.stack(attention_mask)    

labels = [float(target) for _, target in train_data]

train_dataset = GPT1Dataset(input_ids, attention_mask, train_cat_features, labels)

In [6]:
train_dataset[0]

{'input_ids': tensor([ 6303,  5253, 23263,   640,  5154,   562,  4387,   488, 33243, 12810,
           485,  8052,   600,   640,  7826,   500,   246, 32968,  6425,   488,
          2029, 14121,  1996,  7404,   481,  5253,  6844,   485,  2236,   488,
         12561,   746,  5253,  9535,  6672,  8153,  2694,  8017,  6112,   488,
          1178, 40443, 20369,  5655,   481,  2179,  9269,  6875,  2906,   555,
          9514,  6875,   544,   566,   498,   481,  1495, 26789,  3304, 31375,
           500,   481,  6391,  7876,   600,   640,  1081,   562,   531,  5007,
          6303,  5253,  6844,   485,  3351,   754,  3170,  2855,   556,   531,
          6636,  6271, 39135,   745,   562,  4804, 10361,   488,  2429,   616,
          7391,   500, 35627,   562,   754,  6589,  1463,   488, 10998,  8426,
         18298, 19231,  3388,  1510,  1383, 12602, 14152, 28530,   488,  6190,
           519,  2102,  5142,  7730,   517,  1801, 22537,  5253, 11675, 16680,
           488,  3675,   485,  9945,   

In [14]:
# copied from https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing
# Activate 4-bit precision base model loading
# use_4bit = True
use_4bit = False

# Compute dtype for 4-bit base models
# bnb_4bit_compute_dtype = "float16"
bnb_4bit_compute_dtype = "float32"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

from peft import LoraConfig
from transformers import TrainingArguments

# 
# Load tokenizer and model with QLoRA configuration
# compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
# 
# Check GPU compatibility with bfloat16
# if compute_dtype == torch.float16 and use_4bit:
#     major, _ = torch.cuda.get_device_capability()
#     if major >= 8:
#         print("=" * 80)
#         print("Your GPU supports bfloat16: accelerate training with bf16=True")
#         print("=" * 80)

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_attn", "c_proj"],
    fan_in_fan_out=True
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to=["tensorboard"],
    no_cuda=True
)



In [15]:
from gpt1 import GPT1

model = GPT1(len(vocab_type) + len(vocab_industry) + len(vocab_state) + len(vocab_level))

In [9]:
print(type(input_ids), input_ids.shape)
print(type(attention_mask), attention_mask.shape)
print(type(train_cat_features), train_cat_features.shape)

<class 'torch.Tensor'> torch.Size([20000, 512])
<class 'torch.Tensor'> torch.Size([20000, 512])
<class 'torch.Tensor'> torch.Size([20000, 307])


In [19]:
from torch.utils.data import DataLoader
from gpt1 import collate_batch

data_loader = DataLoader(train_dataset, batch_size=4, collate_fn=collate_batch, shuffle=True)

# Test run through the DataLoader to print and check batch contents
for idx, batch in enumerate(data_loader):
    if 'labels' not in batch:
        print(f"Error: 'labels' key missing in batch {idx}")
    else:
        print(f"Batch {idx}: Contains 'labels' with shape {batch['labels'].shape}")
    if idx == 5:  # Check only the first 3 batches
        break

Error: 'labels' key missing in batch 0
Error: 'labels' key missing in batch 1
Error: 'labels' key missing in batch 2
Error: 'labels' key missing in batch 3
Error: 'labels' key missing in batch 4
Error: 'labels' key missing in batch 5


In [20]:
from gpt1 import collate_batch
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    data_collator=collate_batch,
)


trainer.train()

  0%|          | 0/5000 [00:00<?, ?it/s]

AttributeError: 'NoneType' object has no attribute 'cget_managed_ptr'