In [2]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# One seed shared by every random number generator used in this notebook.
random_seed = 123

# Seed Python's built-in RNG and PyTorch's default generator with the same value.
for seed_rng in (random.seed, torch.manual_seed):
    seed_rng(random_seed)

# Seed the CUDA generators as well, but only when a GPU is actually visible.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)


In [3]:
# Sanity check: display whether PyTorch can see a CUDA device in this kernel.
torch.cuda.is_available()

True

In [4]:
# The tokenizer and the encoder must come from the same pretrained checkpoint
# so that token ids line up with the model's vocabulary.
pretrained_checkpoint = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(pretrained_checkpoint)
model = BertModel.from_pretrained(pretrained_checkpoint)

In [5]:
from utils import read_csv_data, clean_location, convert_to_one_hot, build_column_vocabulary
import pandas as pd

In [6]:
# Load the processed job postings table.
data = pd.read_csv("../data/processed_job_postings.csv")

# Default used for each column when a value is missing. `company_id` gets a
# -1 sentinel so the column can be cast to an integer dtype afterwards.
missing_value_defaults = {
    'company_id': -1.0,
    'formatted_experience_level': "Not Specified",
    'industry': "Not Specified",
    'name': "Not Specified",
    'cleaned_description': "",
}
for column_name, default_value in missing_value_defaults.items():
    data[column_name] = data[column_name].fillna(default_value)

# With the NaNs gone the ids can be stored as integers.
data['company_id'] = data['company_id'].astype(int)

# Preview the first ten rows.
data[:10]

Unnamed: 0,company_id,name,industry,cleaned_title,cleaned_description,work_type,location,formatted_experience_level,standardized_annual_salary,salary_level,title_emb
0,553718,HearingLife,Retail,hearing care provider,overview HearingLife national hearing care com...,FULL_TIME,"Little River, SC",Entry level,63000.0,65K - 70K,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
1,18213359,Episcopal Communities & Services,Non-profit Organization Management,cook,descriptiontitle look great opportunity develo...,FULL_TIME,"Aliso Viejo, CA",Entry level,42758.4,45K - 50K,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
2,18213359,Episcopal Communities & Services,Non-profit Organizations,cook,descriptiontitle look great opportunity develo...,FULL_TIME,"Aliso Viejo, CA",Entry level,42758.4,45K - 50K,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
3,437225,"iHerb, LLC",Retail,principal cloud security architect remote,Job Summary iHerb mission health wellness acce...,FULL_TIME,United States,Mid-Senior level,240895.0,150K+,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
4,18213359,Episcopal Communities & Services,Non-profit Organization Management,dishwasher,descriptiontitle sign Bonus guarantee look foo...,FULL_TIME,"Aliso Viejo, CA",Entry level,37056.0,40K - 45K,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
5,18213359,Episcopal Communities & Services,Non-profit Organizations,dishwasher,descriptiontitle sign Bonus guarantee look foo...,FULL_TIME,"Aliso Viejo, CA",Entry level,37056.0,40K - 45K,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
6,19181907,Escalent,Market Research,insights analyst auto industry,Escalent award win data analytic advisory firm...,FULL_TIME,United States,Entry level,61000.0,65K - 70K,[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. ...
7,73013724,J. Galt,Financial Services,sales manager,Position Summary Sales Manager manage partnerr...,FULL_TIME,"Coeur d'Alene, ID",Mid-Senior level,237500.0,150K+,[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
8,18213359,Episcopal Communities & Services,Non-profit Organization Management,custodian janitor,descriptiontitle come work Custodian great com...,FULL_TIME,"Altadena, CA",Entry level,37056.0,40K - 45K,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
9,18213359,Episcopal Communities & Services,Non-profit Organizations,custodian janitor,descriptiontitle come work Custodian great com...,FULL_TIME,"Altadena, CA",Entry level,37056.0,40K - 45K,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


In [7]:
# Pull the cleaned description text out of the frame as a NumPy array.
descriptions = data['cleaned_description'].to_numpy()

# Tokenizer settings, collected in one place:
#   add_special_tokens - wrap each sequence with the [CLS] / [SEP] markers
#   truncation         - cut sequences that exceed the model's maximum length
#   padding            - pad the shorter sequences up to the longest one
#   return_tensors     - hand back PyTorch tensors rather than Python lists
tokenizer_options = dict(
    add_special_tokens=True,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Encode every description in a single call.
encodings = tokenizer.batch_encode_plus(descriptions, **tokenizer_options)

In [8]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Put the encoder and both token-level tensors on that device.
model = model.to(device)
input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# Show the shape of the first five encoded rows.
input_ids[:5].shape

torch.Size([5, 512])

In [24]:
# Number of descriptions pushed through the encoder per forward pass.
batch_size = 512


def _masked_mean(hidden_states, mask):
    """Average token embeddings over real tokens only.

    Args:
        hidden_states: encoder output indexed as (batch, token, feature).
        mask: attention mask indexed as (batch, token); 1 marks a real token,
            0 marks padding.

    Returns:
        One vector per sequence: the mean of the embeddings whose mask entry
        is non-zero.
    """
    weights = mask.unsqueeze(-1).to(hidden_states.dtype)
    # Guard against a division by zero should a row ever be fully masked.
    token_counts = weights.sum(dim=1).clamp(min=1)
    return (hidden_states * weights).sum(dim=1) / token_counts


description_mean_embs = []
description_cls_embs = []
# Inference only: no gradients are needed, which also keeps memory use down.
with torch.no_grad():
    for i in range(0, len(input_ids), batch_size):
        batch_ids = input_ids[i:i+batch_size]
        batch_mask = attention_mask[i:i+batch_size]

        outputs = model(batch_ids, attention_mask=batch_mask)
        word_embeddings = outputs.last_hidden_state

        # Every sequence was padded to a common length, so a plain mean over
        # the token axis would mix [PAD] states into the average; weight by
        # the attention mask instead.
        description_mean_embs.append(_masked_mean(word_embeddings, batch_mask).cpu())
        # Position 0 holds the [CLS] token added by the tokenizer.
        description_cls_embs.append(word_embeddings[:, 0, :].cpu())

        # Release cached GPU blocks between batches.
        torch.cuda.empty_cache()

        # Progress marker: offset of the batch that just finished.
        print(i)

0
512
1024
1536
2048
2560
3072
3584
4096
4608
5120
5632
6144
6656
7168
7680
8192
8704
9216
9728
10240
10752
11264
11776
12288
12800
13312
13824
14336
14848
15360
15872
16384
16896
17408
17920
18432


In [25]:
# Stitch the per-batch CPU tensors back together along the row axis, one
# tensor for each pooling strategy (mean-pooled first, then [CLS]).
description_mean_embs_tensor, description_cls_embs_tensor = (
    torch.cat(batch_outputs, dim=0)
    for batch_outputs in (description_mean_embs, description_cls_embs)
)

In [29]:
# Store each embedding matrix on the frame as a column of plain Python lists,
# one list per posting.
for emb_column, emb_tensor in (
    ('description_mean_emb', description_mean_embs_tensor),
    ('description_cls_emb', description_cls_embs_tensor),
):
    data[emb_column] = emb_tensor.tolist()

In [31]:
# Persist the frame, now carrying both embedding columns, without writing the
# row index as an extra column.
embeddings_csv_path = "../data/processed_job_postings_with_embeddings.csv"
data.to_csv(embeddings_csv_path, index=False)

In [59]:
# Columns that make up a model input, and the column holding the target.
input_cols = ['name', 'industry', 'cleaned_title', 'cleaned_description', 'work_type', 'location', 'formatted_experience_level', 'title_emb']
target_col = 'standardized_annual_salary'

# Build one (features, target) pair per posting; `features` is the row
# restricted to the input columns, `target` the row's salary value.
organized_data = []
for row_label, posting in data.iterrows():
    features = posting[input_cols]
    target = posting[target_col]
    organized_data.append((features, target))

# Preview the first ten pairs.
organized_data[:10]

[(name                                                                HearingLife
  industry                                                                 Retail
  cleaned_title                                             hearing care provider
  cleaned_description           overview HearingLife national hearing care com...
  work_type                                                             FULL_TIME
  location                                                       Little River, SC
  formatted_experience_level                                          Entry level
  title_emb                     [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
  Name: 0, dtype: object,
  63000.0),
 (name                                           Episcopal Communities & Services
  industry                                     Non-profit Organization Management
  cleaned_title                                                              cook
  cleaned_description           descriptiontitle look great 

In [62]:
# Inspect the container type of the features half of the first pair.
type(organized_data[0][0])

pandas.core.series.Series