In [6]:
import numpy as np
import pandas as pd

# Load data
# Load the dataset, selecting only the specified columns
data = pd.read_csv("../data/processed_job_postings_large.csv")

In [8]:
data.isna().sum()


company_id                    12658
name                            385
industry                      11949
cleaned_title                     0
cleaned_description               0
work_type                      7955
location                        208
formatted_experience_level    16923
standardized_annual_salary        0
salary_level                      0
title_emb                         0
dtype: int64

In [3]:
data.dropna(subset=['formatted_experience_level', 'work_type', 'industry'], inplace=True)
data.to_csv("../data/processed_job_postings_large_noNA.csv", index=False)

In [9]:
"""Label rows by salary interval"""  
bins = [0] + list(range(10000, 160000, 10000)) + [float('inf')]
labels = ['10K-'] + [f"{i}K - {i + 10}K" for i in range(10, 150, 10)] + ['160K+']

data['salary_level'] = pd.cut(data['standardized_annual_salary'], bins=bins, labels=labels, right=False)

In [11]:
data.to_csv("../data/processed_job_postings_large_noNA_labeled.csv", index=False)

In [None]:
from src.data_processing.data_processing import clean_description
import spacy

"""Clean descriptions"""  
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
data['cleaned_description'] = data['description'].apply(lambda x: clean_description(x, nlp))


In [None]:
from src.data_processing.data_processing import clean_title

"""Clean title"""  
data['cleaned_title'] = data['title'].apply(lambda x: clean_title(x, nlp))

In [None]:
from src.data_processing.data_processing import generate_vocabulary
from src.data_processing.data_processing import create_title_emb
 
"""Create embedding for title"""  
vocab = generate_vocabulary(data['cleaned_title'])
word_to_vec = {word: np.eye(len(vocab))[i] for i, word in enumerate(vocab)}
data['title_emb'] = data['cleaned_title'].apply(lambda x: create_title_emb(x, word_to_vec))

In [None]:
data['title_emb']

0        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                               ...                        
18452    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18453    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18454    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
18455    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18456    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: title_emb, Length: 18457, dtype: object

In [None]:

final_processed_data.to_csv("../data/processed_job_postings.csv", index=False)

In [26]:
final_processed_data[:15]

Unnamed: 0,company_id,name,industry,cleaned_title,cleaned_description,work_type,location,formatted_experience_level,standardized_annual_salary,salary_level,title_emb
0,553718.0,HearingLife,Retail,hearing care provider,overview HearingLife national hearing care com...,FULL_TIME,"Little River, SC",Entry level,63000.0,65K - 70K,
1,18213359.0,Episcopal Communities & Services,Non-profit Organization Management,cook,descriptiontitle look great opportunity develo...,FULL_TIME,"Aliso Viejo, CA",Entry level,42758.4,45K - 50K,
2,18213359.0,Episcopal Communities & Services,Non-profit Organizations,cook,descriptiontitle look great opportunity develo...,FULL_TIME,"Aliso Viejo, CA",Entry level,42758.4,45K - 50K,
3,437225.0,"iHerb, LLC",Retail,principal cloud security architect remote,Job Summary iHerb mission health wellness acce...,FULL_TIME,United States,Mid-Senior level,240895.0,150K+,
4,18213359.0,Episcopal Communities & Services,Non-profit Organization Management,dishwasher,descriptiontitle sign Bonus guarantee look foo...,FULL_TIME,"Aliso Viejo, CA",Entry level,37056.0,40K - 45K,
...,...,...,...,...,...,...,...,...,...,...,...
18452,,,,mental health practitioners,Gail Yost Associates hire time license mental ...,FULL_TIME,"Minneapolis, MN",,100000.0,100K - 105K,
18453,61469.0,United Staffing Solutions (USS),Staffing & Recruiting,registered nurse rn vaccinator,United Staffing Solutions partner big covid va...,PART_TIME,"Muskegon, MI",,96000.0,100K - 105K,
18454,3894635.0,Sunnyland Farms,Retail,office associate,provide clerical administrative support manage...,FULL_TIME,"Albany, GA",,39500.0,40K - 45K,
18455,,,,licensed insurance agent,industry hurt year people need insurance posit...,FULL_TIME,"Chico, CA",,48880.0,50K - 55K,
