In [19]:
import numpy as np
import pandas as pd

# Load data
# Load the dataset, selecting only the specified columns
cols = ['company_id', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'work_type', 'formatted_experience_level']
data = pd.read_csv("./raw_data/job_postings.csv", usecols=cols)

company_industries = pd.read_csv("./raw_data/company_details/company_industries.csv")
companies = pd.read_csv("./raw_data/company_details/companies.csv")


# Filter out entries lacking salary data
data = data.dropna(how="all", subset=['max_salary','med_salary', 'min_salary'])

In [None]:
data

Unnamed: 0,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,formatted_experience_level,work_type
0,553718.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.00,,MONTHLY,Full-time,"Little River, SC",Entry level,FULL_TIME
3,18213359.0,Cook,descriptionTitle\n\n Looking for a great oppor...,,22.27,,HOURLY,Full-time,"Aliso Viejo, CA",Entry level,FULL_TIME
4,437225.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",275834.0,,205956.0,YEARLY,Full-time,United States,Mid-Senior level,FULL_TIME
8,18213359.0,Dishwasher,"descriptionTitle\n\n $2,000 Sign-on Bonus Guar...",,19.30,,HOURLY,Full-time,"Aliso Viejo, CA",Entry level,FULL_TIME
11,19181907.0,Insights Analyst - Auto Industry,Who We Are\n\nEscalent is an award-winning dat...,64000.0,,58000.0,YEARLY,Full-time,United States,Entry level,FULL_TIME
...,...,...,...,...,...,...,...,...,...,...,...
33231,,Mental Health Practitioners,Gail M. Yost and Associates is hiring full tim...,,100000.00,,YEARLY,Full-time,"Minneapolis, MN",,FULL_TIME
33233,61469.0,Registered Nurse (RN) Vaccinator,United Staffing Solutions is partnering with o...,50.0,,50.0,HOURLY,Part-time,"Muskegon, MI",,PART_TIME
33236,3894635.0,Office Associate,Provide clerical and administrative support to...,42000.0,,37000.0,YEARLY,Full-time,"Albany, GA",,FULL_TIME
33244,,Licensed Insurance Agent,While many industries were hurt by the last fe...,52000.0,,45760.0,YEARLY,Full-time,"Chico, CA",,FULL_TIME


In [20]:
from data_processing import standardize_salary
  
"""Standardize salary"""  
data['standardized_annual_salary'] = data.apply(standardize_salary, axis=1)


In [21]:
"""Label rows by salary interval"""  
bins = [0] + list(range(10000, 160000, 10000)) + [float('inf')]
labels = ['10K-'] + [f"{i}K - {i + 10}K" for i in range(10, 150, 10)] + ['160K+']

data['salary_level'] = pd.cut(data['standardized_annual_salary'], bins=bins, labels=labels, right=False)

In [22]:
data

Unnamed: 0,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,formatted_experience_level,work_type,standardized_annual_salary,salary_level
0,553718.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.00,,MONTHLY,Full-time,"Little River, SC",Entry level,FULL_TIME,63000.0,60K - 70K
3,18213359.0,Cook,descriptionTitle\n\n Looking for a great oppor...,,22.27,,HOURLY,Full-time,"Aliso Viejo, CA",Entry level,FULL_TIME,42758.4,40K - 50K
4,437225.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",275834.0,,205956.0,YEARLY,Full-time,United States,Mid-Senior level,FULL_TIME,240895.0,160K+
8,18213359.0,Dishwasher,"descriptionTitle\n\n $2,000 Sign-on Bonus Guar...",,19.30,,HOURLY,Full-time,"Aliso Viejo, CA",Entry level,FULL_TIME,37056.0,30K - 40K
11,19181907.0,Insights Analyst - Auto Industry,Who We Are\n\nEscalent is an award-winning dat...,64000.0,,58000.0,YEARLY,Full-time,United States,Entry level,FULL_TIME,61000.0,60K - 70K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33231,,Mental Health Practitioners,Gail M. Yost and Associates is hiring full tim...,,100000.00,,YEARLY,Full-time,"Minneapolis, MN",,FULL_TIME,100000.0,100K - 110K
33233,61469.0,Registered Nurse (RN) Vaccinator,United Staffing Solutions is partnering with o...,50.0,,50.0,HOURLY,Part-time,"Muskegon, MI",,PART_TIME,96000.0,90K - 100K
33236,3894635.0,Office Associate,Provide clerical and administrative support to...,42000.0,,37000.0,YEARLY,Full-time,"Albany, GA",,FULL_TIME,39500.0,30K - 40K
33244,,Licensed Insurance Agent,While many industries were hurt by the last fe...,52000.0,,45760.0,YEARLY,Full-time,"Chico, CA",,FULL_TIME,48880.0,40K - 50K


In [None]:
from src.data_processing.data_processing import clean_description
import spacy

"""Clean descriptions"""  
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
data['cleaned_description'] = data['description'].apply(lambda x: clean_description(x, nlp))


In [None]:
from src.data_processing.data_processing import clean_title

"""Clean title"""  
data['cleaned_title'] = data['title'].apply(lambda x: clean_title(x, nlp))

In [None]:
from src.data_processing.data_processing import generate_vocabulary
from src.data_processing.data_processing import create_title_emb
 
"""Create embedding for title"""  
vocab = generate_vocabulary(data['cleaned_title'])
word_to_vec = {word: np.eye(len(vocab))[i] for i, word in enumerate(vocab)}
data['title_emb'] = data['cleaned_title'].apply(lambda x: create_title_emb(x, word_to_vec))

In [None]:
data['title_emb']

0        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                               ...                        
18452    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18453    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18454    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
18455    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18456    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: title_emb, Length: 18457, dtype: object

In [None]:
# Merge company data
data = data.merge(companies, on='company_id', how='left')
data = data.merge(company_industries, on='company_id', how='left')



final_processed_data = data[[
    'company_id', 'name', 'industry', 'cleaned_title', 'cleaned_description', 'work_type',
    'location', 'formatted_experience_level', 'standardized_annual_salary', 'salary_level', 'title_emb'
]]

final_processed_data.to_csv("../data/processed_job_postings.csv", index=False)

In [26]:
final_processed_data[:15]

Unnamed: 0,company_id,name,industry,cleaned_title,cleaned_description,work_type,location,formatted_experience_level,standardized_annual_salary,salary_level,title_emb
0,553718.0,HearingLife,Retail,hearing care provider,overview HearingLife national hearing care com...,FULL_TIME,"Little River, SC",Entry level,63000.0,65K - 70K,
1,18213359.0,Episcopal Communities & Services,Non-profit Organization Management,cook,descriptiontitle look great opportunity develo...,FULL_TIME,"Aliso Viejo, CA",Entry level,42758.4,45K - 50K,
2,18213359.0,Episcopal Communities & Services,Non-profit Organizations,cook,descriptiontitle look great opportunity develo...,FULL_TIME,"Aliso Viejo, CA",Entry level,42758.4,45K - 50K,
3,437225.0,"iHerb, LLC",Retail,principal cloud security architect remote,Job Summary iHerb mission health wellness acce...,FULL_TIME,United States,Mid-Senior level,240895.0,150K+,
4,18213359.0,Episcopal Communities & Services,Non-profit Organization Management,dishwasher,descriptiontitle sign Bonus guarantee look foo...,FULL_TIME,"Aliso Viejo, CA",Entry level,37056.0,40K - 45K,
...,...,...,...,...,...,...,...,...,...,...,...
18452,,,,mental health practitioners,Gail Yost Associates hire time license mental ...,FULL_TIME,"Minneapolis, MN",,100000.0,100K - 105K,
18453,61469.0,United Staffing Solutions (USS),Staffing & Recruiting,registered nurse rn vaccinator,United Staffing Solutions partner big covid va...,PART_TIME,"Muskegon, MI",,96000.0,100K - 105K,
18454,3894635.0,Sunnyland Farms,Retail,office associate,provide clerical administrative support manage...,FULL_TIME,"Albany, GA",,39500.0,40K - 45K,
18455,,,,licensed insurance agent,industry hurt year people need insurance posit...,FULL_TIME,"Chico, CA",,48880.0,50K - 55K,
