In [3]:
import datasets
from datasets import load_dataset
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np # linear algebra
import pandas as pd 
import os

In [4]:
for dirname, _, filenames in os.walk('./data/FinalProject/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/FinalProject/.DS_Store
./data/FinalProject/job_postings.csv
./data/FinalProject/maps/skills.csv
./data/FinalProject/maps/industries.csv
./data/FinalProject/company_details/company_industries.csv
./data/FinalProject/company_details/company_specialities.csv
./data/FinalProject/company_details/companies.csv
./data/FinalProject/company_details/employee_counts.csv
./data/FinalProject/job_details/benefits.csv
./data/FinalProject/job_details/salaries.csv
./data/FinalProject/job_details/job_industries.csv
./data/FinalProject/job_details/job_skills.csv


In [5]:
datasets.__version__

'2.14.6'

In [6]:
dataset_job_listing = load_dataset("csv", data_files="./data/FinalProject/job_postings.csv")
dataset_job_skills = load_dataset("csv", data_files="./data/FinalProject/job_details/job_skills.csv")
dataset_job_benefits = load_dataset("csv", data_files="./data/FinalProject/job_details/benefits.csv")

In [7]:
dataset_job_listing

DatasetDict({
    train: Dataset({
        features: ['job_id', 'company_id', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'applies', 'original_listed_time', 'remote_allowed', 'views', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'scraped'],
        num_rows: 33246
    })
})

In [8]:
# data preview
dataset_job_listing.set_format("pandas")
dataset_job_listing["train"][:5]

Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,scraped
0,3757940104,553718.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,,5250.0,,MONTHLY,Full-time,"Little River, SC",...,,Entry level,,1699090000000.0,careers-demant.icims.com,0,FULL_TIME,USD,BASE_SALARY,1699138101
1,3757940025,2192142.0,Shipping & Receiving Associate 2nd shift (Beav...,Metalcraft of Mayville\nMetalcraft of Mayville...,,,,,Full-time,"Beaver Dam, WI",...,,,,1699080000000.0,www.click2apply.net,0,FULL_TIME,,,1699085420
2,3757938019,474443.0,"Manager, Engineering",\nThe TSUBAKI name is synonymous with excellen...,,,,,Full-time,"Bessemer, AL",...,,,Bachelor's Degree in Mechanical Engineering pr...,1699080000000.0,www.click2apply.net,0,FULL_TIME,,,1699085644
3,3757938018,18213359.0,Cook,descriptionTitle\n\n Looking for a great oppor...,,22.27,,HOURLY,Full-time,"Aliso Viejo, CA",...,,Entry level,,1699080000000.0,jobs.apploi.com,0,FULL_TIME,USD,BASE_SALARY,1699087461
4,3757937095,437225.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",275834.0,,205956.0,YEARLY,Full-time,United States,...,,Mid-Senior level,,1699090000000.0,careers.iherb.com,0,FULL_TIME,USD,BASE_SALARY,1699085346


In [9]:
dataset_job_skills.set_format("pandas")
dataset_job_skills["train"][:5]

Unnamed: 0,job_id,skill_abr
0,3690843087,ACCT
1,3690843087,FIN
2,3691763971,MGMT
3,3691763971,MNFC
4,3691775263,MGMT


In [10]:
dataset_job_benefits.set_format("pandas")
dataset_job_benefits["train"][:5]

Unnamed: 0,job_id,inferred,type
0,3690843087,0,Medical insurance
1,3690843087,0,Dental insurance
2,3690843087,0,401(k)
3,3690843087,0,Paid maternity leave
4,3690843087,0,Disability insurance


In [11]:
dataset_job_benefits = dataset_job_benefits.rename_column(
    original_column_name="type", new_column_name="job_benefits"
)
dataset_job_benefits

DatasetDict({
    train: Dataset({
        features: ['job_id', 'inferred', 'job_benefits'],
        num_rows: 29325
    })
})

## Single Sample Inspection

In [12]:
dataset_job_listing.set_format("numpy")
# dataset_job_listing["train"]["job_id" == 3690843087]
val = dataset_job_listing["train"].filter(lambda x: x["job_id"] == 3690843087)
val[0]

{'job_id': 3690843087,
 'company_id': 6049228.0,
 'title': 'Real Estate Staff Accountant',
 'description': 'Job Description:The Staff Accountant will be responsible for the timely and accurate preparation of financial statements, variance reports, and balance sheet reconciliation for assigned properties. Assist accounting group with general journal entries, bank reconciliations, weekly cash reporting and special projects. Responsibilities include but are not limited to:Prepare monthly financial statements for a portfolio of properties and produce variance reports based on the analyzed results.Maintain schedules supporting balance sheet accounts, including mortgage statement reconciliations, capital schedules and bank reconciliations and submit to VP, Accounting for review.Prepare and post adjusting journal entries to the general ledger.Review income, expense, and capital purchases for accurate coding and payment.Perform bank reconciliations and maintain fixed asset tracking systems.Pre

## How many remote jobs are available?

In [13]:
dataset_job_listing.set_format("pandas")
dataset_job_listing["train"]["remote_allowed"].value_counts(dropna=False)

NaN    28444
1.0     4802
Name: remote_allowed, dtype: int64

# Merge company name to job listing

In [14]:
dataset_company = load_dataset("csv", data_files="./data/FinalProject/company_details/companies.csv")
dataset_company.set_format("pandas")
dataset_company["train"][:5]

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url
0,1009,IBM,"At IBM, we do more than work. We create. We cr...",7.0,NY,US,"Armonk, New York",10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,0,US,Chicago,0,-,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,"GE Power, part of GE Vernova, is a world energ...",7.0,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,We’re a cloud technology company that provides...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle


In [15]:
dataset_company["train"].filter(lambda x: x["company_id"] == 1009)[0]["name"][0]

'IBM'

In [16]:
def add_company_name(sample):
    if sample["company_id"] is not None:
        return {"company_name": dataset_company["train"].filter(lambda x: x["company_id"] == 1009)[0]["name"][0]}
#         return {"company_name": "alpha"}
    else:
        return {"company_name": None}

# Embedding of Descriptions

In [17]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "BAAI/bge-large-en" # can choose different checkpoint for sentence similarity "https://huggingface.co/models?pipeline_tag=sentence-similarity"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [18]:
import torch

try:
    device = torch.device("cuda")
    model.to(device)
    print("Using CUDA")
except Exception as e:
    print(e)
    device = torch.device("cpu")
    model.to(device)
    print("Using CPU")

Torch not compiled with CUDA enabled
Using CPU


In [19]:
# get embedding by pooling the output of last later
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [20]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/mani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [22]:
def get_embeddings(job_listing):
    description_without_stopwords = ' '.join([word for word in job_listing["description"][0].split() if word.lower() not in stop_words])
    # encoded_input = tokenizer(
    #     job_listing["description"][0], padding=True, truncation=True, return_tensors="pt"
    # )
    encoded_input = tokenizer(
        description_without_stopwords, 
        padding=True, 
        truncation=True, 
        return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [23]:
embedding = get_embeddings(dataset_job_listing["train"][0])


In [24]:
embedding.detach().cpu().numpy()[0]

array([-0.19017096, -0.04894827, -0.99637604, ..., -0.24494602,
       -0.38430423,  0.26039067], dtype=float32)

# Train-Validation Split

In [25]:
dataset_job_listing_splited = dataset_job_listing["train"].train_test_split(train_size=0.70, seed=42)
# Rename the default "test" split to "validation"
dataset_job_listing_splited["validation"] = dataset_job_listing_splited.pop("test")
# Add the "test" set to our `DatasetDict`
dataset_job_listing_splited

DatasetDict({
    train: Dataset({
        features: ['job_id', 'company_id', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'applies', 'original_listed_time', 'remote_allowed', 'views', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'scraped'],
        num_rows: 32913
    })
    validation: Dataset({
        features: ['job_id', 'company_id', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'applies', 'original_listed_time', 'remote_allowed', 'views', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'scraped'],
    

In [26]:
dataset_job_listing_splited["validation"]

Dataset({
    features: ['job_id', 'company_id', 'title', 'description', 'max_salary', 'med_salary', 'min_salary', 'pay_period', 'formatted_work_type', 'location', 'applies', 'original_listed_time', 'remote_allowed', 'views', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'scraped'],
    num_rows: 333
})

In [25]:
embeddings_dataset = dataset_job_listing_splited["validation"].map(
    lambda x: {**{feature: x[feature][0] for feature in x},"embeddings": get_embeddings(x).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/333 [00:00<?, ? examples/s]

In [38]:
type(embeddings_dataset)
from datasets import load_dataset
edf = embeddings_dataset.to_pandas()
edf.to_csv("./out/FinalProject/embeddings_dataset.csv",index=False)

In [27]:
import pickle

# Save embeddings to a file
def save_embeddings(embeddings, filename):
    with open(filename, 'wb') as file:
        pickle.dump(embeddings, file)

# Load embeddings from a file
def load_embeddings(filename):
    with open(filename, 'rb') as file:
        embeddings = pickle.load(file)
    return embeddings

In [32]:
save_embeddings(embeddings_dataset, './out/FinalProject/embeddings.pkl')

In [28]:
embeddings_dataset = load_embeddings('./out/FinalProject/embeddings.pkl')

In [29]:
def search_jobs(search_query, embeddings = embeddings_dataset["embeddings"], k=5):
    # embedding search query
    question = {"description": [search_query]} # similar to the job description from our validation set
    question_embedding = get_embeddings(question).cpu().detach().numpy()
    print(question_embedding)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print(embeddings)
    # finding similari embeddings
    similarity_scores = list()
    for e in embeddings:
        similarity = 1 - cosine(question_embedding[0], e)
        similarity_scores.append(similarity)
    similarity_scores = np.array(similarity_scores)
    ranks = np.argsort(similarity_scores)
    ranks = ranks[::-1] # revers
    return ranks[:k]

In [30]:
embeddings_dataset.set_format("pandas")

In [31]:
ranks = search_jobs("I need a job for Graduate in data engineering field", k=15)
ranks

[[-0.6668177  -0.1992101  -0.70716935 ... -0.46833998 -0.3380486
   0.1957601 ]]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0      [-0.20165858, -0.1898982, 0.029408064, -0.0751...
1      [0.10283324, 0.32933587, -0.967921, -0.1749288...
2      [0.17482142, -0.12671983, -0.7530961, 0.296593...
3      [-0.25011727, 0.22052401, -1.2164526, 0.762486...
4      [0.2697214, 0.05615599, -0.99282575, 0.3783834...
                             ...                        
328    [-0.30821782, 0.1583257, -0.7329063, 0.7701879...
329    [-0.2622648, 0.15121868, -0.53283894, 0.393095...
330    [0.09616993, -0.22489917, -0.61583275, 0.52114...
331    [0.07531997, -0.3158663, -0.87063944, 0.610536...
332    [-0.09819049, -0.11728428, -0.7926496, 0.56446...
Name: embeddings, Length: 333, dtype: object


array([200,   0, 220, 211, 195, 121, 215,  31, 317, 194,  11, 189, 165,
       268, 122])

In [32]:
data = embeddings_dataset[:].iloc[ranks]

In [33]:
data.head(5)

Unnamed: 0,job_id,company_id,title,description,max_salary,med_salary,min_salary,pay_period,formatted_work_type,location,...,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,scraped,embeddings
200,3757493445,71939.0,Software Engineer,A great opportunity to work with one of our le...,75.0,,73.0,HOURLY,Contract,United States,...,Mid-Senior level,,1699050000000.0,,0,CONTRACT,USD,BASE_SALARY,1699057868,"[-0.22098792, -0.15475607, -1.141057, -0.18972..."
0,3701300194,16690.0,Director Data Architect (20+ years relevant ex...,"Job Title: Director, Data ArchitectLocation: F...",,,,,Contract,"Boston, MA",...,Director,,1692730000000.0,,0,CONTRACT,,,1,"[-0.20165858, -0.1898982, 0.029408064, -0.0751..."
220,3694101880,10801655.0,Senior Data Engineer (Public Sector),Publicis Sapient | Public Sector is looking fo...,135000.0,,108000.0,YEARLY,Full-time,"Arlington, VA",...,Mid-Senior level,,1692840000000.0,,0,FULL_TIME,USD,BASE_SALARY,1,"[-0.33632556, -0.2524207, -0.20332187, 0.45060..."
211,3701323814,11419009.0,Client Analytics Manager,"Hours: Full-time, 40hrs per week, Monday - Fri...",125000.0,,110000.0,YEARLY,Full-time,United States,...,Mid-Senior level,,1692740000000.0,,0,FULL_TIME,USD,BASE_SALARY,1,"[0.030233672, 0.14783004, -0.8847311, 0.163758..."
195,3757453047,18583501.0,Senior/Staff Backend Engineer @ Siftstack.io,SOFTWARE ENGINEER \nAt Sift (https://www.sifts...,,,,,Full-time,"Los Angeles, CA",...,Mid-Senior level,,1699040000000.0,,0,FULL_TIME,,,1699042973,"[-0.6543105, -0.22370934, -0.53103113, 0.06223..."


In [34]:
data.columns

Index(['job_id', 'company_id', 'title', 'description', 'max_salary',
       'med_salary', 'min_salary', 'pay_period', 'formatted_work_type',
       'location', 'applies', 'original_listed_time', 'remote_allowed',
       'views', 'job_posting_url', 'application_url', 'application_type',
       'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc',
       'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'scraped', 'embeddings'],
      dtype='object')

In [35]:
columns1 = ['job_id','title','description','formatted_work_type','location','remote_allowed','job_posting_url','application_url','formatted_experience_level','sponsored','work_type']

In [36]:
data_sub = data[columns1]

In [50]:
res = data_sub.to_json(orient='records', lines=True)

In [51]:
import json
# Parse each JSON object and store in a list
res = res.strip().split('\n')
json_dicts = []
for idx, obj in enumerate(res, start=1):
    try:
        json_dict = json.loads(obj)
        json_dicts.append(json_dict)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON object {idx}: {e}")
        print(f"Problematic JSON object {idx}: {obj}")

In [53]:
json_dicts[0]

{'job_id': 3757493445,
 'title': 'Software Engineer',
 'description': 'A great opportunity to work with one of our leading Clients.\nSoftware EngineerLocation:- Remote - USADuration :- 12 Months\nResponsibilities of Software EngineerActively participate in modernization of legacy application in moving application from Mainframe to distributed systems Develop subject matter expertise in File One Data Load by providing analysis/design, code, testing and documentation Work with Cross-Functional teams to program, validate and co-ordinate software releasesProvide production application support in a 24x7 environment. Provide technical direction assistance to development and testing staff.Participate in or conduct code reviews and provide program improvement recommendations. \nRequirements for Software EngineerDrive architecture decisions in collaboration with software architects, security team, and hardware teamsProvide analysis and coding for complex software assignments.Possess proficient 