In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

### TF-IDF Vectorizer Version

In [11]:
# Load the job postings table. The cells in this notebook read its
# "job_title" and "search_country" columns.
job_skills_df = pd.read_csv("./data/linkedin_job_postings.csv")

In [12]:
# Reference occupations: the "Occupation" column supplies the target labels
# that job titles are mapped onto further down.
normal_jobs_df = pd.read_csv("./data/All_Occupations.csv")
normal_jobs_list = normal_jobs_df["Occupation"].tolist()

In [27]:
# Sample for the TF-IDF experiment: the titles of the first 1000 rows,
# in file order and with repeated titles kept.
job_titles_for_mapping = job_skills_df["job_title"].iloc[:1000].tolist()

In [38]:
# How often each distinct job title occurs, most frequent first.
job_skills_df["job_title"].value_counts()

job_title
LEAD SALES ASSOCIATE-FT                                                                                    7325
Shift Manager                                                                                              5818
First Year Tax Professional                                                                                5356
Assistant Manager                                                                                          5346
Customer Service Representative                                                                            5203
                                                                                                           ... 
Assistant Salon Manager - Villages at Waterville Landing                                                      1
Senior Specialist, Program Scheduler                                                                          1
Retail Store Associate Warehouse Part Time Homesense - Sherwood Park Square                   

In [None]:
# Fit TF-IDF on the job titles only. The vocabulary learned here is the one
# the known positions are projected into via vectorizer.transform().
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(job_titles_for_mapping)

# Low-rank projection of the TF-IDF matrix. TruncatedSVD's default solver is
# randomized, so the seed is pinned to make reduced_vectors reproducible
# across Restart & Run All.
# NOTE(review): the similarity cell in this section uses tfidf_vectors, not
# reduced_vectors - confirm whether this projection is still needed.
svd = TruncatedSVD(n_components=100, random_state=42)
reduced_vectors = svd.fit_transform(tfidf_vectors)

In [None]:
# The reference occupations are the mapping targets. They are transformed
# with the vectorizer already fitted on the job titles (no refit), so both
# sides share one vocabulary.
known_positions = normal_jobs_list
known_vectors = vectorizer.transform(known_positions)

In [20]:
import json

In [30]:
# calculate similarity
similarity = cosine_similarity(tfidf_vectors, known_vectors)

# get mapping
mapped_positions = {job_titles_for_mapping[i]: known_positions[np.argmax(sim)] for i, sim in enumerate(similarity)}

with open("result_tfidf.json", "w") as f:
    json.dump(mapped_positions, f)

### Sentence Transformer Version

In [31]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# NOTE(review): exploratory scratch, currently disabled. The tuple stored as
# this cell's output presumably comes from an earlier run in which the last
# line was live - TODO confirm, then restore or delete the cell.
# The two numbers on that last line are not directly comparable: the first
# counts unique US job titles, the second counts all rows of job_skills_df.
# job_titles = list(job_skills_df["job_title"].unique())[:1000]
# known_positions = normal_jobs_list
# usa_df = job_skills_df[job_skills_df["search_country"] == "United States"]
# len(usa_df["job_title"].unique()), len(job_skills_df["job_title"])

(495565, 1348454)

In [None]:
# Load the model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [45]:
# Restrict to US postings and keep up to 100,000 distinct titles, in order
# of first appearance.
usa_df = job_skills_df[job_skills_df["search_country"] == "United States"]

# unique() reports a missing title as NaN, which is a float rather than text
# and is not a valid input for the sentence encoder, so missing titles are
# dropped first. When the column has no missing values this is a no-op.
job_titles = list(usa_df["job_title"].dropna().unique()[:100000])

In [None]:
# Embed the job titles chunk by chunk (each chunk shows its own progress bar)
# and collect the per-title vectors in one flat list.
batch_size = 10000
job_title_embeddings = []

for start in range(0, len(job_titles), batch_size):
    chunk = job_titles[start:start + batch_size]
    job_title_embeddings.extend(model.encode(chunk, show_progress_bar=True))

# known_positions is the list defined in the TF-IDF section of this notebook.
# It is embedded here in a single call.
known_position_embeddings = model.encode(known_positions)

# Rows are job titles, columns are known positions.
similarity = cosine_similarity(job_title_embeddings, known_position_embeddings)

# For every title, pick the known position with the highest similarity.
closest = np.argmax(similarity, axis=1)
mapped_positions = {
    title: known_positions[idx] for title, idx in zip(job_titles, closest)
}

with open("result_mini_llm.json", "w") as f:
    json.dump(mapped_positions, f)

Iteration 0/5


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Iteration 1/5


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Iteration 2/5


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Iteration 3/5


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Iteration 4/5


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Iteration 5/5


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Iteration 6/5


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

KeyboardInterrupt: 