In [5]:
#for manipulating with the dataframe
import pandas as pd
#to manipulate with data
import numpy as np
# Configuration
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
# For splitting the data into train and test
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import nltk
import time

In [6]:
df_skills = pd.read_csv('../data/job_skills.csv')
df_postings = pd.read_csv('../data/linkedin_job_postings.csv')

In [7]:
# Delete data with empty data
df_skills.dropna(inplace=True)
df_postings.dropna(inplace=True)

In [8]:
df_postings.drop(columns = ['last_processed_time','first_seen'],inplace = True)

In [9]:
df_postings.drop(columns = ['got_summary','got_ner','is_being_worked'],inplace = True)

In [10]:
df_merged = pd.merge(df_postings, df_skills, on='job_link', how = 'inner')
df_merged.drop(['job_link', 'job_title','company','job_location','search_country','search_city','job_type'], axis=1, inplace=True)

In [11]:
df_merged['search_position'] = df_merged['search_position'].str.lower()
df_merged['search_position'] = df_merged['search_position'].str.findall(r'\w{3,}').str.join(' ')
df_merged['job_skills'] = df_merged['job_skills'].str.replace('[^a-zA-Z, ]', '', regex=True)
df_merged['job_skills'] = df_merged['job_skills'].str.lower()

df_merged['job_skills'] = df_merged['job_skills']

In [12]:
df_merged

Unnamed: 0,search_position,job_level,job_skills
0,color maker,Mid senior,"medical equipment sales, key competitors, term..."
1,director nursing service,Mid senior,"nursing, bachelor of science in nursing, maste..."
2,stand,Mid senior,"restaurant operations management, inventory ma..."
3,real estate clerk,Mid senior,"real estate, customer service, sales, negotiat..."
4,nurse practitioner,Mid senior,"nursing, bsn, medical license, virtual rn, nur..."
...,...,...,...
1294263,nurse supervisor,Mid senior,"registered nurse, bls certification, nursing c..."
1294264,assistant construction superintendent,Mid senior,"construction management, project planning, est..."
1294265,chef,Mid senior,"culinary, chef director, menu writing, cycle o..."
1294266,occupational analyst,Mid senior,"registered nurse, analyst, rn registered nurse..."


In [13]:
# Exclude search_positions with less than 500 rows
# This is to exclude job titles and skills with small amount of data
df_group = df_merged.groupby("search_position").filter(lambda g: len(g) > 500)
df_group

Unnamed: 0,search_position,job_level,job_skills
0,color maker,Mid senior,"medical equipment sales, key competitors, term..."
1,director nursing service,Mid senior,"nursing, bachelor of science in nursing, maste..."
2,stand,Mid senior,"restaurant operations management, inventory ma..."
3,real estate clerk,Mid senior,"real estate, customer service, sales, negotiat..."
4,nurse practitioner,Mid senior,"nursing, bsn, medical license, virtual rn, nur..."
...,...,...,...
1294262,director athletic,Mid senior,"account management, commercial insurance, micr..."
1294263,nurse supervisor,Mid senior,"registered nurse, bls certification, nursing c..."
1294264,assistant construction superintendent,Mid senior,"construction management, project planning, est..."
1294265,chef,Mid senior,"culinary, chef director, menu writing, cycle o..."


In [14]:
df_group['job_skills_s'] = df_group['job_skills'].str.split(',')
df_group

Unnamed: 0,search_position,job_level,job_skills,job_skills_s
0,color maker,Mid senior,"medical equipment sales, key competitors, term...","[medical equipment sales, key competitors, t..."
1,director nursing service,Mid senior,"nursing, bachelor of science in nursing, maste...","[nursing, bachelor of science in nursing, ma..."
2,stand,Mid senior,"restaurant operations management, inventory ma...","[restaurant operations management, inventory ..."
3,real estate clerk,Mid senior,"real estate, customer service, sales, negotiat...","[real estate, customer service, sales, nego..."
4,nurse practitioner,Mid senior,"nursing, bsn, medical license, virtual rn, nur...","[nursing, bsn, medical license, virtual rn,..."
...,...,...,...,...
1294262,director athletic,Mid senior,"account management, commercial insurance, micr...","[account management, commercial insurance, m..."
1294263,nurse supervisor,Mid senior,"registered nurse, bls certification, nursing c...","[registered nurse, bls certification, nursin..."
1294264,assistant construction superintendent,Mid senior,"construction management, project planning, est...","[construction management, project planning, ..."
1294265,chef,Mid senior,"culinary, chef director, menu writing, cycle o...","[culinary, chef director, menu writing, cyc..."


In [15]:
# Function to combine name and skills into a new list
def combine_name_skills(name, skills):
    return [name] + skills  # or skills + [name] to add at the end

# Apply function to DataFrame
df_group['Combined'] = df_group.apply(lambda row: combine_name_skills(row['search_position'], row['job_skills_s']), axis=1)
len(df_group)

1141724

In [16]:
# def filter_common_skills(skill_lists, min_frequency=2):
#     from collections import Counter
#     # Flatten the list of skills and count occurrences
#     skill_counter = Counter(skill for skills in skill_lists for skill in skills)
#     # Filter skills that appear at least 'min_frequency' times
#     common_skills = [skill for skill, count in skill_counter.items() if count >= min_frequency]
#     return common_skills

# # Aggregate and filter skills by job title
# df_aggregated = df.groupby('Job Title')['Skills'].agg(filter_common_skills).reset_index()
# df_aggregated

In [17]:
# df_aggregated = df_group.groupby('search_position')['Combined'].agg(lambda x: list(set(sum(x, [])))).reset_index()
# # df_aggregated = df_group

df_aggregated = df_group.groupby('search_position')['Combined'].agg(lambda x: list(sum(x, []))).reset_index()


In [18]:
df_aggregated

Unnamed: 0,search_position,Combined
0,abstractor,"[abstractor, public administration, business,..."
1,account executive,"[account executive, account management, relat..."
2,accountant,"[accountant, general accounting, erp, micros..."
3,accountant cost,"[accountant cost, sap configuration, product ..."
4,accountant systems,"[accountant systems, construction accounting, ..."
5,accountant tax,"[accountant tax, tax compliance, wealth trans..."
6,accounting clerk,"[accounting clerk, senior accountant, ifrs, ..."
7,acrobat,"[acrobat, proposal writing, proposal automati..."
8,acupuncturist,"[acupuncturist, physical therapy, rehabilitat..."
9,administrative assistant,"[administrative assistant, emr, epic, billin..."


In [19]:
# Remove job skills that are empty
df_filtered = df_aggregated[df_aggregated['Combined'].apply(lambda x: '' not in x)]
df_filtered

Unnamed: 0,search_position,Combined
0,abstractor,"[abstractor, public administration, business,..."
1,account executive,"[account executive, account management, relat..."
2,accountant,"[accountant, general accounting, erp, micros..."
3,accountant cost,"[accountant cost, sap configuration, product ..."
4,accountant systems,"[accountant systems, construction accounting, ..."
5,accountant tax,"[accountant tax, tax compliance, wealth trans..."
6,accounting clerk,"[accounting clerk, senior accountant, ifrs, ..."
7,acrobat,"[acrobat, proposal writing, proposal automati..."
8,acupuncturist,"[acupuncturist, physical therapy, rehabilitat..."
9,administrative assistant,"[administrative assistant, emr, epic, billin..."


In [20]:
df_filtered = df_filtered.reset_index()

In [21]:
# df_ss = df_group.groupby('search_position').agg({'Combined': 'sum'})
# df_ss

In [22]:
# # !conda install gensim
from gensim.models import Word2Vec

# train word2vec model
model = Word2Vec(df_filtered['Combined'], vector_size=100, window=5, min_count=1, workers=4)


In [23]:
import numpy as np

def document_vector(word_list):
    # remove out-of-vocabulary words
    doc = [word for word in word_list if word in model.wv.key_to_index]
    if len(doc) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

# Apply the function to each row
df_filtered['doc_vector'] = df_filtered['Combined'].apply(document_vector)

In [34]:
df_filtered

Unnamed: 0,index,search_position,Combined,doc_vector
0,0,abstractor,"[abstractor, public administration, business,...","[0.38421735, -0.8510864, 1.019886, 0.71054566,..."
1,1,account executive,"[account executive, account management, relat...","[0.2180204, -0.5365167, 0.69693565, 1.118836, ..."
2,2,accountant,"[accountant, general accounting, erp, micros...","[0.77485955, -1.0775172, 0.76536256, 0.8911524..."
3,3,accountant cost,"[accountant cost, sap configuration, product ...","[0.5332589, -0.9293834, 0.9678064, 1.125499, -..."
4,4,accountant systems,"[accountant systems, construction accounting, ...","[0.73961276, -0.9320195, 0.6081383, 0.88356626..."
5,5,accountant tax,"[accountant tax, tax compliance, wealth trans...","[0.6395722, -0.7959441, 0.62435246, 1.0259151,..."
6,6,accounting clerk,"[accounting clerk, senior accountant, ifrs, ...","[0.7934206, -1.0653226, 0.81281275, 0.7049812,..."
7,7,acrobat,"[acrobat, proposal writing, proposal automati...","[0.43811354, -0.710996, 0.4984055, 0.6619288, ..."
8,8,acupuncturist,"[acupuncturist, physical therapy, rehabilitat...","[-0.02361055, 0.02540674, 1.958216, 1.0367135,..."
9,9,administrative assistant,"[administrative assistant, emr, epic, billin...","[0.23388353, -0.580123, 1.0671891, 0.8683425, ..."


In [40]:
df_filtered.to_csv('job_positions.csv', columns=['search_position'],index=True)
# Convert DataFrame column of lists to a 2D array for cosine similarity
job_vectors = np.array(list(df_filtered['doc_vector']))
np.save("job_vectors", job_vectors)

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# Example new job skills
new_job_skills = ['program', 'develop', 'software', 'java', 'c++']
new_job_vector = document_vector(new_job_skills)

# Convert DataFrame column of lists to a 2D array for cosine similarity
job_vectors = np.array(list(df_filtered['doc_vector']))

# Compute cosine similarity
similarities = cosine_similarity([new_job_vector], job_vectors)

# Find the index of the highest similarity score
most_similar_job_index = similarities.argmax()
most_similar_job = df_filtered.iloc[most_similar_job_index]['search_position']
print("The most similar job is:", most_similar_job)

The most similar job is: programmer analyst


In [41]:
from sklearn.metrics.pairwise import cosine_similarity

# Example new job skills
new_job_skills = ['program', 'develop', 'software', 'java', 'c++']
new_job_vector = document_vector(new_job_skills)

# Compute cosine similarity
similarities = cosine_similarity([new_job_vector], job_vectors).flatten()

# Get the top 5 similar jobs indices and scores
top_indices = similarities.argsort()[-5:][::-1]
top_indices
# top_similar_jobs = df_filtered.iloc[top_indices]
# top_confidence_scores = similarities[top_indices]

# # Display the results
# results = pd.DataFrame({
#     'Job': top_similar_jobs['search_position'],
#     'Confidence Score': top_confidence_scores
# })
# print(results)

array([344,  97,  78, 393, 440])

In [26]:
# Example new job skills
new_job_skills = ['culinary', 'chef director', 'menu writing', 'cook', 'customer service'  ]
new_job_vector = document_vector(new_job_skills)

# Compute cosine similarity
similarities = cosine_similarity([new_job_vector], job_vectors).flatten()

# Get the top 5 similar jobs indices and scores
top_indices = similarities.argsort()[-5:][::-1]
top_similar_jobs = df_filtered.iloc[top_indices]
top_confidence_scores = similarities[top_indices]

# Display the results
results = pd.DataFrame({
    'Job': top_similar_jobs['search_position'],
    'Confidence Score': top_confidence_scores
})
print(results)

                                          Job  Confidence Score
42                                  bartender          0.859576
74                              check cashier          0.857154
462                           waiter waitress          0.855803
382  sales representative general merchandise          0.855766
116                        counter supervisor          0.855170


In [27]:
# Example new job skills
new_job_skills = ['music', 'teamwork', 'senior', 'manager' ]
new_job_vector = document_vector(new_job_skills)

# Compute cosine similarity
similarities = cosine_similarity([new_job_vector], job_vectors).flatten()

# Get the top 5 similar jobs indices and scores
top_indices = similarities.argsort()[-5:][::-1]
top_similar_jobs = df_filtered.iloc[top_indices]
top_confidence_scores = similarities[top_indices]

# Display the results
results = pd.DataFrame({
    'Job': top_similar_jobs['search_position'],
    'Confidence Score': top_confidence_scores
})
print(results)

                  Job  Confidence Score
75               chef          0.920394
212      hair stylist          0.917652
186             extra          0.917547
317  personal shopper          0.916931
109              cook          0.914971


In [28]:
# Example new job skills
new_job_skills = ['health', 'medical', 'registered', 'care', "nursing"]
new_job_vector = document_vector(new_job_skills)

# Compute cosine similarity
similarities = cosine_similarity([new_job_vector], job_vectors).flatten()

# Get the top 5 similar jobs indices and scores
top_indices = similarities.argsort()[-10:][::-1]
top_similar_jobs = df_filtered.iloc[top_indices]
top_confidence_scores = similarities[top_indices]

# Display the results
results = pd.DataFrame({
    'Job': top_similar_jobs['search_position'],
    'Confidence Score': top_confidence_scores
})
print(results)

                                Job  Confidence Score
298                   nurse midwife          0.835621
303                    nurse school          0.833545
306  nurse supervisor evening night          0.828687
80          christian science nurse          0.827962
305                nurse supervisor          0.827339
301              nurse practitioner          0.826397
292               nurse anesthetist          0.824979
302              nurse private duty          0.824506
160            dye house supervisor          0.823942
297        nurse licensed practical          0.822614


In [29]:
# Example new job skills
new_job_skills = ['data', 'deep learning', 'cnn', 'machine learning', "analytics"]
new_job_vector = document_vector(new_job_skills)

# Compute cosine similarity
similarities = cosine_similarity([new_job_vector], job_vectors).flatten()

# Get the top 5 similar jobs indices and scores
top_indices = similarities.argsort()[-10:][::-1]
top_similar_jobs = df_filtered.iloc[top_indices]
top_confidence_scores = similarities[top_indices]

# Display the results
results = pd.DataFrame({
    'Job': top_similar_jobs['search_position'],
    'Confidence Score': top_confidence_scores
})
print(results)

# There are no "ML" related jobs on search_position, so likely can't find anything

                                   Job  Confidence Score
78           chief computer programmer          0.937310
344                 programmer analyst          0.935168
393                  software engineer          0.933106
97                 computer programmer          0.931406
440              test fixture designer          0.930503
169       electrical research engineer          0.926918
129            data base administrator          0.926612
12      agricultural research engineer          0.920915
99   computer systems hardware analyst          0.918287
249           maintenance data analyst          0.914225


In [32]:
model.save("word2vec_jobs.model")
print(model)

Word2Vec<vocab=2457697, vector_size=100, alpha=0.025>
