# Analysis of Job Descriptions

In [40]:
# import data and libraries
import pandas as pd

# read in data
df = pd.read_csv('../2023-04-14-job-search/Clean_Data/combined_data_final.csv')

df.head()

Unnamed: 0,title,company_name,location,via,description,detected_extensions.schedule_type,detected_extensions.work_from_home,detected_extensions.posted_at,detected_extensions.salary,search_parameters.q,Qualifications,Responsibilities,Benefits
0,Ethereum Blockchain Developer (Remote),Ex Populus,Anywhere,via Built In,Company Overview:\nEx Populus is a cutting-edg...,Full-time,True,,,block chain,"2-3 years of Software Development experience,1...","Design, maintain and deploy smart contracts fo...",
1,Blockchain Engineer,21.co,"New York, NY",via Greenhouse,We are seeking a highly motivated and skilled ...,Full-time,,,,block chain,Bachelor's or Master's degree in Computer Scie...,"As a Blockchain Engineer, you will be responsi...",(NYC only) Pursuant to Section 8-102 of title ...
2,Blockchain Course Instructor,Blockchain Institute of Technology,Anywhere,via LinkedIn,"Are you a blockchain, cryptocurrency, NFT, Met...",Contractor,True,24 hours ago,,block chain,"3+ years of experience in blockchain, cryptocu...",Our expert technical team will provide the sup...,
3,Python based - Blockchain developer to join ex...,Upwork,Anywhere,via Upwork,Need someone to join our existing team to spee...,Contractor,True,2 days ago,10–30 an hour,block chain,"Candidates must be willing to sign, non-disclo...",Will discuss details with the selected candidates,
4,Blockchain DevOps Engineer (Remote),Telnyx,United States,via Startup Jobs,"About Telnyx\n\nAt Telnyx, we’re architecting ...",Full-time,,4 days ago,,block chain,You are a highly motivated and experienced Blo...,To build a best-in-class Filecoin (FIL) Mining...,


In [2]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [33]:
# isna check
df.isna().sum()

title                                   0
company_name                            0
location                                0
via                                     0
description                             0
detected_extensions.schedule_type       1
detected_extensions.work_from_home    496
detected_extensions.posted_at         212
detected_extensions.salary            525
search_parameters.q                     0
Qualifications                          0
Responsibilities                       85
Benefits                              310
dtype: int64

## Use LSH to predict the suitable job for a given keyword

In [1]:
# load packages
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [16]:
# preprocessing data
def preprocess(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'\s\s+',' ',text)
    text = text.strip()
    tokens = text.lower()
    tokens = tokens.split()
    return tokens

In [4]:
# choose parameters

#Number of Permutations
permutations = 128

#Number of Recommendations to return
num_recommendations = 1

In [5]:
# Create MinHash objects
def get_forest(data, perms):
    start_time = time.time()
    
    minhash = []
    
    for text in data['text']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
        
    forest = MinHashLSHForest(num_perm=perms)
    
    for i,m in enumerate(minhash):
        forest.add(i,m)
        
    forest.index()
    
    print('It took %s seconds to build forest.' %(time.time()-start_time))
    
    return forest

In [11]:
# evaluate query
def predict(text, database, perms, num_results, forest):
    start_time = time.time()
    
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))
        
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None # if your query is empty, return none
    
    result = database.iloc[idx_array]
    # select columns in the result
    result = result[['title', 'company_name', 'location', 'via', 'description', 
                     'detected_extensions.schedule_type', 'detected_extensions.work_from_home',
                     'detected_extensions.posted_at', 'detected_extensions.salary',
                     'Qualifications', 'Responsibilities', 'Benefits'
                     ]]
    result.columns = ['Job Title', 'Company Name', 'Location', 'Platform', 'Description',
                      'Schedule Type', 'Work from Home', 'Posted at', 'Salary',
                      'Qualifications', 'Responsibilities', 'Benefits']
    
    print('It took %s seconds to query forest.' %(time.time()-start_time))
    
    return result

### Test Recommendation Engine

In [30]:
# replace nan with empty string
df['Responsibilities'] = df['Responsibilities'].fillna('')
df['Benefits'] = df['Benefits'].fillna('')
df['detected_extensions.schedule_type'] = df['detected_extensions.schedule_type'].fillna('')
df['detected_extensions.work_from_home'] = df['detected_extensions.work_from_home'].fillna('')
df['detected_extensions.posted_at'] = df['detected_extensions.posted_at'].fillna('')
df['detected_extensions.salary'] = df['detected_extensions.salary'].fillna('')

In [31]:
db = df
db['text'] = df['title'] + ' ' + df['company_name'] + ' ' + df['location'] + ' ' + df['description'] + ' ' + df['Qualifications'] + ' ' + df['Responsibilities'] + ' ' + df['Benefits']
forest = get_forest(db, permutations)

It took 7.768658876419067 seconds to build forest.


In [37]:
num_recommendations = 5
title = 'Data Analyst in DC'
result = predict(title, db, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.015410184860229492 seconds to query forest.

 Top Recommendation(s) is(are) 
                                              Job Title Company Name  \
365                 Research Intern - Machine Learning    Microsoft   
82   Online R, NLP, Natural Language Processing tea...    TeacherOn   
178  Expert on Graph Neural Networks applied to Soc...       Upwork   
374  Need an expert to consult on GNNs (graph neura...       Upwork   
317  Reinforcement Learning Developer for Stock Tra...       Upwork   

                   Location               Platform  \
365          Redmond, WA     via Microsoft Careers   
82     Silver Spring, MD                via Jooble   
178               Anywhere              via Upwork   
374               Anywhere              via Upwork   
317               Anywhere              via Upwork   

                                           Description Schedule Type  \
365  Research Internships at Microsoft provide a dy...    Internship   
82              