In [1]:
import pandas as pd

In [2]:
# Load the preprocessed job postings into a DataFrame
df_combined = pd.read_csv(filepath_or_buffer="../../Datasets/processed_data.csv")

In [14]:
# Dimensions of the loaded dataset as (rows, columns)
df_combined.shape

(6243, 12)

In [5]:
# Count the missing values in each column
df_combined.isna().sum()

url                      0
job_title                0
description_html         0
description              0
job_type               459
company                  0
location                 0
description_tokens       0
full_info_tokens         0
duplicated               0
min_pay               3580
max_pay               3580
dtype: int64

In [6]:
# Overview of the dataset: display its first rows
df_combined.head()

Unnamed: 0,url,job_title,description_html,description,job_type,company,location,description_tokens,full_info_tokens,duplicated,min_pay,max_pay
0,https://www.mycareersfuture.gov.sg/job/custome...,PRODUCTION CONTROL MANAGER,<p><strong>JOB DESCIPTION</strong></p>\n<ul>\n...,JOB DESCIPTION\n\n planning and organising pr...,"Permanent, Full Time",Snl Logistics Pte Ltd,31 GUL CIRCLE 629569,"['job', 'descipt', 'plan', 'organis', 'product...","['product', 'control', 'manag', 'snl', 'logist...",False,2000.0,3400.0
1,https://www.mycareersfuture.gov.sg/job/enginee...,Design Engineer ( Mechanical / Electrical),<p><strong>SUMMARY</strong></p>\n<ul>\n <li>T...,SUMMARY\n\n This position is responsible...,Full Time,Jamco Aero Design &Amp; Engineering Private Li...,Singapore,"['summari', 'posit', 'respons', 'support', 'pr...","['design', 'engin', 'mechan', 'electr', 'jamco...",False,2500.0,4500.0
2,https://www.mycareersfuture.gov.sg/job/sales/b...,Business Development Executive,<p><strong>Job description</strong></p>\n<p>Wh...,Job description\nWho we are:\nWe are a logisti...,"Part Time, Permanent",Airpak Express Pte Ltd,"TECHPLAS INDUSTRIAL BUILDING, 45 CHANGI SOUTH ...","['job', 'descript', 'logist', 'servic', 'provi...","['busi', 'develop', 'execut', 'airpak', 'expre...",False,3200.0,3500.0
3,https://www.mycareersfuture.gov.sg/job/banking...,Senior / Data Scientist,<p>The ideal candidate should have a good unde...,The ideal candidate should have a good underst...,"Permanent, Full Time",Singapore Exchange Limited,"SGX CENTRE I, 2 SHENTON WAY 068804","['ideal', 'candid', 'good', 'understand', 'bus...","['senior', 'data', 'scientist', 'singapor', 'e...",False,9000.0,14000.0
4,https://www.mycareersfuture.gov.sg/job/archite...,8890-Sales Consultant [ Digital Software| Saas...,<p><strong>Sales Consultant (Digital Software)...,Sales Consultant (Digital Software)\nLocation:...,"Permanent, Full Time",The Supreme Hr Advisory Pte. Ltd.,"SHENTON HOUSE, 3 SHENTON WAY 068805","['sale', 'consult', 'digit', 'softwar', 'locat...","['8890', 'sale', 'consult', 'digit', 'softwar'...",False,3000.0,4500.0


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2) makes the vectorizer extract both single terms (unigrams)
# and two-word phrases (bigrams), so short phrases are scored as features too.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [8]:
# Fit the vectorizer on the full_info_tokens column and keep the resulting
# TF-IDF matrix (one row per entry of df_combined) for similarity lookups.
# NOTE(review): the column is read from CSV, so each entry is presumably the
# string form of a token list and is re-tokenized by the vectorizer. With
# scikit-learn's default token_pattern only tokens of 2+ characters are kept,
# so single-letter terms (e.g. 'r') would be dropped -- confirm this is intended.
tfidf_job = tfidf_vectorizer.fit_transform(df_combined['full_info_tokens'])

In [9]:
# Sanity check that fitting worked: inspect the learned term -> index mapping.
# NOTE(review): the mapping is very large (indices above 360,000 appear in the
# output); consider showing only its size or a small sample instead.
tfidf_vectorizer.vocabulary_

{'product': 258349,
 'control': 76638,
 'manag': 201004,
 'snl': 310443,
 'logist': 196933,
 'pte': 265929,
 'ltd': 198194,
 'perman': 242372,
 'full': 142112,
 'time': 340464,
 '31': 4082,
 'gul': 152404,
 'circl': 58601,
 '629569': 5636,
 'job': 180426,
 'descipt': 94528,
 'plan': 245340,
 'organis': 232367,
 'schedul': 295203,
 'assess': 31688,
 'project': 262031,
 'resourc': 284227,
 'requir': 281714,
 'estim': 120747,
 'negoti': 220803,
 'agre': 16727,
 'budget': 46787,
 'timelin': 341362,
 'client': 60170,
 'ensur': 116287,
 'health': 154352,
 'safeti': 292573,
 'regul': 277083,
 'met': 209914,
 'determin': 96868,
 'qualiti': 268335,
 'standard': 318505,
 'overse': 234681,
 'process': 256971,
 'select': 299653,
 'order': 231470,
 'purchas': 266341,
 'materi': 205801,
 'repair': 280108,
 'routin': 291360,
 'mainten': 199769,
 'equip': 118913,
 'liais': 192379,
 'buyer': 49111,
 'supervis': 326214,
 'day': 88329,
 'work': 366818,
 'junior': 182258,
 'staff': 317435,
 'worker': 3682

In [10]:
#Functions imported from preprocessing data
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words = set(stopwords.words('english'))
def rm_stopwords(tokens):
    """Return the tokens that are not in ``stop_words`` and are non-empty.

    The relative order of the surviving tokens is preserved.
    """
    kept = []
    for token in tokens:
        if token not in stop_words and token:
            kept.append(token)
    return kept
ps = PorterStemmer()
def stem_words(tokens):
    """Return a list with ``ps.stem`` applied to every token, in order."""
    return list(map(ps.stem, tokens))

In [11]:
import re
#Function removes special characters, set every word to lower case, tokenize, remove stopwords, and stem words
def preprocess_input(input_str):
    """Turn a free-text string into a list of cleaned, stemmed tokens.

    Steps, in order:
      1. replace every run of characters other than A-Z, a-z, 0-9 with a space;
      2. lower-case the text and split it on whitespace;
      3. drop stopwords via ``rm_stopwords``;
      4. stem the remaining words via ``stem_words``.

    Returns:
        list: the processed tokens.
    """
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', input_str)
    words = alnum_only.lower().split()
    return list(stem_words(rm_stopwords(words)))


In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def generate_recommendations(input_list, top_n=10):
    """Recommend the job postings most similar to a list of keywords.

    The keywords are joined into one string, run through ``preprocess_input``,
    vectorized with the fitted ``tfidf_vectorizer`` and compared against every
    row of ``tfidf_job`` using cosine similarity.

    Args:
        input_list: iterable of keyword strings (e.g. skills).
        top_n: maximum number of postings to return (default 10). Values
            larger than the number of postings are clamped; values <= 0
            yield an empty frame.

    Returns:
        pandas.DataFrame: rows of ``df_combined`` ordered from the highest to
        the lowest similarity score.
    """
    # Join first, then preprocess; the preprocessor returns a list of tokens.
    query_tokens = preprocess_input(" ".join(input_list))
    # str(list) mirrors how the full_info_tokens column appears to be stored
    # (stringified token lists), so query and corpus are tokenized the same way.
    query_tfidf = tfidf_vectorizer.transform([str(query_tokens)])
    # Cosine similarity between the query and every job entry.
    scores = cosine_similarity(query_tfidf, tfidf_job)[0]

    # Clamp so that a corpus smaller than top_n does not make argpartition fail.
    k = min(top_n, scores.shape[0])
    if k <= 0:
        return df_combined.iloc[:0, :]

    # argpartition finds the k best entries but leaves them in arbitrary
    # order, so rank those k explicitly by descending score.
    top_idx = np.argpartition(scores, -k)[-k:]
    top_idx = top_idx[np.argsort(-scores[top_idx], kind="stable")]
    return df_combined.iloc[top_idx, :]

In [27]:
# Smoke test: request recommendations for a sample list of skill keywords
test_input=["python","R","sql","git","flask","docker"]
generate_recommendations(test_input)

Unnamed: 0,url,job_title,description_html,description,job_type,company,location,description_tokens,full_info_tokens,duplicated,min_pay,max_pay
4947,https://sg.jobsdb.com/job/Data-Engineer-63fbe1...,Data Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...","Key Skills: Python, SQL, Unix, AWS\nObjectives...",Permanent,UNISON CONSULTING PTE. LTD.,Changi,"['key', 'skill', 'python', 'sql', 'unix', 'aw'...","['data', 'engin', 'unison', 'consult', 'pte', ...",False,4000.0,5000.0
4728,https://sg.jobsdb.com/job/Software-Engineer-45...,Software Engineer (Backend/Python),"<div class=""-desktop-no-padding-top"" id=""job-d...","Salary\n$6,000 - $10,000 a month\nJob Type\nFu...",Full time,Astek Singapore Innovation Technology,Singapore,"['salari', '6', '000', '10', '000', 'month', '...","['softwar', 'engin', 'backend', 'python', 'ast...",False,6000.0,10000.0
5119,https://sg.jobsdb.com/job/Backend-Engineer-d00...,Backend Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description\nBe a part of the robotics rev...,Permanent,KABAM PTE. LTD.,Serangoon,"['job', 'descript', 'part', 'robot', 'revolut'...","['backend', 'engin', 'kabam', 'pte', 'ltd', 'p...",False,3500.0,4500.0
799,https://www.mycareersfuture.gov.sg/job/informa...,Data Engineer - Python / SQL,<p><strong>[Order Number: 2208-62898]</strong>...,[Order Number: 2208-62898]\n\nResponsibilities...,Full Time,Good Job Creations (Singapore) Pte. Ltd.,Singapore,"['order', 'number', '2208', '62898', 'respons'...","['data', 'engin', 'python', 'sql', 'good', 'jo...",False,5000.0,8000.0
2707,https://sg.jobsdb.com/job/Software-Engineer-35...,Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description\n\nEngage in feasibility analy...,Full time,Temp-Team Pte Ltd,Singapore,"['job', 'descript', 'engag', 'feasibl', 'analy...","['softwar', 'engin', 'temp', 'team', 'pte', 'l...",False,4500.0,6000.0
1726,https://sg.jobsdb.com/job/Sql-Data-Engineer-c0...,Python / sql data engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",\nPython/SQL Data Engineer Avensys is a repute...,,Avensys Consulting,Singapore,"['python', 'sql', 'data', 'engin', 'avensi', '...","['python', 'sql', 'data', 'engin', 'avensi', '...",False,,
1976,https://sg.jobsdb.com/job/Senior-Software-Engi...,Senior Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description\n\nResearch solutions for tech...,Full time,Elitez,Kallang,"['job', 'descript', 'research', 'solut', 'tech...","['senior', 'softwar', 'engin', 'elitez', 'full...",False,7000.0,11000.0
4227,https://sg.jobsdb.com/job/Developer-9af02f230f...,Full Stack Developer (ART-652),"<div class=""-desktop-no-padding-top"" id=""job-d...",\nDevelop and support GIC's Corporate Services...,Permanent,FPT Asia Pacific Pte Ltd,Singapore,"['develop', 'support', 'gic', 'corpor', 'servi...","['full', 'stack', 'develop', 'art', '652', 'fp...",False,6000.0,7500.0
5817,https://sg.jobsdb.com/job/Software-Engineer-c0...,Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",The Software Engineer position is based in Sin...,Full time,Kiteworks ¬Æ,Bedok,"['softwar', 'engin', 'posit', 'base', 'singapo...","['softwar', 'engin', 'kitework', 'full', 'time...",False,,
4866,https://sg.jobsdb.com/job/Research-Engineer-83...,Research Engineer - (Natural Language Processing),"<div class=""-desktop-no-padding-top"" id=""job-d...",\nJob Responsibilities\nDevelopment and evalua...,Full time,Nanyang Technological University,Singapore,"['job', 'respons', 'develop', 'evalu', 'natur'...","['research', 'engin', 'natur', 'languag', 'pro...",False,,


In [27]:
#save tf-idf matrix and vectorizer
import pickle
# Use context managers so each file handle is flushed and closed as soon as
# the dump finishes, instead of being left open until garbage collection.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('tfidf_job.pkl', 'wb') as matrix_file:
    pickle.dump(tfidf_job, matrix_file)