In [1]:
import pandas as pd

In [2]:
#Read Dataset
df_combined = pd.read_csv("../../Datasets/processed_data.csv")

In [5]:
df_combined.shape

(6243, 11)

In [6]:
#Overview of dataset
df_combined.head()

Unnamed: 0,url,job_title,description_html,description,pay_range,job_type,company,location,description_tokens,full_info_tokens,duplicated
0,https://www.mycareersfuture.gov.sg/job/custome...,PRODUCTION CONTROL MANAGER,<p><strong>JOB DESCIPTION</strong></p>\n<ul>\n...,JOB DESCIPTION\n\n planning and organising pr...,"$2,000 - $3,400","Permanent, Full Time",Snl Logistics Pte Ltd,31 GUL CIRCLE 629569,"['job', 'descipt', 'plan', 'organis', 'product...","['product', 'control', 'manag', 'job', 'descip...",False
1,https://www.mycareersfuture.gov.sg/job/enginee...,Design Engineer ( Mechanical / Electrical),<p><strong>SUMMARY</strong></p>\n<ul>\n <li>T...,SUMMARY\n\n This position is responsible...,"$2,500 - $4,500",Full Time,Jamco Aero Design &Amp; Engineering Private Li...,,"['summari', 'posit', 'respons', 'support', 'pr...","['design', 'engin', 'mechan', 'electr', 'summa...",False
2,https://www.mycareersfuture.gov.sg/job/sales/b...,Business Development Executive,<p><strong>Job description</strong></p>\n<p>Wh...,Job description\nWho we are:\nWe are a logisti...,"$3,200 - $3,500","Part Time, Permanent",Airpak Express Pte Ltd,"TECHPLAS INDUSTRIAL BUILDING, 45 CHANGI SOUTH ...","['job', 'descript', 'logist', 'servic', 'provi...","['busi', 'develop', 'execut', 'job', 'descript...",False
3,https://www.mycareersfuture.gov.sg/job/banking...,Senior / Data Scientist,<p>The ideal candidate should have a good unde...,The ideal candidate should have a good underst...,"$9,000 - $14,000","Permanent, Full Time",Singapore Exchange Limited,"SGX CENTRE I, 2 SHENTON WAY 068804","['ideal', 'candid', 'good', 'understand', 'bus...","['senior', 'data', 'scientist', 'ideal', 'cand...",False
4,https://www.mycareersfuture.gov.sg/job/archite...,8890-Sales Consultant [ Digital Software| Saas...,<p><strong>Sales Consultant (Digital Software)...,Sales Consultant (Digital Software)\nLocation:...,"$3,000 - $4,500","Permanent, Full Time",The Supreme Hr Advisory Pte. Ltd.,"SHENTON HOUSE, 3 SHENTON WAY 068805","['sale', 'consult', 'digit', 'softwar', 'locat...","['8890', 'sale', 'consult', 'digit', 'softwar'...",False


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Set ngram range to 2 to include phrases
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [8]:
#Fit full_info_tokens onto the tfidf vectorizer
tfidf_job = tfidf_vectorizer.fit_transform(df_combined['full_info_tokens'])

In [9]:
#Check if tfidf_vectorizer worked correctly
tfidf_vectorizer.vocabulary_

{'product': 256779,
 'control': 75442,
 'manag': 199474,
 'job': 179027,
 'descipt': 93362,
 'plan': 243525,
 'organis': 230745,
 'schedul': 293705,
 'assess': 30745,
 'project': 260458,
 'resourc': 282838,
 'requir': 280306,
 'estim': 119370,
 'negoti': 219145,
 'agre': 16032,
 'budget': 45753,
 'timelin': 339248,
 'client': 58992,
 'ensur': 114869,
 'health': 153032,
 'safeti': 291085,
 'regul': 275660,
 'met': 208243,
 'determin': 95717,
 'qualiti': 266785,
 'standard': 316626,
 'overse': 233061,
 'process': 255382,
 'select': 298107,
 'order': 229842,
 'purchas': 264780,
 'materi': 204138,
 'repair': 278694,
 'routin': 289869,
 'mainten': 198235,
 'equip': 117528,
 'liais': 190896,
 'buyer': 48058,
 'supervis': 324315,
 'day': 87139,
 'work': 364678,
 'junior': 180897,
 'staff': 315544,
 'worker': 366159,
 'factori': 129929,
 'environ': 116494,
 'relev': 277456,
 'train': 343004,
 'session': 301090,
 'goal': 146422,
 'within': 363761,
 'approv': 27425,
 'min': 210466,
 'diploma': 9

In [10]:
#Functions imported from preprocessing data
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words = set(stopwords.words('english'))
def rm_stopwords(tokens):
    return [i for i in tokens if i not in stop_words and i]
ps = PorterStemmer()
def stem_words(tokens):
    return [ps.stem(i) for i in tokens]

In [11]:
import re
#Function removes special characters, set every word to lower case, tokenize, remove stopwords, and stem words
def preprocess_input(input_str):
    input_str = re.sub('[^A-Za-z0-9]+', ' ', input_str)
    input_str = input_str.lower()
    input_str = input_str.split()
    input_str = rm_stopwords(input_str)
    input_str = stem_words(input_str)
    return list(input_str)


In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def generate_recommendations(input_list):
    #Convert to a string first
    input_text = " ".join(input_list)
    #Then preprocess, the preprocessor would return a list
    input_text = preprocess_input(input_text)
    #Transform the input using tfidf_vectorizer
    recommendation_tf_idf = tfidf_vectorizer.transform([str(input_text)])
    #compute cosine similarity scores between input and every job entry
    scores = cosine_similarity(recommendation_tf_idf,tfidf_job)[0]
    #Return the top 10 jobs with highest cosine similarity scores
    ind = np.argpartition(scores,-10)[-10:][::-1]
    return df_combined.iloc[ind,:]

In [26]:
#Test input
test_input=["Computer Science","Information Technology","software development","project management","design document","Software product specifications","architectural styles"]
generate_recommendations(test_input)

Unnamed: 0,url,job_title,description_html,description,pay_range,job_type,company,location,description_tokens,full_info_tokens,duplicated
4491,https://sg.jobsdb.com/job/Software-Architect-1...,7506 - Software Architect,"<div class=""-desktop-no-padding-top"" id=""job-d...",Location: Ubi\nWorking Days: 5 days work week\...,"$4,000 - $6,000",Permanent,The Supreme HR Advisory,Central Singapore,"['locat', 'ubi', 'work', 'day', '5', 'day', 'w...","['7506', 'softwar', 'architect', 'locat', 'ubi...",False
3340,https://sg.jobsdb.com/job/6723-software-Archit...,6723-Software Architect [Macpherson],"<div class=""-desktop-no-padding-top"" id=""job-d...","If you are interested to apply, kindly WhatsAp...","$5,000 - $6,000",Permanent,The Supreme HR Advisory,Singapore,"['interest', 'appli', 'kindli', 'whatsapp', 'u...","['6723', 'softwar', 'architect', 'macpherson',...",False
2112,https://sg.jobsdb.com/job/Software-Architect-6...,Software Architect @ Ubi - 5Days //-3288,"<div class=""-desktop-no-padding-top"" id=""job-d...",Software Architect\nLocation: Ubi\nWorking Day...,"$4,500 - $6,000",Permanent,The Supreme HR Advisory,Central Singapore,"['softwar', 'architect', 'locat', 'ubi', 'work...","['softwar', 'architect', 'ubi', '5day', '3288'...",False
2454,https://sg.jobsdb.com/job/Software-Architect-d...,Software Architect,"<div class=""-desktop-no-padding-top"" id=""job-d...",To create printer utility to support the use w...,,,SATO Global Business Services Pte. Ltd,Geylang,"['creat', 'printer', 'util', 'support', 'use',...","['softwar', 'architect', 'creat', 'printer', '...",False
5152,https://sg.jobsdb.com/job/Software-Architect-3...,2697 - Software Architect [Macpherson],"<div class=""-desktop-no-padding-top"" id=""job-d...",Responsibilities:\n\nCollaborate with various ...,"$5,000 - $6,000",Permanent,The Supreme HR Advisory,Singapore,"['respons', 'collabor', 'variou', 'stakehold',...","['2697', 'softwar', 'architect', 'macpherson',...",False
1216,https://sg.jobsdb.com/job/Software-Engineer-c3...,Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...","Azbil Singapore Pte Ltd, was, established sinc...","$3,300 - $6,600",Permanent,Azbil Corporation Singapore Branch,West Singapore,"['azbil', 'singapor', 'pte', 'ltd', 'establish...","['softwar', 'engin', 'azbil', 'singapor', 'pte...",False
1520,https://sg.jobsdb.com/job/Software-Engineer-43...,Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description\n\n\nExecute full software dev...,,,SATO Asia Pacific Pte Ltd,Geylang,"['job', 'descript', 'execut', 'full', 'softwar...","['softwar', 'engin', 'job', 'descript', 'execu...",False
726,https://www.mycareersfuture.gov.sg/job/informa...,Senior / Software Engineer,<p><strong>Responsibilities:</strong></p>\n<ul...,"Responsibilities:\n\n Develop, maintain, and ...","$4,000 - $6,500",Full Time,St Engineering Mission Software &Amp; Services...,"ST ENGINEERING HUB, 1 ANG MO KIO ELECTRONICS P...","['respons', 'develop', 'maintain', 'enhanc', '...","['senior', 'softwar', 'engin', 'respons', 'dev...",False
1523,https://sg.jobsdb.com/job/.net-Software-Develo...,.Net Software Developer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Responsibilities:\n\nDevelops and designs new ...,,Full time,Talent Trader Group,Singapore,"['respons', 'develop', 'design', 'new', 'softw...","['net', 'softwar', 'develop', 'respons', 'deve...",False
424,https://www.mycareersfuture.gov.sg/job/informa...,Software Developer || Singaporeans Only,<p><u><strong>Job Responsibilities</strong></u...,Job Responsibilities\n\n Be part of the softw...,"$2,800 - $4,300",Full Time,Apba Tg Human Resource Pte. Ltd.,,"['job', 'respons', 'part', 'softwar', 'team', ...","['softwar', 'develop', 'singaporean', 'job', '...",False


In [27]:
#save tf-idf matrix and vectorizer
import pickle
pickle.dump(tfidf_vectorizer,open('tfidf_vectorizer.pkl','wb'))
pickle.dump(tfidf_job,open('tfidf_job.pkl','wb'))