In [1]:
import pandas as pd

In [2]:
#Read the processed job-postings dataset from CSV into a DataFrame
#(path is relative to the notebook's working directory)
df_combined = pd.read_csv("../../Datasets/processed_data.csv")

In [3]:
#Number of rows and columns in the loaded dataset
df_combined.shape

(6243, 14)

In [4]:
#Count of missing values in each column
df_combined.isna().sum()

url                      0
job_title                0
description_html         0
description              0
job_type               459
company                  0
location                 0
description_tokens       0
description_clean        0
full_info_tokens         0
full_info_clean          0
duplicated               0
min_pay               3580
max_pay               3580
dtype: int64

In [None]:
#Overview of dataset: preview the first few rows
df_combined.head()

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Create a TF-IDF vectorizer
#ngram_range=(1,2) keeps both single terms (unigrams) and two-term phrases (bigrams)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [7]:
#Fit the TF-IDF vectorizer on the full_info_tokens column and, in the same call,
#transform that column into a sparse matrix (kept as tfidf_job for similarity scoring later)
tfidf_job = tfidf_vectorizer.fit_transform(df_combined['full_info_tokens'])

In [None]:
#Check if tfidf_vectorizer worked correctly
#Show the vocabulary size and a small sample of terms instead of dumping the entire vocabulary
len(tfidf_vectorizer.vocabulary_), sorted(tfidf_vectorizer.vocabulary_)[:10]

In [9]:
#Functions imported from preprocessing the datasets
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words = set(stopwords.words('english'))
def rm_stopwords(tokens):
    """Return the tokens that are neither in ``stop_words`` nor empty/falsy, in order."""
    kept = []
    for token in tokens:
        #Skip stopwords first, then anything falsy (e.g. an empty string)
        if token in stop_words or not token:
            continue
        kept.append(token)
    return kept
ps = PorterStemmer()
def stem_words(tokens):
    """Return a list with the module-level stemmer ``ps`` applied to every token, in order."""
    return list(map(lambda word: ps.stem(word), tokens))

In [10]:
import re
#Function removes special characters, set every word to lower case, tokenize, remove stopwords, and stem words
def preprocess_input(input_str):
    """Turn a raw string into a list of cleaned tokens.

    Steps: replace every run of non-alphanumeric characters with a space,
    lower-case, split on whitespace, drop stopwords via ``rm_stopwords``,
    then stem via ``stem_words``.
    """
    #Keep only letters and digits; everything else becomes a single space
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', input_str)
    words = alnum_only.lower().split()
    return list(stem_words(rm_stopwords(words)))


In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
#function takes a list of input preferences and generates a dataframe of recommendations
def generate_recommendations(input_list, top_n=10):
    """Recommend the jobs most similar to a list of preference keywords.

    Parameters
    ----------
    input_list : list of str
        Preference keywords; they are joined with spaces and run through
        ``preprocess_input`` before being vectorized.
    top_n : int, default 10
        Maximum number of recommendations to return. It is clamped to the
        number of scored jobs; a value <= 0 yields an empty result.

    Returns
    -------
    DataFrame
        Rows of ``df_combined`` for the best matches, ordered from the highest
        to the lowest cosine similarity score.
    """
    #Convert to a string first
    input_text = " ".join(input_list)
    #Then preprocess, the preprocessor would return a list
    tokens = preprocess_input(input_text)
    #Transform the input using tfidf_vectorizer; str(list) is kept as in the original,
    #matching the list-like strings shown in the full_info_tokens column
    query_tf_idf = tfidf_vectorizer.transform([str(tokens)])
    #compute cosine similarity scores between input and every job entry
    scores = cosine_similarity(query_tf_idf, tfidf_job)[0]
    #Never ask for more rows than were scored (argpartition raises otherwise)
    k = min(top_n, scores.shape[0])
    if k <= 0:
        return df_combined.iloc[[], :]
    #argpartition only guarantees the k largest scores land in the last k slots,
    #in no particular order, so rank those k candidates explicitly
    top_idx = np.argpartition(scores, -k)[-k:]
    ranked_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
    return df_combined.iloc[ranked_idx, :]

In [12]:
#Test input: a sample list of skill keywords to sanity-check the recommender
#NOTE(review): "R" becomes the single character "r" after preprocessing; TfidfVectorizer's default
#token pattern presumably ignores one-character tokens, so it may not affect the scores - confirm
test_input=["python","R","sql","git","flask","docker"]
generate_recommendations(test_input)

Unnamed: 0,url,job_title,description_html,description,job_type,company,location,description_tokens,description_clean,full_info_tokens,full_info_clean,duplicated,min_pay,max_pay
4947,https://sg.jobsdb.com/job/Data-Engineer-63fbe1...,Data Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...","Key Skills: Python, SQL, Unix, AWS\nObjectives...",Permanent,UNISON CONSULTING PTE. LTD.,Changi,"['key', 'skill', 'python', 'sql', 'unix', 'aw'...",key skills python sql unix aws objectives team...,"['data', 'engin', 'unison', 'consult', 'pte', ...",data engineer unison consulting pte ltd perman...,False,4000.0,5000.0
4728,https://sg.jobsdb.com/job/Software-Engineer-45...,Software Engineer (Backend/Python),"<div class=""-desktop-no-padding-top"" id=""job-d...","Salary\n$6,000 - $10,000 a month\nJob Type\nFu...",Full time,Astek Singapore Innovation Technology,Singapore,"['salari', '6', '000', '10', '000', 'month', '...",salary 6 000 10 000 month job type full time f...,"['softwar', 'engin', 'backend', 'python', 'ast...",software engineer backend python astek singapo...,False,6000.0,10000.0
5119,https://sg.jobsdb.com/job/Backend-Engineer-d00...,Backend Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description\nBe a part of the robotics rev...,Permanent,KABAM PTE. LTD.,Serangoon,"['job', 'descript', 'part', 'robot', 'revolut'...",job description part robotics revolution dutie...,"['backend', 'engin', 'kabam', 'pte', 'ltd', 'p...",backend engineer kabam pte ltd permanent seran...,False,3500.0,4500.0
799,https://www.mycareersfuture.gov.sg/job/informa...,Data Engineer - Python / SQL,<p><strong>[Order Number: 2208-62898]</strong>...,[Order Number: 2208-62898]\n\nResponsibilities...,Full Time,Good Job Creations (Singapore) Pte. Ltd.,Singapore,"['order', 'number', '2208', '62898', 'respons'...",order number 2208 62898 responsibilities suppo...,"['data', 'engin', 'python', 'sql', 'good', 'jo...",data engineer python sql good job creations si...,False,5000.0,8000.0
2707,https://sg.jobsdb.com/job/Software-Engineer-35...,Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description\n\nEngage in feasibility analy...,Full time,Temp-Team Pte Ltd,Singapore,"['job', 'descript', 'engag', 'feasibl', 'analy...",job description engage feasibility analysis ne...,"['softwar', 'engin', 'temp', 'team', 'pte', 'l...",software engineer temp team pte ltd full time ...,False,4500.0,6000.0
1726,https://sg.jobsdb.com/job/Sql-Data-Engineer-c0...,Python / sql data engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",\nPython/SQL Data Engineer Avensys is a repute...,,Avensys Consulting,Singapore,"['python', 'sql', 'data', 'engin', 'avensi', '...",python sql data engineer avensys reputed globa...,"['python', 'sql', 'data', 'engin', 'avensi', '...",python sql data engineer avensys consulting si...,False,,
1976,https://sg.jobsdb.com/job/Senior-Software-Engi...,Senior Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description\n\nResearch solutions for tech...,Full time,Elitez,Kallang,"['job', 'descript', 'research', 'solut', 'tech...",job description research solutions technically...,"['senior', 'softwar', 'engin', 'elitez', 'full...",senior software engineer elitez full time kall...,False,7000.0,11000.0
4227,https://sg.jobsdb.com/job/Developer-9af02f230f...,Full Stack Developer (ART-652),"<div class=""-desktop-no-padding-top"" id=""job-d...",\nDevelop and support GIC's Corporate Services...,Permanent,FPT Asia Pacific Pte Ltd,Singapore,"['develop', 'support', 'gic', 'corpor', 'servi...",develop support gic corporate services systems...,"['full', 'stack', 'develop', 'art', '652', 'fp...",full stack developer art 652 fpt asia pacific ...,False,6000.0,7500.0
5817,https://sg.jobsdb.com/job/Software-Engineer-c0...,Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",The Software Engineer position is based in Sin...,Full time,Kiteworks ¬Æ,Bedok,"['softwar', 'engin', 'posit', 'base', 'singapo...",software engineer position based singapore loo...,"['softwar', 'engin', 'kitework', 'full', 'time...",software engineer kiteworks full time bedok so...,False,,
4866,https://sg.jobsdb.com/job/Research-Engineer-83...,Research Engineer - (Natural Language Processing),"<div class=""-desktop-no-padding-top"" id=""job-d...",\nJob Responsibilities\nDevelopment and evalua...,Full time,Nanyang Technological University,Singapore,"['job', 'respons', 'develop', 'evalu', 'natur'...",job responsibilities development evaluation na...,"['research', 'engin', 'natur', 'languag', 'pro...",research engineer natural language processing ...,False,,


In [13]:
#save tf-idf matrix and vectorizer
import pickle
#Use context managers so each file is flushed and closed as soon as it has been written
with open('tfidf_vectorizer.pkl','wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('tfidf_job.pkl','wb') as matrix_file:
    pickle.dump(tfidf_job, matrix_file)