In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import gensim
from gensim import models
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import ast
import nltk
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from sklearn.metrics.pairwise import cosine_similarity

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


In [2]:
# Download pre-trained Word2Vec model that is previously trained with Google News
!wget --load-cookies /tmp/cookies.txt "https://drive.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM" -O GoogleNews-vectors-negative300.bin.gz && rm -rf /tmp/cookies.txt
!gunzip GoogleNews-vectors-negative300.bin.gz

--2022-10-19 05:28:15--  https://drive.google.com/uc?export=download&confirm=t&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
Resolving drive.google.com (drive.google.com)... 74.125.137.139, 74.125.137.113, 74.125.137.138, ...
Connecting to drive.google.com (drive.google.com)|74.125.137.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0g-8s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/tqrfbp5u8c08401utsso0532fp5cgq3p/1666157250000/06848720943842814915/*/0B7XkCwpI5KDYNlNUTTlSS21pQmM?e=download&uuid=e4bb71e3-0123-493a-8adc-68d27461c0cb [following]
--2022-10-19 05:28:15--  https://doc-0g-8s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/tqrfbp5u8c08401utsso0532fp5cgq3p/1666157250000/06848720943842814915/*/0B7XkCwpI5KDYNlNUTTlSS21pQmM?e=download&uuid=e4bb71e3-0123-493a-8adc-68d27461c0cb
Resolving doc-0g-8s-docs.googleusercontent.com (doc-0g-8s-docs.googleusercontent.com)... 142.250.141.132, 2607:f8b0

In [4]:
# Load the pretrained model
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
# Read in scraped job listings csv and reformat the full_info_tokens from string to list
df = pd.read_csv('/content/drive/My Drive/NUS/Y3S1/DSA3101/processed_data.csv')
df['full_info_tokens'] = df.full_info_tokens.apply(func = ast.literal_eval)

In [6]:
# Build a corpus for full description of the jobs using the full_info_token column
corpus = []
for tokens in df['full_info_tokens']:
  corpus.append(tokens)

In [8]:
# Create a Word2Vec model for this job context
# Feed in the model the tokens from the full_info_description column
# Obtain word embeddings that are present in the pre-trained model using the intersect function, and then further train this Word2Vec model
w2v_model = Word2Vec(size = 300, window=5, min_count = 2)
w2v_model.build_vocab(corpus)
w2v_model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)
w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs = 5)

(6898219, 7716420)

In [9]:
# Obtain word embeddings for all the job descriptions
jobs_w2v = []
for line in df['full_info_tokens']: # Iterating through every job description
  avgw2v = None
  count = 0
  for word in line:
    if word in w2v_model.wv.vocab: # Extracting word embddings from the model if present
      count += 1
      if avgw2v is None:
        avgw2v = w2v_model[word]
      else:
        avgw2v = avgw2v + w2v_model[word]
  if avgw2v is not None:
    avgw2v = avgw2v / count
    jobs_w2v.append(avgw2v) # Append this particular job description to the overall job description list (jobs_w2v)

jobs_w2v = np.asarray(jobs_w2v) # Convert this array to Numpy array

  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [10]:
# Create functions to preprocess input
def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text
  
ps = PorterStemmer()
def stem_words(text):
  text_tokens = text.split()
  arr = [ps.stem(i) for i in text_tokens]
  text = " ".join(arr)
  return text

In [11]:
# Create this function to process the users' input skill list
def preprocess_input(input):
  input = re.sub('[^A-Za-z0-9]+', ' ', input)
  input = input.lower()
  input = remove_stop_words(input)
  input = stem_words(input)
  input = input.split()
  return input # output the list of tokens

In [12]:
# Obtain the average word embeddings based on the users' input list
def get_word_embeddings(token_list):
  input_w2v = None
  count = 0
  for word in token_list:
    if word in w2v_model.wv.vocab:
      count += 1
      if input_w2v is None:
        input_w2v = w2v_model[word]
      else:
        input_w2v = input_w2v + w2v_model[word]
  if input_w2v is not None:
    input_avg_w2v = input_w2v / count
  return input_avg_w2v

In [13]:
# Output the top 10 recommendations based on the users' input list using Cosine Similarities
def get_recommendations(input_list):
  input = " ".join(input_list)
  input_tokens = preprocess_input(input)
  input_w2v = get_word_embeddings(input_tokens)
  input_w2v = input_w2v.reshape(1,-1)
  score = cosine_similarity(input_w2v, jobs_w2v)
  index = np.argpartition(score, -10)[0][-10:]
  return df.iloc[index, :]

In [14]:
# Test
input = ["python","R","sql","git","flask","docker"]
get_recommendations(input)

  
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,url,job_title,description_html,description,job_type,company,location,description_tokens,full_info_tokens,duplicated,min_pay,max_pay
3223,https://sg.jobsdb.com/job/Data-Engineer-0e8ed9...,Data Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Title : Data Engineer\r\nJob description (...,Contract,U3 Infotech Pte Ltd,Singapore,"['job', 'titl', 'data', 'engin', 'job', 'descr...","[data, engin, u3, infotech, pte, ltd, contract...",False,9000.0,14000.0
4765,https://sg.jobsdb.com/job/Data-Engineer-095d7d...,Data engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",Requirements\r\n\r\nSr Bigdata Engineer with a...,Full time,OX CONSULTANCY PTE. LTD.,Woodlands Industrial Park,"['requir', 'sr', 'bigdata', 'engin', 'atleast'...","[data, engin, ox, consult, pte, ltd, full, tim...",False,7000.0,10000.0
4088,https://sg.jobsdb.com/job/Specialist-ee53571df...,Specialist - Software Engineering,"<div class=""-desktop-no-padding-top"" id=""job-d...","6+ years of experience in Oracle, Dev ops\r\n•...",Full time,Larsen & Toubro,Changi,"['6', 'year', 'experi', 'oracl', 'dev', 'op', ...","[specialist, softwar, engin, larsen, toubro, f...",False,6000.0,9000.0
8,https://www.mycareersfuture.gov.sg/job/banking...,Software Developer (Full Stack),<p>Job Requirements:</p>\r\n<p><br></p>\r\n<p>...,Job Requirements:\r\n\r\nKnowledge of JavaScri...,"Contract, Full Time, Flexi-work",Tangspac Consulting Pte Ltd,"THE OCTAGON, 105 CECIL STREET 069534","['job', 'requir', 'knowledg', 'javascript', 'w...","[softwar, develop, full, stack, tangspac, cons...",False,3000.0,4500.0
194,https://www.mycareersfuture.gov.sg/job/enginee...,9156 - Web Application Developer (Java) [Softw...,<p><strong>Web Application Developer (Java)</s...,Web Application Developer (Java)\r\n\r\n$4000 ...,Full Time,The Supreme Hr Advisory Pte. Ltd.,"SHENTON HOUSE, 3 SHENTON WAY 068805","['web', 'applic', 'develop', 'java', '4000', '...","[9156, web, applic, develop, java, softwar, co...",False,4000.0,8000.0
934,https://www.mycareersfuture.gov.sg/job/banking...,2183 - Application Support Engineer,<p><u><strong>Qualifications and Profile</stro...,Qualifications and Profile\r\nThe candidate sh...,"Contract, Full Time",Aspire Nxt Pte. Ltd.,"SUNTEC TOWER TWO, 9 TEMASEK BOULEVARD 038989","['qualif', 'profil', 'candid', '3', 'year', 'e...","[2183, applic, support, engin, aspir, nxt, pte...",False,5500.0,7500.0
4721,https://sg.jobsdb.com/job/Data-Base-Administra...,Data Base Administrator,"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Description:\r\nExperienced Oracle/MySQL D...,Contract,Horizon Software,Singapore,"['job', 'descript', 'experienc', 'oracl', 'mys...","[data, base, administr, horizon, softwar, cont...",False,6500.0,10000.0
6214,https://sg.jobsdb.com/job/Software-Engineer-59...,Software Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...",1.1 Critical Requirements\r\n(a) Web Applicati...,Contract,TECHEMERGE SOLUTIONS PTE. LTD.,Singapore,"['1', '1', 'critic', 'requir', 'web', 'applic'...","[softwar, engin, techemerg, solut, pte, ltd, c...",False,5000.0,7000.0
4947,https://sg.jobsdb.com/job/Data-Engineer-63fbe1...,Data Engineer,"<div class=""-desktop-no-padding-top"" id=""job-d...","Key Skills: Python, SQL, Unix, AWS\r\nObjectiv...",Permanent,UNISON CONSULTING PTE. LTD.,Changi,"['key', 'skill', 'python', 'sql', 'unix', 'aw'...","[data, engin, unison, consult, pte, ltd, perma...",False,4000.0,5000.0
4968,https://sg.jobsdb.com/job/Software-Developer-c...,Software Developer (Full Stack),"<div class=""-desktop-no-padding-top"" id=""job-d...",Job Requirements:\r\nKnowledge of JavaScript a...,Contract,Tangspac Consulting Pte Ltd,Central Singapore,"['job', 'requir', 'knowledg', 'javascript', 'w...","[softwar, develop, full, stack, tangspac, cons...",False,4000.0,4600.0
