In [1]:
import pandas as pd

In [2]:
#Read the processed dataset from CSV into a DataFrame
#(the path is relative, so it is resolved against the kernel's working directory)
df_combined = pd.read_csv("../../Datasets/processed_data.csv")

In [3]:
#Dimensions of the loaded dataset as (rows, columns)
df_combined.shape

(6243, 14)

In [4]:
#Count the missing values in each column
df_combined.isna().sum()

url                      0
job_title                0
description_html         0
description              0
job_type               459
company                  0
location                 0
description_tokens       0
description_clean        0
full_info_tokens         0
full_info_clean          0
duplicated               0
min_pay               3580
max_pay               3580
dtype: int64

In [None]:
#Overview of dataset: preview the first rows (head() returns 5 rows by default)
df_combined.head()

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
#Create a TF-IDF vectorizer
#ngram_range=(1,2) makes the vocabulary include both single tokens (unigrams)
#and two-token phrases (bigrams)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [7]:
#Fit the vectorizer on the full_info_tokens column (learning its vocabulary and IDF weights)
#and transform that same column into a sparse TF-IDF matrix in one step
tfidf_job = tfidf_vectorizer.fit_transform(df_combined['full_info_tokens'])

In [None]:
#Check if tfidf_vectorizer worked correctly
#Show the vocabulary size and a small sample of (term, column index) pairs
#instead of dumping the whole mapping into the cell output
len(tfidf_vectorizer.vocabulary_), list(tfidf_vectorizer.vocabulary_.items())[:10]

In [9]:
#Functions imported from preprocessing the datasets
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
#English stopwords held in a set so membership checks are cheap
stop_words = set(stopwords.words('english'))


def rm_stopwords(tokens):
    """Return the tokens that are neither English stopwords nor empty, in order."""
    kept = []
    for token in tokens:
        #Skip stopwords first, then falsy (empty) tokens
        if token in stop_words or not token:
            continue
        kept.append(token)
    return kept
#Single Porter stemmer instance shared by every call to stem_words
ps = PorterStemmer()


def stem_words(tokens):
    """Return a new list containing the Porter stem of each token, in order."""
    return list(map(ps.stem, tokens))

In [10]:
import re
#Clean a raw string and turn it into a list of stemmed, stopword-free tokens
def preprocess_input(input_str):
    """Normalise a free-text string into a list of stemmed tokens.

    Steps, in order: replace every run of non-alphanumeric characters with a
    single space, lower-case the text, split on whitespace, drop stopwords
    (via rm_stopwords) and stem what remains (via stem_words).

    Parameters
    ----------
    input_str : str
        Raw text to clean.

    Returns
    -------
    list
        The processed tokens.
    """
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', input_str)
    words = alnum_only.lower().split()
    stems = stem_words(rm_stopwords(words))
    return list(stems)


In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
#Function takes a list of input preferences and returns a dataframe of ranked recommendations
def generate_recommendations(input_list, top_n=10):
    """Recommend the jobs whose text is most similar to the given preferences.

    The preferences are joined into one string, cleaned with preprocess_input,
    vectorised with the fitted tfidf_vectorizer and compared against every row
    of tfidf_job using cosine similarity.

    Parameters
    ----------
    input_list : list of str, or str
        Preference keywords, e.g. ["python", "sql"]. A bare string is treated
        as a single entry instead of being split into characters.
    top_n : int, default 10
        Maximum number of recommendations to return. Values larger than the
        number of jobs are clamped; values <= 0 give an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of df_combined, ordered from the highest to the
        lowest cosine similarity score.
    """
    #" ".join on a plain string would insert a space between every character
    if isinstance(input_list, str):
        input_list = [input_list]
    #Convert to a single string first, then preprocess into a list of tokens
    tokens = preprocess_input(" ".join(input_list))
    #The vectorizer expects raw text documents, so re-join the tokens with spaces
    query_tfidf = tfidf_vectorizer.transform([" ".join(tokens)])
    #Cosine similarity between the query (a single row) and every job entry
    scores = cosine_similarity(query_tfidf, tfidf_job)[0]
    #Clamp so that a small corpus or an odd top_n cannot raise
    n_results = max(0, min(int(top_n), scores.shape[0]))
    #Sort on the negated scores to get true descending order; the stable sort
    #keeps tied jobs in their original row order, so the output is deterministic
    ranked = np.argsort(-scores, kind="stable")[:n_results]
    return df_combined.iloc[ranked, :]

In [None]:
#Test input: a sample list of skill keywords
#The call is the cell's last expression, so the returned DataFrame is displayed
test_input=["python","R","sql","git","flask","docker"]
generate_recommendations(test_input)

In [13]:
#save tf-idf matrix and vectorizer
import pickle

#Context managers make sure each file is flushed and closed once it is written,
#even if pickle.dump raises
with open('tfidf_vectorizer.pkl','wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
with open('tfidf_job.pkl','wb') as matrix_file:
    pickle.dump(tfidf_job, matrix_file)