## Import packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Load dataset

In [17]:
#movies = pd.read_csv('./dataset/imbd/movies_metadata.csv')
jobs = pd.read_csv('./dataset/dice_com-job_us_sample.csv')

## TfidfVectorizer - Convert to Numerical

In [20]:
tfidf = TfidfVectorizer(stop_words='english')
jobs['jobdescription'] = jobs['jobdescription'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
overview_matrix = tfidf.fit_transform(jobs['jobdescription'])

#Output the shape of tfidf_matrix
overview_matrix.shape

#Every jobs description has 117417 number of features (words )

(22000, 117417)

## Building the similarity matrix

In [21]:
similarity_matrix = linear_kernel(overview_matrix,overview_matrix)

In [22]:
similarity_matrix

array([[1.        , 0.05712901, 0.08229116, ..., 0.16367784, 0.12781094,
        0.03852916],
       [0.05712901, 1.        , 0.04510606, ..., 0.08193473, 0.06210229,
        0.02303946],
       [0.08229116, 0.04510606, 1.        , ..., 0.13467453, 0.07733971,
        0.01152106],
       ...,
       [0.16367784, 0.08193473, 0.13467453, ..., 1.        , 0.11182536,
        0.02333776],
       [0.12781094, 0.06210229, 0.07733971, ..., 0.11182536, 1.        ,
        0.02400144],
       [0.03852916, 0.02303946, 0.01152106, ..., 0.02333776, 0.02400144,
        1.        ]])

## Movies index mapping

In [38]:
mapping = pd.Series(jobs.index,index = jobs['uniq_id'])
mapping

uniq_id
418ff92580b270ef4e7c14f0ddfc36b4        0
8aec88cba08d53da65ab99cf20f6f9d9        1
46baa1f69ac07779274bcd90b85d9a72        2
3941b2f206ae0f900c4fba4ac0b18719        3
45efa1f6bc65acc32bbbb953a1ed13b7        4
                                    ...  
86e27ce6b7e631e55d69d142c7d43df2    21995
4287c7ee3317ccf1edd76e238cf8e584    21996
d7512f0181d69f83f96db38cd77a4d08    21997
ec375268b494b3bcbed1635d64226112    21998
9a4e8c27f74af4c0d2f6efbd420a8a91    21999
Length: 22000, dtype: int64

## Building Recommender System

In [72]:
def recommend_jobs_based_on_description(job_input):
    
    job_index = mapping[job_input]
    #get similarity values with other movies
    
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[job_index]))
    
    #sort in descending order the similarity score of movie inputted with all the other movies
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies. Ignore the first movie.
    similarity_score = similarity_score[1:10]
    
    #return movie names using the mapping series
    job_indices = [i[0] for i in similarity_score]
    
    return (jobs['jobtitle'].iloc[job_indices] + ' @ ' + jobs['company'].iloc[job_indices])

In [71]:
recommend_jobs_based_on_description('418ff92580b270ef4e7c14f0ddfc36b4')

Input Job:  0    AUTOMATION TEST ENGINEER
Name: jobtitle, dtype: object


19680    Websphere Commerce Developer (Locals to MI) @ ...
8030     WebSphere-MQ Administrator @ Randstad Technolo...
18191    Java multiple positions $75K to $130K + bonus ...
6164              SDET @ Paramount Software Solutions, Inc
12420             SDET @ Paramount Software Solutions, Inc
6297     WebSphere Message Queue (WMQ) admin / WebSpher...
16990       MDMS (Meter Data Management System) @ Startekk
9601     Websphere Commerce Applications Architect @ Th...
10633                   WebSphere Admin @ SV Professionals
dtype: object