## Import packages

In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy import spatial

## Load dataset

In [2]:
#movies = pd.read_csv('./dataset/imbd/movies_metadata.csv')
jobs = pd.read_csv('./dataset/dice_com-job_us_sample.csv')

# Job Recommendations based on Job Descriptions:

## TfidfVectorizer - Convert to Numerical

In [3]:
tfidf = TfidfVectorizer(stop_words='english')
jobs['jobdescription'] = jobs['jobdescription'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
overview_matrix = tfidf.fit_transform(jobs['jobdescription'])

#Output the shape of tfidf_matrix
overview_matrix.shape

#Every jobs description has 117417 number of features (words )

(22000, 117417)

## Building the similarity matrix

In [4]:
similarity_matrix = linear_kernel(overview_matrix,overview_matrix)

In [5]:
similarity_matrix

array([[1.        , 0.05712901, 0.08229116, ..., 0.16367784, 0.12781094,
        0.03852916],
       [0.05712901, 1.        , 0.04510606, ..., 0.08193473, 0.06210229,
        0.02303946],
       [0.08229116, 0.04510606, 1.        , ..., 0.13467453, 0.07733971,
        0.01152106],
       ...,
       [0.16367784, 0.08193473, 0.13467453, ..., 1.        , 0.11182536,
        0.02333776],
       [0.12781094, 0.06210229, 0.07733971, ..., 0.11182536, 1.        ,
        0.02400144],
       [0.03852916, 0.02303946, 0.01152106, ..., 0.02333776, 0.02400144,
        1.        ]])

## Jobs index mapping

In [6]:
mapping = pd.Series(jobs.index,index = jobs['uniq_id'])
mapping

uniq_id
418ff92580b270ef4e7c14f0ddfc36b4        0
8aec88cba08d53da65ab99cf20f6f9d9        1
46baa1f69ac07779274bcd90b85d9a72        2
3941b2f206ae0f900c4fba4ac0b18719        3
45efa1f6bc65acc32bbbb953a1ed13b7        4
                                    ...  
86e27ce6b7e631e55d69d142c7d43df2    21995
4287c7ee3317ccf1edd76e238cf8e584    21996
d7512f0181d69f83f96db38cd77a4d08    21997
ec375268b494b3bcbed1635d64226112    21998
9a4e8c27f74af4c0d2f6efbd420a8a91    21999
Length: 22000, dtype: int64

## Building Recommender System

In [7]:
def recommend_jobs_based_on_description(job_input):
    
    job_index = mapping[job_input]
    #get similarity values with other movies
    
    #similarity_score is the list of index and similarity matrix
    similarity_score = list(enumerate(similarity_matrix[job_index]))
    #print(similarity_matrix[job_index])
    
    #sort in descending order the similarity score of movie inputted with all the other movies
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies. Ignore the first movie.
    similarity_score = similarity_score[1:10]
    
    #return movie names using the mapping series
    job_indices = [i[0] for i in similarity_score]
    
    return (jobs['jobtitle'].iloc[job_indices] + ' @ ' + jobs['company'].iloc[job_indices])

In [8]:
recommend_jobs_based_on_description('418ff92580b270ef4e7c14f0ddfc36b4')

19680    Websphere Commerce Developer (Locals to MI) @ ...
8030     WebSphere-MQ Administrator @ Randstad Technolo...
18191    Java multiple positions $75K to $130K + bonus ...
6164              SDET @ Paramount Software Solutions, Inc
12420             SDET @ Paramount Software Solutions, Inc
6297     WebSphere Message Queue (WMQ) admin / WebSpher...
16990       MDMS (Meter Data Management System) @ Startekk
9601     Websphere Commerce Applications Architect @ Th...
10633                   WebSphere Admin @ SV Professionals
dtype: object

## Using the combined dataset

In [16]:
def cosine_similarity(arr1,arr2):
    ans=1- spatial.distance.cosine(arr1,arr2)
    if(np.isnan(ans)):
        return 0
    else:
        return ans
class job_postings:    
    def __init__(self,link):
        self.df2=pd.read_csv(link)
        self.training_range=int(len(self.df2.loc[:,'uniq_id']))
    def match_profile(self,input_path,user_id,flag=0):
        #Match a given user_id with all jobs in the database
        
        #Check if user id exists
        df=pd.read_csv(input_path+"domain_user_profile.csv",index_col='Respondent')
        #print(df.columns)
        matches=dict()
        if(flag==0):
            if(user_id in df.index):
                userdomain=df.loc[user_id,:]
                #print(userdomain)
                #If it does, retrieve the user profile from input_path
                df=pd.read_csv(input_path+"languages_profile_user.csv",index_col='Respondent')
                df.drop(['bash'], axis=1,inplace = True)
                userlanguages=df.loc[user_id,:]

                df=pd.read_csv(input_path+"frameworks_profile_user.csv",index_col='Respondent')
                userframeworks=df.loc[user_id,:]

                df=pd.read_csv(input_path+"platforms_profile_user.csv",index_col='Respondent')
                userplatforms=df.loc[user_id,:]

                df=pd.read_csv(input_path+"databases_profile_user.csv",index_col='Respondent')
                userdatabases=df.loc[user_id,:]

                userdomain=np.asarray(userdomain.fillna(0))
                userlanguages=np.asarray(userlanguages.fillna(0))
                userframeworks=np.asarray(userframeworks.fillna(0))
                userplatforms=np.asarray(userplatforms.fillna(0))
                userdatabases=np.asarray(userdatabases.fillna(0))
                #print(userdomain)
            else:
                print("error! user id not in Dataset")
            #If it doesn't,take user profile as input
        else:

            print("New user!Enter details..")
            name=input("Enter full name")
            skills=input("Enter skills(comma separated). These are programming languages, frameworks,platforms or databases you have experience with").split(",")
            domains=''
            flag=1
            while(1):
                print("Enter domain(s) of interest separated by commas(Names are case sensitive). Should be one of the following:")
                for i in df.columns:
                    print(i,end=",")
                domains=input().split(",")
                for domain in domains:
                    if(domain not in df.columns):
                        flag=0
                        break
                if(flag==1):
                    break
                else:
                    print("Please enter valid domain")
            #domains=list(map(lambda x:x.lower(),domains))
            skills=list(map(lambda x:x.lower(),skills))                

            userdomain=pd.DataFrame(columns=df.columns)
            dictionary=dict()
            for domain in domains:
                dictionary[domain]=1.0
            userdomain=userdomain.append(dictionary,ignore_index=True)


            df=pd.read_csv(input_path+"languages_profile_user.csv",index_col='Respondent')
            userlanguages=pd.DataFrame(columns=df.columns)
            dictionary=dict()
            for skill in skills:
                if(skill in df.columns):
                    dictionary[skill]=1.0
            userlanguages=userlanguages.append(dictionary,ignore_index=True)

            df=pd.read_csv(input_path+"frameworks_profile_user.csv",index_col='Respondent')
            userframeworks=pd.DataFrame(columns=df.columns)
            dictionary=dict()
            for skill in skills:
                if(skill in df.columns):
                    dictionary[skill]=1.0
            userframeworks=userframeworks.append(dictionary,ignore_index=True)

            df=pd.read_csv(input_path+"platforms_profile_user.csv",index_col='Respondent')
            userplatforms=pd.DataFrame(columns=df.columns)                
            dictionary=dict()
            for skill in skills:
                if(skill in df.columns):
                    dictionary[skill]=1.0
            userplatforms=userplatforms.append(dictionary,ignore_index=True)

            df=pd.read_csv(input_path+"databases_profile_user.csv",index_col='Respondent')
            userdatabases=pd.DataFrame(columns=df.columns)               
            dictionary=dict()
            for skill in skills:
                if(skill in df.columns):
                    dictionary[skill]=1.0
            userdatabases=userdatabases.append(dictionary,ignore_index=True)
            #print(userdomain)
            userdomain.head()
            userdomain=np.asarray(userdomain.iloc[0,:].fillna(0))
            userlanguages=np.asarray(userlanguages.iloc[0,:].fillna(0))
            userframeworks=np.asarray(userframeworks.iloc[0,:].fillna(0))
            userplatforms=np.asarray(userplatforms.iloc[0,:].fillna(0))
            userdatabases=np.asarray(userdatabases.iloc[0,:].fillna(0))
                
        jobdomain=pd.read_csv(input_path+"domain_job_profile.csv",index_col='uniq_id')
        joblanguages=pd.read_csv(input_path+'languages_profile_job.csv',index_col='uniq_id')
        jobframeworks=pd.read_csv(input_path+'frameworks_profile_job.csv',index_col='uniq_id')
        jobplatforms=pd.read_csv(input_path+'platforms_profile_job.csv',index_col='uniq_id')
        jobdatabases=pd.read_csv(input_path+'databases_profile_job.csv',index_col='uniq_id')
        
        #print(len(jobdomain.index),len(joblanguages.index))
        for i,j in zip(jobdomain.index,joblanguages.index):
            #print(i)
            domain=jobdomain.iloc[i,:].fillna(0)
            language=joblanguages.iloc[i,:].fillna(0)
            framework=jobframeworks.iloc[i,:].fillna(0)
            platform=jobplatforms.iloc[i,:].fillna(0)
            database=jobdatabases.iloc[i,:].fillna(0)
            #print('Uniq_id: ',joblanguages['uniq_id'])
            job_id=str(j)
            
            domain=np.asarray(domain)
            language=np.asarray(language)
            framework=np.asarray(framework)
            platform=np.asarray(platform)
            database=np.asarray(database)
            
            #print(language)
            #print(userlanguages)
            
            score=(0.7*cosine_similarity(domain,userdomain))+(0.3*(cosine_similarity(language,userlanguages)+cosine_similarity(framework,userframeworks)+cosine_similarity(platform,userplatforms)+cosine_similarity(database,userdatabases)))
            matches[job_id]=score
            score=(0.7*cosine_similarity(domain,userdomain))+(0.3*(cosine_similarity(language,userlanguages)+cosine_similarity(framework,userframeworks)+cosine_similarity(platform,userplatforms)+cosine_similarity(database,userdatabases)))
            
            #Initializing job profiles for later access
            self.job_domain=domain
            self.job_language=language
            self.job_framework=framework
            self.job_platform=platform
            self.job_database=database
            
            self.user_domain=userdomain
            self.user_language=userlanguages
            self.user_framework=userframeworks
            self.user_platform=userplatforms
            self.user_database=userdatabases
        matches=sorted(matches.items(),key=lambda x:x[1],reverse=True)
        
        recommendations=matches[:5]
        print("The top 5 Recommendations for User ",user_id,"based on content-based filtering are:")
        for i in recommendations:
            print('Job Unique Id:',i[0])
        return recommendations

In [17]:
obj=job_postings("./dataset/dice_com-job_us_sample.csv")

## Start  recommending

In [18]:
user_id = 7
rows=obj.match_profile("./dataset/",user_id)

  dist = 1.0 - uv / np.sqrt(uu * vv)


The top 5 Recommendations for User  7  based on content-based filtering are:
Job Unique Id: 9c1dae8f8326ff44336cbc65c4145524
Job Unique Id: f7fac0c163a247d4f85c04e3dc823a7e
Job Unique Id: 3071e1d037c43c96e63d87b7f798904c
Job Unique Id: 3142c2dd6924df52d463d81ef93fb6e4
Job Unique Id: 0445fcb37ab17f686c025da15a98de52
