In [1]:
import pandas as pd
import numpy as np

In [2]:
class CS():
    def __init__(self, users_req, users_skill, jobs_req, jobs_skill, weight):
        self.users_req = users_req
        self.jobs_req = jobs_req
        self.users_skill = users_skill
        self.jobs_skill = jobs_skill
        self.weight = weight
        self.one_weight = 1-weight
        
        
    def similarity(self):
        row_u_r,col_u_r = self.users_req.shape
        row_u_s,col_u_s = self.users_skill.shape
        row_j_r,col_j_r = self.jobs_req.shape
        row_j_s,col_j_s = self.jobs_skill.shape
        
        from sklearn.metrics.pairwise import cosine_similarity
        similarity_req = cosine_similarity(self.users_req.reshape(row_u_r,col_u_r), self.jobs_req.reshape(row_j_r,col_j_r), dense_output=True)
        
        similarity_skill = cosine_similarity(self.users_skill.reshape(row_u_r,col_u_r), self.jobs_skill.reshape(row_j_r,col_j_r), dense_output=True)
        similarity = similarity_req * self.weight + similarity_skill * self.one_weight
        return similarity
    

In [3]:
class MF():
    
    def __init__(self, R, K, alpha, beta, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.
        
        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """
        
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        
        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 10 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))
        
        return training_process

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])
            
            # Create copy of row of P since we need to update it but use older values for update on Q
            P_i = self.P[i, :][:]
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * P_i - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)


In [4]:
weight = 1.0

df0 = pd.read_csv('../data/user_data_0.csv')
df1 = pd.read_csv('../data/user_data_2.csv')
df2 = pd.read_csv('../data/user_data_3.csv')
df2 = df2.sort_index(axis = 1)
df = pd.concat([df1, df2], axis=1)
df = pd.concat([df0,df])
input_data = df0.to_numpy()
user_data = df.to_numpy()

#df = pd.read_csv('job_data_2.csv')
df1 = pd.read_csv('../data/job_data_2.csv')
df2 = pd.read_csv('../data/job_data_3.csv')
df2 = df2.sort_index(axis = 1)
df = pd.concat([df1, df2], axis=1)
job_data = df.to_numpy()
job_data = np.delete(job_data,0,1)

input_data

array([[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0]])

In [5]:
cs0 = CS(input_data,input_data,job_data,job_data,weight)
cs1 = CS(user_data, user_data,job_data,job_data,weight)
input_similarity = cs0.similarity()
similarity = cs1.similarity()
input_similarity

array([[0.08111071, 0.26013299, 0.11470787, 0.14509525, 0.18731716,
        0.11470787, 0.11470787, 0.09365858, 0.09365858, 0.21764288,
        0.        , 0.19867985, 0.10259784, 0.11470787, 0.11470787,
        0.17342199, 0.        , 0.11470787, 0.30779351, 0.        ,
        0.        , 0.11470787, 0.21764288, 0.11470787, 0.        ,
        0.11470787, 0.18731716, 0.        , 0.18731716, 0.09365858,
        0.086711  , 0.        , 0.        , 0.        , 0.11470787,
        0.09365858, 0.11470787, 0.09365858, 0.09365858, 0.        ,
        0.28097574, 0.09365858, 0.15294382, 0.20519567, 0.11470787,
        0.11470787, 0.11470787, 0.09365858, 0.10259784, 0.11470787,
        0.        , 0.17342199, 0.18731716, 0.        , 0.20519567,
        0.16222142, 0.22941573, 0.11470787, 0.        , 0.20519567,
        0.07254763, 0.2901905 , 0.        , 0.        , 0.10259784,
        0.11470787, 0.13834289, 0.18731716, 0.11470787, 0.        ,
        0.08111071, 0.10259784, 0.11470787, 0.10

In [6]:
threshold = 0.25
result = np.transpose((input_similarity>threshold).nonzero())
print(result)
user_id = -1 
df1 = pd.read_csv('../data/user_data_1.csv') 
df1.fillna(0, inplace = True)
df2 = pd.read_csv('../data/job_data_1.csv') 
df2.fillna(0, inplace = True)
df3 = pd.DataFrame({'A' : []})
for i in result: 
#     if user_id != i[0]: 
#         print('-----------------------------------------------')
#         print(df1.loc[i[0]]) 
#         print('----------------recommendations---------------')
#         user_id = i[0] 
#    print('\u2022',df2.loc[i[1]])
    df3 = pd.concat([df3,df2.loc[i[1]]])
df3 = df3.drop(columns=['A'])
df3

[[ 0  1]
 [ 0 18]
 [ 0 40]
 [ 0 61]]


Unnamed: 0,0
ID,2
Company,University of Chicago/IT Services
Employment,Full Time
Country,"Shanghai, China"
ID,19
Company,Genesis10
Employment,"Full Time, Direct Placement"
Country,"Xinyi District, Taiwan"
ID,41
Company,CSI (Consultant Specialists Inc.)


In [None]:
from sklearn.preprocessing import normalize
similarity =normalize(similarity, axis=1, norm='l1')

mf = MF(similarity*5, K=6, alpha=0.1, beta=0.001, iterations=500)
training_process = mf.train()

Iteration: 10 ; error = 2.2227
Iteration: 20 ; error = 2.0284
Iteration: 30 ; error = 1.8811
Iteration: 40 ; error = 1.7630
Iteration: 50 ; error = 1.6360
Iteration: 60 ; error = 1.5482
Iteration: 70 ; error = 1.4692
Iteration: 80 ; error = 1.4189
Iteration: 90 ; error = 1.3864
Iteration: 100 ; error = 1.3372
Iteration: 110 ; error = 1.2977
Iteration: 120 ; error = 1.2701
Iteration: 130 ; error = 1.2427
Iteration: 140 ; error = 1.2285
Iteration: 150 ; error = 1.2201
Iteration: 160 ; error = 1.2063
Iteration: 170 ; error = 1.1871
Iteration: 180 ; error = 1.1800
Iteration: 190 ; error = 1.1746
Iteration: 200 ; error = 1.1930
Iteration: 210 ; error = 1.1720
Iteration: 220 ; error = 1.1631
Iteration: 230 ; error = 1.1612
Iteration: 240 ; error = 1.1622
Iteration: 250 ; error = 1.1623
Iteration: 260 ; error = 1.1582
Iteration: 270 ; error = 1.1548
Iteration: 280 ; error = 1.1569
Iteration: 290 ; error = 1.1572
Iteration: 300 ; error = 1.1555
Iteration: 310 ; error = 1.1483
Iteration: 320 ; 

In [60]:
#print(similarity*5, mf.full_matrix())
prediction = similarity*5-mf.full_matrix()
#print(prediction[0])
threshold = 0.05
result = np.transpose((prediction[0] < -threshold).nonzero())
#print(result)
#user_id = -1 
df3 = pd.DataFrame({'A' : []})
for i in result: 
#     if user_id != i[0]: 
#         print('-----------------------------------------------')
#         print(df1.loc[i[0]]) 
#         print('----------------recommendations---------------')
#         user_id = i[0] 
#    print('\u2022',df2.loc[i])   
    df3 = pd.concat([df3,df2.loc[i]])
df3 = df3.drop(columns=['A'])
df3

Unnamed: 0,ID,Company,Employment,Country
53,54.0,Genesis10,Full Time,"Waterloo, ON, Canada"
69,70.0,CSI (Consultant Specialists Inc.),"Contract W2, C2H W2, 6 months +","Shanghai, China"
84,85.0,Precision Task Group,0,"München, Germany"
