In [41]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import pairwise_distances

In [42]:
# データ取得
data_base = pd.read_csv(
    './data_base.csv',
    names=["user_id","job_id","rating","timestamp"],
    sep="\t")
data_base.head()

data_train = pd.read_csv(
    './data_training.csv',
    names=["user_id","job_id","rating","timestamp"],
    sep="\t"
)

data_test = pd.read_csv(
    './data_test.csv',
    names=["user_id","job_id","rating","timestamp"],
    sep="\t"
)

In [50]:
# データ整備
job_list = data_base.sort_values('job_id').job_id.unique() # 並び替え
user_list = data_base.user_id.unique() # 並び替え
rating_matrix_job = np.zeros([len(job_list), len(user_list)])

In [44]:
for job_id in tqdm(range(1, len(job_list))): # 進捗表示
    user_list_job = data_train[data_train['job_id'] == job_id].sort_values('user_id').user_id.unique()
    for user_id in user_list_job:
        try:
            user_rate = data_train[(data_train['job_id'] == job_id) & (data_train['user_id'] == user_id)].loc[:, 'rating']
        except None as e:
            user_rate = 0 # 該当なしの場合は0をセット
        rating_matrix_job[job_id-1, user_id-1] = user_rate # 評価をセット

100%|██████████| 1605/1605 [00:34<00:00, 46.36it/s] 


In [45]:
# 評価
rating_matrix_calc = rating_matrix_job.copy() #
rating_matrix_calc[rating_matrix_calc != 0] = 1
rating_matrix_train = np.abs(rating_matrix_calc - 1) # 絶対値変換

In [46]:
#　類似度
similarity_matrix = 1 - pairwise_distances(rating_matrix_job, metric='cosine') # コサイン類似度
np.fill_diagonal(similarity_matrix, 0)

In [47]:
# レコメンド算出
user_id = 100
hits = 0

In [48]:
# 類似度×スコア
rating_matrix_user = rating_matrix_job[:, user_id - 1]
pre_rating_user = similarity_matrix * rating_matrix_user
pre_rating_user = pre_rating_user.sum(axis=1)
pre_rating_user_job = pre_rating_user * rating_matrix_train[:,user_id - 1]
# 該当率計算
recommend_list = np.argsort(pre_rating_user_job)[::-1][:10] + 1
purchase_list_user = data_test[data_test.user_id == user_id].loc[:, 'job_id'].unique()
for job_id in recommend_list:
    if job_id in purchase_list_user:
        hits += 1
pre = hits / 10

In [49]:
# レコメンド結果出力
print('Recommend list:', recommend_list)
print('Recommend list count:', len(recommend_list))
print('Test Rated list:', purchase_list_user)
print('Test Rated list count:', len(purchase_list_user))
print('Precision:', str(pre))

Recommend list: [405 597 546 117 237 411   7 763 410 121]
Recommend list count: 10
Test Rated list: [100 243 257 261 322 328 456 872 876 948]
Test Rated list count: 10
Precision: 0.0


In [None]:
# 精度評価
precision_list = []
recall_list = []
user_list_test = data_test.sort_values('user_id').user_id.unique()

In [None]:
for user_id in tqdm(user_list_test):
    hits = 0
    rating_matrix_user = rating_matrix_job[:, user_id - 1]
    pre_rating_user = similarity_matrix * rating_matrix_user
    pre_rating_user_job = pre_rating_user * rating_matrix_train[:,user_id - 1]
    pre_rating_user_job[np.isnan(pre_rating_user_job)] = 0
    recommend_list = np.argsort(pre_rating_user_job)[::-1][:10] + 1
    purchase_list_user = data_test[data_test.user_id == user_id].loc[:, 'job_id'].unique()
    if len(purchase_list_user) == 0:
        continue
    for job_id in recommend_list:
        if job_id in purchase_list_user:
            hits += 1
    pre = hits / 10.0
    precision_list.append(pre)

In [None]:
# 評価結果出力
precision = sum(precision_list) / len(precision_list)
print('Precision:', precision)