In [126]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import pairwise_distances

In [127]:
# データ取得
data_base = pd.read_csv(
    './base.csv',
    names=["user_id","job_id","rating","timestamp"],
    sep="\t")
# data_base.head()

In [128]:
data_train = pd.read_csv(
    './training.csv',
    names=["user_id","job_id","rating","timestamp"],
    sep="\t"
)
# data_train.head()

In [129]:
data_test = pd.read_csv(
    './test.csv',
    names=["user_id","job_id","rating","timestamp"],
    sep="\t"
)
# data_test.head()

In [130]:
# データ整備
job_list = data_base.sort_values('job_id').job_id.unique()
user_list = data_base.user_id.unique()
rating_matrix_job = np.zeros([len(job_list), len(user_list)])

In [131]:
for job_id in tqdm(range(1, len(job_list))):
    user_list_job = data_train[data_train['job_id'] == job_id].sort_values('user_id').user_id.unique()
    for user_id in user_list_job:
        try:
            user_rate = data_train[(data_train['job_id'] == job_id) & (data_train['user_id'] == user_id)].loc[:, 'rating']
        except:
            user_rate = 0
        rating_matrix_job[job_id-1, user_id-1] = user_rate

100%|██████████| 1605/1605 [00:43<00:00, 37.22it/s] 


In [132]:
# スコア
rating_matrix_calc = rating_matrix_job.copy()
rating_matrix_calc[rating_matrix_calc != 0] = 1
rating_matrix_train = np.abs(rating_matrix_calc - 1)

In [133]:
#　類似度
similarity_matrix = 1 - pairwise_distances(rating_matrix_job, metric='cosine') # コサイン類似度で距離の行列計算
np.fill_diagonal(similarity_matrix, 0) # 計算コストが高いため対角成分を取る

In [134]:
# レコメンド算出
user_id = 100
hits = 0

In [135]:
# 類似度×スコア
rating_matrix_user = rating_matrix_job[:, user_id - 1]
pred_rating_user = similarity_matrix * rating_matrix_user
pred_rating_user = pred_rating_user.sum(axis=1)
pred_rating_user_job = pred_rating_user * rating_matrix_train[:,user_id - 1]
recommend_list = np.argsort(pred_rating_user_job)[::-1][:10] + 1
purchase_list_user = data_test[data_test.user_id == user_id].loc[:, 'job_id'].unique()
for job_id in recommend_list: # 適合商品数/全商品数（適合率：Precision）
    if job_id in purchase_list_user:
        hits += 1
pre = hits / 10.0

In [137]:
# レコメンド結果出力
print('Recommend list:', recommend_list)
print('Test Rated list:', purchase_list_user)
print('Precision:', str(pre))

Recommend list: [302 181 354 100 237 268 750 748 301 332]
Test Rated list: [266 268 288 302 321 340 344 354 355 750]
Precision: 0.4


In [139]:
# 精度評価
precision_list = []
recall_list = []
user_list_test = data_test.sort_values('user_id').user_id.unique()

In [141]:
for user_id in tqdm(user_list_test):
    hits = 0
    rating_matrix_user = rating_matrix_job[:, user_id - 1]
    pred_rating_user = similarity_matrix * rating_matrix_user
    pred_rating_user_job = pred_rating_user * rating_matrix_train[:,user_id - 1]
    pred_rating_user_job[np.isnan(pred_rating_user_job)] = 0
    recommend_list = np.argsort(pred_rating_user_job)[:-1][1] + 1
    purchase_list_user = data_test[data_test.user_id == user_id].loc[:, 'job_id'].unique()
    if len(purchase_list_user) == 0:
        continue
    for job_id in recommend_list:
        if job_id in purchase_list_user:
            hits += 1
    pre = hits / 10.0
    precision_list.append(pre)

 88%|████████▊ | 829/943 [01:12<00:09, 11.50it/s]


IndexError: index 829 is out of bounds for axis 1 with size 829

In [None]:
# 結果出力
precision = sum(precision_list) / len(precision_list)
print('Precision:', precision)