In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
# データ取得（各ユーザーの商品の閲覧数）
data_base = pd.read_csv(
    './data_base.csv',
    names=["user_id","job_id","view_count","timestamp"],
    sep="\t")
data_train = pd.read_csv(
    './data_training.csv',
    names=["user_id","job_id","view_count","timestamp"],
    sep="\t")
data_test = pd.read_csv(
    './data_test.csv',
    names=["user_id","job_id","view_count","timestamp"],
    sep="\t")

In [4]:
# データ整備
job_list = data_base.sort_values('job_id').job_id.unique() # 並び替え
user_list = data_base.user_id.unique() # 並び替え
view_job = np.zeros([len(job_list), len(user_list)])

In [5]:
# 学習データ整形
for job_id in tqdm(range(1, len(job_list))): # 進捗表示
    user_list_job = data_train[data_train['job_id'] == job_id].sort_values('user_id').user_id.unique()
    for user_id in user_list_job:
        try:
            user_rate = data_train[(data_train['job_id'] == job_id) & (data_train['user_id'] == user_id)].loc[:, 'view_count']
        except None as e:
            user_rate = 0 # 該当なしの場合は0をセット
        view_job[job_id - 1, user_id - 1] = user_rate # 評価をセット

100%|██████████| 1605/1605 [00:33<00:00, 47.53it/s] 


In [6]:
# 閲覧数（閲覧数は評価点の見立てとする）
view_job_calc = view_job.copy()
view_job_calc[view_job_calc != 0] = 1
view_train = np.abs(view_job_calc - 1) # 絶対値変換

In [7]:
#　類似度
similarity = 1 - pairwise_distances(view_job, metric='cosine') # コサイン類似度
np.fill_diagonal(similarity, 0)

In [8]:
# レコメンド算出
user_id = 100
hits = 0

In [14]:
# 閲覧(評価)点の算出
view_user = view_job[:, user_id - 1]
est_view_user = similarity * view_user # 類似度×閲覧数
est_view_user = est_view_user.sum(axis=1) # 各評価の合算
est_view_user_job = est_view_user * view_train[:, user_id - 1]
# 該当率計算
recommend_list = np.argsort(est_view_user_job)[::-1][:10] + 1 # レコメンド数指定
purchase_list_user = data_test[data_test.user_id == user_id].loc[:, 'job_id'].unique()
for job_id in recommend_list:
    if job_id in purchase_list_user:
        hits += 1
pre = hits / 10

[302 181 354 100 237 268 750 748 301 332  50 307 288 340  11 117 245  22
 121 276]


In [15]:
# レコメンド結果出力
print('Recommend list:', recommend_list)
print('Recommend list count:', len(recommend_list))
print('Rated list:', purchase_list_user)
print('Rated list count:', len(purchase_list_user))
print('Precision:', str(pre))

Recommend list: [302 181 354 100 237 268 750 748 301 332]
Recommend list count: 10
Rated list: [266 268 288 302 321 340 344 354 355 750]
Rated list count: 10
Precision: 0.4


In [None]:
# 精度評価
precision_list = []
recall_list = []
user_list_test = data_test.sort_values('user_id').user_id.unique() # 評価用データ

In [None]:
for user_id in tqdm(user_list_test):
    hits = 0
    view_user = view_job[:, user_id - 1]
    est_view_user = similarity * view_user
    est_view_user_job = est_view_user * view_train[:, user_id - 1]
    est_view_user_job[np.isnan(est_view_user_job)] = 0
    recommend_list = np.argsort(est_view_user_job)[::-1][:20] + 1
    purchase_list_user = data_test[data_test.user_id == user_id].loc[:, 'job_id'].unique()
    # 該当率計算
    if len(purchase_list_user) == 0:
        continue
    for job_id in recommend_list:
        if job_id in purchase_list_user:
            hits += 1
    pre = hits / 10.0
    precision_list.append(pre)

In [None]:
# 評価結果出力
precision = sum(precision_list) / len(precision_list)
print(len(precision_list))
print('Precision:', precision)