In [156]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [311]:
import numpy as np
import pandas as pd
import sys

In [158]:
# データからitem/userの配列を用意したい。
df = pd.read_csv("./ml-100k/u.data", sep="\t", names=["user_id","item_id","rating", "timestamp"])

In [159]:
df.max()

user_id            943
item_id           1682
rating               5
timestamp    893286638
dtype: int64

In [160]:
users_number = df.max().ix["user_id"]
items_number = df.max().ix["item_id"]

In [161]:
# userとitemのpairを表す二次元配列
# 縦user943行、横item1682列の零配列
user_item_pairs = np.zeros([users_number,items_number])

In [162]:
df.head(5)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [163]:
df.tail(5)

Unnamed: 0,user_id,item_id,rating,timestamp
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156
99999,12,203,3,879959583


In [390]:
# 映画の情報のデータ
df_movie = pd.read_csv("./ml-100k/u.item", sep='|', encoding="ISO-8859-1",header=None).ix[:,0:1]

In [391]:
df_movie.head(5)

Unnamed: 0,0,1
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [404]:
df_movie.ix[0][1]

'Toy Story (1995)'

In [403]:
movie_title_list=[]
# 映画の情報のデータ
print(aaa)
for i in aaa:
    movie_title_list.append(df_movie.ix[i][1])
print(movie_title_list)

[  72  192  556  409 1300  264  427   58  202  212]
['Maverick (1994)', 'Right Stuff, The (1983)', 'Farinelli: il castrato (1994)', 'Kingpin (1996)', 'Stripes (1981)', 'Hunt for Red October, The (1990)', 'Harold and Maude (1971)', 'Three Colors: Red (1994)', 'Unforgiven (1992)', 'Room with a View, A (1986)']


In [164]:
# user_id,item_idに対応するratingの値を、user_item_pairsに代入していく。
# user_id,item_idは1から始まっているので、それぞれ-1している。
for i in range(len(df)):
    user_item_pairs[df.ix[i]["user_id"]-1][df.ix[i]["item_id"]-1] = df.ix[i]["rating"]

In [265]:
user_item_pairs

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [166]:
#コサイン類似度
def cos_similarity(x,y):
    return np.dot(x,y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [294]:
#ユーザー同士の類似度を配列で返す
def user_user_similarity(pairs):
    users_sim = np.zeros((users_number, users_number))
    
    for i in range(users_number):
        for j in range(i,users_number):
            if i == j:
                users_sim[i][j] = 1.0
            else:
                sim_score = cos_similarity(pairs[i], pairs[j])
                users_sim[i][j] = sim_score
                users_sim[j][i] = sim_score
                
    return users_sim

In [466]:
score = user_user_similarity(user_item_pairs)

In [467]:
score

array([[ 1.        ,  0.16693098,  0.04745954, ...,  0.14861694,
         0.17950788,  0.39817474],
       [ 0.16693098,  1.        ,  0.11059132, ...,  0.16148478,
         0.17226781,  0.10579788],
       [ 0.04745954,  0.11059132,  1.        , ...,  0.10124256,
         0.13341615,  0.02655587],
       ..., 
       [ 0.14861694,  0.16148478,  0.10124256, ...,  1.        ,
         0.1016418 ,  0.09511958],
       [ 0.17950788,  0.17226781,  0.13341615, ...,  0.1016418 ,
         1.        ,  0.18246466],
       [ 0.39817474,  0.10579788,  0.02655587, ...,  0.09511958,
         0.18246466,  1.        ]])

In [471]:
#  user_idとユーザー間類似度と、user/itemの対応リストを与えた時に、user_idのユーザーに映画をリコメンドする。
# user_idは-1して用いるのに注意
# kは推薦する映画の本数
# lは上位L人の高い類似度をもつユーザーを対象にする
def movie_recommend(user_id, users_sim, user_item_pairs, k, l):
    
    # エラー処理
    if user_id-1 not in range(users_number):
        print("No such a user_id. You need to update \"u.data\" for recommendation.")
        sys.exit(1)
    if k >1682:
        print("Error with number of recommendation.")
        sys.exit(1)
    if l > 943:
        print("Error with number of users.")
        sys.exit(1)

    #     重みの合計
    total_weight = 0
    sim_scores = []
    
    #user_idの人との類似度が高いL人の人を対象に考える
    #それ以外の人の類似度は考えないため、０で置き換える
    high_sim_users_indices = np.argpartition(-users_sim[user_id-1], l)[:l]
    l_max = users_sim[user_id-1][high_sim_users_indices]
#     dididi = np.argsort(-l_max)
#     print(l_max)
#     print(sorted(users_sim[user_id-1]))
    for x in range(users_number):
        if x in high_sim_users_indices:
            pass
        else:
            users_sim[user_id-1][x] = 0
    
#     print(users_sim)
    
    for i in range(items_number):
        # まだ評価のない（０）ものを対象に処理を行う
        if user_item_pairs[user_id-1][i] == 0:
            
            # 作品iへの評価値と、user_idの人への類似度を掛け合わせた重みの合計を内積を用いて計算
            # 自分自身への類似度は1になるため、その分を差し引いている
            total_weight = np.dot(users_sim[user_id-1],user_item_pairs.T[i])
            
            #たくさんの人に評価された作品の重みは大きくなる
            #作品iを評価している(!=0)評価者の類似度の合計を求め、正規化する
            for_normalization = 0            
            for j in range(users_number):
                if user_item_pairs[j][i] != 0:
                    for_normalization += users_sim[user_id-1][j]
            if for_normalization != 0:
                similarity_score = total_weight / for_normalization
            else:
                similarity_score  = 0
#             print(total_weight, for_normalization,similarity_score)
            sim_scores.append(similarity_score)

    print(sim_scores)
    #正規化した類似度スコアの内、上位K件のitem_idを降順にソートして取得する
    sim_scores_array = np.array(sim_scores)
    unsorted_max_indices = np.argpartition(-sim_scores_array, k)[:k]
    k_max = sim_scores_array[unsorted_max_indices]
#     print(k_max)
    indices = np.argsort(-k_max)
    max_k_indices = unsorted_max_indices[indices]
    
    return max_k_indices, k_max

In [472]:
aaa = movie_recommend(3,score,user_item_pairs, 30, 31)

[5.0, 0, 3.0, 0, 0, 0, 3.2160940852630167, 0, 3.0277380608657247, 2.5228783559598229, 4.480102776286623, 4.0, 3.6751980523059129, 0, 4.4792953129237114, 0, 0, 0, 0, 3.0, 3.0, 4.0, 0, 4.0, 2.4882629107981216, 0, 0, 0, 3.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.0, 0, 0, 4.3208239603592151, 0, 0, 0, 0, 0, 5.0, 0, 0, 0, 0, 0, 4.0, 0, 0, 2.0, 0, 0, 0, 1.0, 4.0, 0, 0, 0, 0, 0, 0, 0, 0, 4.5557927066103847, 0, 0, 4.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.0, 0, 0, 0, 2.0, 0, 0, 4.210341136011567, 0, 0, 0, 0, 3.0, 0, 0, 0, 4.0, 0, 2.5686350678794683, 0, 0, 0, 0, 3.0, 3.0457567119196454, 3.0, 0, 2.0, 3.7158976138418467, 3.0, 4.0, 2.8059545534220502, 3.0, 3.9813562742220108, 4.170553714520862, 0, 3.0, 1.0, 0, 4.9999999999999991, 4.9999999999999991, 0, 0, 0, 2.665193289429193, 0, 0, 0, 0, 0, 0, 0, 2.0, 0, 4.0131011127978002, 3.0, 0, 3.9585906258474233, 5.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.0, 0, 0, 5.0, 4.0, 0, 0, 3.0, 0, 5.0, 0, 0, 0, 4.0, 0, 4

In [473]:
aaa

(array([ 921, 1537,  920,  834,  201,   55, 1120,  179,  312,  465,  473,
           0,  150, 1238,  540,  185,  464,  458,  131,  132, 1557,  456,
          78,  274,   10,   14,  420,  297,  299,   49]),
 array([ 5.        ,  5.        ,  5.        ,  5.        ,  5.        ,
         5.        ,  5.        ,  5.        ,  5.        ,  5.        ,
         5.        ,  5.        ,  5.        ,  5.        ,  5.        ,
         5.        ,  5.        ,  4.49809348,  5.        ,  5.        ,
         5.        ,  4.56167188,  4.65419853,  4.55579271,  4.48010278,
         4.35457461,  4.47929531,  4.35485542,  4.32678575,  4.32082396]))