In [1]:
import random
import operator
import math

In [33]:
def read_data(fileDir = '../data/ml-100k/u.data'):
    """Load (user, item) interactions from a whitespace-separated ratings file.

    Every non-blank line must contain exactly four whitespace-separated
    fields. The first two are taken as the user id and item id (kept as
    strings); the remaining two are ignored. Each interaction is given an
    implicit interest of 1.0.

    @param fileDir: path of the ratings file to read
    @return: list of [user, item, 1.0] records, in file order
    @raise ValueError: if a non-blank line does not have exactly four fields
    """
    data = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(fileDir, 'r') as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                continue  # tolerate blank / whitespace-only lines
            user, item, _, __ = fields
            data.append([user, item, 1.0])  # implicit feedback: interest defaults to 1
    return data


def split_data(data, M, k, seed):
    """Randomly partition interaction records into a train and a test list.

    For every record one integer is drawn uniformly from [0, M-1]; records
    whose draw equals ``k`` go to the test list, all others to the train
    list. The module-level ``random`` generator is re-seeded with ``seed``
    first, so the same arguments always yield the same split.

    @param data: iterable of (user, item, rate) triples
    @param M:    number of buckets
    @param k:    bucket index that is held out as the test set
    @param seed: seed passed to ``random.seed``
    @return: tuple (train, test) of lists of [user, item, rate]
    """
    random.seed(seed)
    train, test = [], []
    for record in data:
        user, item, rate = record
        bucket = test if random.randint(0, M-1) == k else train
        bucket.append([user, item, rate])
    return train, test


def recall(train, test,result, N=3):
    """Recall of the top-N recommendations against the held-out test items.

    recall = (recommended items that appear in the user's test items)
             / (total number of test items), accumulated over the users
    in ``train``.

    @param train:  dict user -> {item: interest}; supplies the users to evaluate
    @param test:   dict user -> {item: interest}; held-out items per user
    @param result: dict user -> {item: score}; precomputed recommendations
    @param N:      only the N highest-scored recommendations of each user count
    @return: recall as a float; 0.0 when there are no test items at all
    """
    hit = 0
    total = 0
    for user in train:
        # Users without held-out items or without recommendations simply
        # contribute nothing instead of raising KeyError.
        tu = test.get(user, {})
        rank = result.get(user, {})
        top_n = sorted(rank.items(), key=lambda pair: pair[1], reverse=True)[:N]
        hit += sum(1 for item, _score in top_n if item in tu)
        total += len(tu)
    if total == 0:
        return 0.0
    return hit / (total * 1.0)


def precision(train, test,result, N=3):
    """Precision of the top-N recommendations against the held-out test items.

    precision = (recommended items that appear in the user's test items)
                / (N * number of users in ``train``).

    @param train:  dict user -> {item: interest}; supplies the users to evaluate
    @param test:   dict user -> {item: interest}; held-out items per user
    @param result: dict user -> {item: score}; precomputed recommendations
    @param N:      only the N highest-scored recommendations of each user count
    @return: precision as a float; 0.0 when the denominator would be zero
    """
    hit = 0
    total = 0
    for user in train:
        # The held-out items must be looked up per user; missing users are
        # treated as having no test items / no recommendations.
        tu = test.get(user, {})
        rank = result.get(user, {})
        top_n = sorted(rank.items(), key=lambda pair: pair[1], reverse=True)[:N]
        hit += sum(1 for item, _score in top_n if item in tu)
        total += N
    if total <= 0:
        return 0.0
    return hit / (total * 1.0)


def coverage(train, test, N, result=None):
    """Catalogue coverage of the top-N recommendations.

    coverage = (distinct items recommended to at least one user)
               / (distinct items that occur in ``train``).

    @param train:  dict user -> {item: interest}
    @param test:   unused; kept so the signature parallels recall/precision
    @param N:      only the N highest-scored recommendations of each user count
    @param result: optional dict user -> {item: score} of precomputed
                   recommendations. When omitted, recommendations are derived
                   from ``train`` with ``item_similarity`` and
                   ``get_recommendation`` (default K).
    @return: coverage as a float; 0.0 when ``train`` contains no items
    """
    if result is None:
        W = item_similarity(train)
        result = dict(
            (user, get_recommendation(items, W)) for user, items in train.items()
        )

    all_items = set()
    rec_items = set()
    for user, items in train.items():
        # Collect the user's items (the dict keys), not the characters of the id.
        all_items.update(items)
        rank = result.get(user, {})
        top_n = sorted(rank.items(), key=lambda pair: pair[1], reverse=True)[:N]
        rec_items.update(item for item, _score in top_n)

    if not all_items:
        return 0.0
    return len(rec_items) / (len(all_items) * 1.0)
        
        
def data_transform(data):
    """Group flat (user, item, rating) records into a nested dict.

    @param data: iterable of (user, item, rating) triples
    @return: dict user -> {item: rating}; a later record for the same
             (user, item) pair overwrites the earlier rating
    """
    nested = {}
    for user, item, rating in data:
        if user not in nested:
            nested[user] = {}
        nested[user][item] = rating
    return nested
        
def item_similarity(train):
    """Build the item-item similarity matrix from user histories.

    W[i][j] = (users who have both i and j) / sqrt(N[i] * N[j]), where N[x]
    is the number of users who have item x. Dividing by the popularity term
    penalises popular items.

    @param train: dict user -> {item: interest}
    @return: dict W with W[i][j] the similarity of items i and j; every item
             seen in ``train`` has a row, which is empty if the item never
             co-occurs with another item
    """
    popularity = {}  # item -> number of users who have it
    cooccur = {}     # item i -> {item j: number of users who have both}
    for items in train.values():
        for i in items:
            popularity[i] = popularity.get(i, 0) + 1
            row = cooccur.setdefault(i, {})
            for j in items:
                if j != i:
                    row[j] = row.get(j, 0) + 1

    # Normalise the co-occurrence counts into similarities.
    return {
        i: {
            j: c_ij / math.sqrt(popularity[i] * popularity[j])
            for j, c_ij in row.items()
        }
        for i, row in cooccur.items()
    }

def recommend(test, W, K=3, N=1000):
    """
    Build a recommendation dict for every user in ``test``.

    @param test: dict user -> {item: interest}; each user's items are the
                 history the recommendations are derived from
    @param W:    item similarity matrix, W[i][j] = similarity of i and j
    @param K:    number of most-similar items considered per history item
    @param N:    upper bound on the number of items recommended per user;
                 when more candidates exist, the N highest-scored are kept
    @return: dict user -> {item: score}
    """
    ret = dict()
    for u, items in test.items():
        rank = get_recommendation(items, W, K)
        if len(rank) > N:
            # Enforce the documented cap, keeping the best-scored items.
            best = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N]
            rank = dict(best)
        ret[u] = rank
    return ret

def get_recommendation(items, W, K=3):
    """
    Score candidate items for one user from that user's own items.

    For every item i the user has, the K items most similar to i are looked
    up in W; each such neighbour j that the user does not already have
    receives interest(i) * W[i][j], summed over all i.

    @param items: dict item -> interest for the user being recommended to
    @param W:     item similarity matrix, W[i][j] = similarity of i and j
    @param K:     number of most-similar items considered per item
    @return: dict candidate item -> accumulated score
    """
    by_weight = operator.itemgetter(1)
    rank = {}
    for i, pi in items.items():
        if i in W:
            neighbours = sorted(W[i].items(), key=by_weight, reverse=True)
            for j, w_ij in neighbours[:K]:
                if j not in items:  # skip items the user already has
                    rank[j] = rank.get(j, 0) + pi * w_ij
    return rank

In [8]:
# main
data = read_data()
train_times = 8

i = 0
rawTrain, rawTest = split_data(data, train_times, i, seed=0)
train = data_transform(rawTrain)
test = data_transform(rawTest)

K = 3    # 每个物品最相似的K个物品
N = 1000 # 为每个用户推荐的总商品数
W = item_similarity(train)

%time result = recommend(test, W, N)
# prec = precision(train, test, result, N)
# rec  = recall(train, test, result, N)

ValueError: not enough values to unpack (expected 2, got 1)

In [36]:
%time result = recommend(test, W, N)

Wall time: 9.23 s


In [35]:
result

{'1': {'100': 0.5809608834547153,
  '1047': 0.42980119129593963,
  '1132': 0.32496684180984076,
  '114': 0.34668762264076824,
  '1158': 0.4082482904638631,
  '1178': 0.47871355387816905,
  '12': 0.48334070138798946,
  '1304': 0.4082482904638631,
  '1312': 0.3061862178478973,
  '1321': 0.35355339059327373,
  '139': 0.35032452487268534,
  '1390': 0.5773502691896258,
  '14': 0.3921430353143951,
  '1437': 0.4364357804719848,
  '1482': 0.4082482904638631,
  '1501': 0.4743416490252569,
  '1508': 0.4082482904638631,
  '151': 0.39739912577018494,
  '1538': 0.5,
  '1621': 0.4082482904638631,
  '1655': 0.5773502691896258,
  '1663': 0.5773502691896258,
  '1669': 0.5773502691896258,
  '169': 0.5303300858899107,
  '174': 3.74523225849122,
  '181': 1.6198472571421103,
  '182': 0.39491311294798737,
  '188': 0.41513904361299825,
  '19': 0.3840122885898554,
  '195': 0.6177131613384916,
  '196': 0.609731236383489,
  '197': 0.5885819589993467,
  '204': 1.9420709054267427,
  '222': 0.39616605690399215,
  

In [None]:
# Scratch: list d's (key, value) pairs sorted by key in descending order.
# NOTE(review): `d` is not defined in any visible cell — confirm where it comes from.
sorted(d.items(),key = lambda item: item[0], reverse=True)

In [None]:
# Scratch: unpacking the items view into two names only works when d['a'] has
# exactly two entries; any other size raises ValueError.
x,y = d['a'].items()

In [None]:
# Scratch: membership test on `d` (for a dict this checks the keys).
'b' in d

In [6]:
# Scratch: iterating a dict directly yields its keys.
a = dict(a=1, b=2)
for k in a:
    print(k)

a
b


In [23]:
# Scratch: an items view is not indexable, so materialise it as a list to take the first pair.
list(a.items())[0]

('a', 1)

In [24]:
# Inspect the held-out test mapping (user -> {item: interest}).
# NOTE(review): this dumps the entire mapping into the cell output; consider
# showing only a small slice.
test.items()

dict_items([('22', {'377': 1.0, '376': 1.0, '173': 1.0, '687': 1.0, '780': 1.0, '435': 1.0, '176': 1.0, '648': 1.0, '455': 1.0, '449': 1.0, '1003': 1.0, '211': 1.0, '163': 1.0, '688': 1.0, '290': 1.0, '731': 1.0}), ('160', {'234': 1.0, '484': 1.0, '693': 1.0, '137': 1.0, '23': 1.0, '230': 1.0, '218': 1.0, '13': 1.0, '589': 1.0, '955': 1.0, '213': 1.0, '628': 1.0, '302': 1.0, '127': 1.0}), ('50', {'246': 1.0}), ('290', {'88': 1.0, '161': 1.0, '393': 1.0, '476': 1.0, '404': 1.0, '91': 1.0, '450': 1.0, '162': 1.0, '139': 1.0, '15': 1.0, '403': 1.0, '926': 1.0, '97': 1.0, '1035': 1.0, '323': 1.0, '168': 1.0, '257': 1.0, '89': 1.0, '993': 1.0, '566': 1.0}), ('246', {'919': 1.0, '68': 1.0, '82': 1.0, '728': 1.0, '184': 1.0, '596': 1.0, '196': 1.0, '385': 1.0, '469': 1.0, '588': 1.0, '195': 1.0, '585': 1.0, '117': 1.0, '101': 1.0, '393': 1.0, '138': 1.0, '475': 1.0, '441': 1.0, '216': 1.0, '1139': 1.0, '981': 1.0, '401': 1.0, '798': 1.0, '69': 1.0, '1039': 1.0, '133': 1.0, '827': 1.0, '164': 