In [1]:
import random
import operator
import math

In [15]:
def read_data(fileDir = '../data/ml-100k/u.data'):
    data = []
    fp = open(fileDir,'r')
    for line in fp:
        user, item , _, __ = line.strip().split()
        data.append([user, item, 1.0]) # 默认兴趣度为1
    return data


def split_data(data, M, k, seed):
    test = []
    train = []
    random.seed(seed)
    for user, item, rate in data:
        if random.randint(0, M-1) == k:
            test.append([user, item, rate])
        else:
            train.append([user, item, rate])
    return train, test


def recall(train, test,result, N=3):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
#         rank = result[user] # rank和test的用户不同
        rank = get_recommendation(user, N)
        for item, p in rank.items():
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all * 1.0)


def precision(train, test,result, N=3):
    hit = 0
    all = 0
    for user in train.keys():
        rank = result[user]
        for item, p in rank.items():
            if item in tu:
                hit += 1
        all += N
    return hit / (all * 1.0)


def coverage(train, test, N):
    all_items = set()
    rec_items = set()
    for user in train.keys():
        for item in user:
            all_items.add(item)
            
        rank = get_recommandation(user, N)
        for item, p in rank:
            rec_items.add(item)
    
    return len(rec_items) / (len(all_items) * 1.0 )
        
        
def data_transform(data):
    """转为字典格式"""
    ret = dict()
    for u, i, r in data:
        ret.setdefault(u,dict())
        ret[u][i] = r
        
    return ret
        
def item_similarity(train):
    """ item相似度矩阵，W[i][j]表示i商品和j商品的相似度
    """ 
    N = dict() # item流行度
    C = dict() 
    for user,items in train.items():
        for i in items:
            N.setdefault(i,0)
            N[i] += 1
            C.setdefault(i,{})
            for j in items:
                if i == j:
                    continue
                C[i].setdefault(j,0)
                C[i][j] += 1
    
    # 计算相似度矩阵W
    W = C.copy()
    for i, related_items in C.items():
        for j, c_ij in related_items.items():
            W[i][j] = c_ij / math.sqrt(N[i] * N[j]) # 惩罚热门商品
    
    return W

def recommend(test, W, K=3):
    """
    @param ru: 推荐用户的item列表
    @param W:  item相似度矩阵
    @param K:  为每个user推荐的商品数
    """
    ret = dict()
    for u, items in test.items():
        rank = get_recommendation(items, W, K)
        ret[u] = rank
    return ret

def get_recommendation(items, W, K=3):
    """
    @param user: 推荐用户的item列表
    @param W:    item相似度矩阵
    @param K:    为每个user推荐的商品数
    """
    rank = dict()
    for i, pi in items:
        for j, w_ij in sorted( W[i].items() ,key=operator.itemgetter(1), reverse = True)[0:K]:
            if j in user:
                continue # 过滤已有item
            rank.setdefault(j,0)
            rank[j] += pi * w_ij
            
    return rank   

In [8]:
# main
data = read_data()
train_times = 8

i = 0
rawTrain, rawTest = split_data(data, train_times, i, seed=0)
train = data_transform(rawTrain)
test = data_transform(rawTest)

N = 3 # 为每个用户推荐的商品数
W = item_similarity(train)

%time result = recommend(test, W, N)
# prec = precision(train, test, result, N)
# rec  = recall(train, test, result, N)

ValueError: not enough values to unpack (expected 2, got 1)

In [16]:
%time result = recommend(test, W, N)

ValueError: too many values to unpack (expected 2)

NameError: name 'result' is not defined

In [None]:
sorted(d.items(),key = lambda item: item[0], reverse=True)

In [None]:
x,y = d['a'].items()

In [None]:
'b' in d

In [6]:
a  = {'a':1,'b':2}
for k in a.keys():
    print(k)

a
b


100000