推荐系统实战 张洛阳

In [1]:
from IPython.display import Latex

均方根误差和平均绝对误差

In [4]:
Latex(r"$RMSE=\frac{\sqrt{\sum_{u,i\in T}(r_{ui}-\hat{r}_{u,i})^2}}{|T|}$")

<IPython.core.display.Latex object>

In [5]:
Latex(r"$MAE=\frac{\sum_{u,i\in T}|r_{ui}-\hat{r}_{u,i}|}{|T|}$")

<IPython.core.display.Latex object>

令records[i]=[u,i,rui,pui]这里假设用户u对商品i的预测打分是pui，真实打分是rui

In [1]:
import math

In [7]:
def RMSE(recoders):
    error_power = map(lambda recoder: (recoder[2]-recoder[3])*(recoder[2]-recoder[3]) , recoders)
    error_power_sum = sum(error_power)
    error_sqrt = math.sqrt(error_power_sum)
    error_sqrt = error_sqrt / float(len(recoders))
    return error_sqrt

In [8]:
def MAE(recoders):
    error_absolute = map(lambda recoder: abs(recoder[2]-recoder[3]) , recoders)
    error_absolute = sum(error_absolute)/float(len(recoders))
    return error_absolute

TopN推荐的算法评估如下
R(u)是根据用户在训练集上的行为给用户的推荐列表，而T(u)是用户在测试集上的推荐列表

In [9]:
Latex(r"$Recall=\frac{\sum_{u \epsilon U}{|R(u)\bigcap  T(U)|}}{\sum_{u \epsilon U}|T(u)|}$")

<IPython.core.display.Latex object>

In [10]:
Latex(r"$Precision=\frac{\sum_{u \epsilon U}{|R(u)\bigcap  T(U)|}}{\sum_{u \epsilon U}|R(u)|}$")

<IPython.core.display.Latex object>

In [2]:
import random

In [3]:
def splitData(data, M, k, seed):
    test = []
    train = []
    random.seed(seed)
    for user, item, score in data:
        if random.randint(0,M)==k:
            test.append([user, item, score])
        else:
            train.append([user, item, score])
    return train, test

评价指标使用召回率和精确度

In [13]:
def Recall(train, test, N):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        rank = GetRecommendation(user, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit / (all*1.0)

In [14]:
def Precision(train, test, N):
    hit = 0
    all = 0
    for user in train.keys():
        tu = test[user]
        rank = GetRecommendation(user, N)
        for item, pui in rank:
            if item in tu:
                hit += 1
        all += N
    return hit / (all*1.0)

推荐系统的覆盖率也是一个重要的评价指标，覆盖率越好，越能体现推荐系统挖掘出长尾的能力

In [15]:
Latex(r"$Coverage=\frac{U_{u \epsilon U} R(u)}{|I|}$")

<IPython.core.display.Latex object>

In [16]:
def Coverage(train, test, N):
    recommend_item = set()
    all_items = set()
    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)
        rank = GetRecommendation(user, N)
        for item, pui in rank:
            recommend_item.add(item)
    return len(recommend_item)/(len(all_items)*1.0)

推荐系统的新颖度 如果推荐的物品都很热门 说明推荐的新颖度较低

In [17]:
def Popularity(train, test, N):
    item_popularity = dict()
    for user, items in train.items():
        for item in items.keys():
            if item not in item_popularity:
                item_popularity[item] = 0
            item_popularity[item] += 1
        ret = 0
        n = 0
        for user in train.keys():
            rank = GetRecommendation(user, N)
            for item, pui in rank:
                ret += math.log(1 + item_popularity[item])
                n += 1
        ret /= n * 1.0
        return ret

基于用户的协同过滤算法
第一步：找到和目标用户兴趣相似的用户集合
第二步：找到这个集合中的用户喜欢的，而且目标用户没有听说过的物品推荐给目标用户

计算两个用户之间的相似度可以使用Jaccard公式或者是余弦相似度

In [18]:
Latex(r"$W_{uv}=\frac{|N(u)\cap N(v)| }{|N(u) \cup N(v)|}$")

<IPython.core.display.Latex object>

In [19]:
Latex(r"$W_{uv}=\frac{|N(u) \cap N(v)|}{\sqrt{|N(u)| |N(v)|}}$")

<IPython.core.display.Latex object>

In [20]:
def UserSimilarity(train):
    W = dict()
    for u in train.keys():
        for v in train.keys():
            if u==v:
                continue
            W[u][v] = len(train[u] & train[v])
            W[u][v] /= math.sqrt(len(train[u]) * len(train[v]) *1.0 )
    return W

上述计算相似度时候的代价比较高，一般的做法是如下首先建立一个二维的表格

In [21]:
def UserSimilarity(train):
    item_users = dict()
    for u, items in train.items(): 
        for i in items.keys(): 
            if i not in item_users:
                item_users[i] = set()
            print type(item_users[i])
            item_users[i].add[u]
    # calculate co-rated items between users
    C = dict()
    N = dict()
    for i, users in item_users.items():
        for u in users:
            N[u] += 1
            for v in users:
                if u==v:
                    continue
                C[u][v] += 1
    # calculate finial similarity matrix W
    W = dict()
    for u, related_users in C.items():
        for v, cuv in related_users.items():
            W[u][v] = cuv / math.sqrt(N[u]*N[v])
    return W

得到用户之间的兴趣相似度后，UserCF算法会给用户推荐和他兴趣最相似的K个用户喜欢的物品。

In [22]:
Latex(r"$p(u,i) = \sum_{v \epsilon S(u, k) \cap N(i)}{w_{uv}r_{vi}}$")

<IPython.core.display.Latex object>

r_{vi}表示用户v对i的感兴趣程度 w_{uv}表示用户u和v的相似性程度

In [23]:
def Recommend(user, train, W):
    rank = dict()
    interacted_items = train[user]  # 用户历史item和商品评分数据
    for v, wuv in sorted(W[user].items, key=interacted_items(1), reverse=True)[0:k]:  # 找出用户u最相似的前k个用户
        for i, rvi in train[v].items: # 找出这k个用户的数据
            if i in interacted_items:
                continue
            rank[i] += muv * rvi  # 用户u与用户v的相似度*用户u对商品i的相似度
    return rank

In [4]:
# 推荐系统数据集
# 需要统计用户数目 商品数目
with open("ml-1m/ratings.dat", "r") as f:
    lines = f.readlines()

In [5]:
lines = map(lambda line:line.strip() , lines)
datas = []
for line in lines:
    d = line.split("::")[0:3]
#     datas.append([int(d[0]), int(d[1]), int(d[2])]) # user_id item_id score
    datas.append([int(d[0]), int(d[1]), 1]) # user_id item_id score

In [6]:
data = datas
M = 20
k = 10
seed = 1

In [7]:
train, test = splitData(data, M, k, seed)

In [8]:
print "训练集:{}".format(len(train))
print "测试集:{}".format(len(test))

训练集:952458
测试集:47751


建立用户 user item 商品 表格

In [9]:
print train[0]

[1, 1193, 1]


In [10]:
train_dict = {}  # user_id {item_id:score}
test_dict = {}
for user, item, score in train:
    if user not in train_dict:
        train_dict[user] = {}
    train_dict[user][item]=score
for user, item, score in test:
    if user not in test_dict:
        test_dict[user] = {}
    test_dict[user][item]=score

In [11]:
print len(train_dict)
print len(train_dict[1])

6040
51


In [32]:
def UserSimilarity(train):
    item_users = dict()
    for u, items in train.items(): 
        for i in items.keys(): 
            if i not in item_users:
                item_users[i] = set()
            item_users[i].add(u)
    # calculate co-rated items between users
    C = dict()
    N = dict()
    for i, users in item_users.items():
        for u in users:
            if u  not in N:
                N[u] = 1
            N[u] += 1
            if u not in C:
                C[u] = dict()
            for v in users:
                if u==v:
                    continue
                if v not in C[u]:
                    C[u][v] = 0
                C[u][v] += 1 / math.log(1 + len(users))
    # calculate finial similarity matrix W
    W = dict()
    for u, related_users in C.items():
        if u not in W:
            W[u]=dict()
        for v, cuv in related_users.items():
            W[u][v] = cuv / math.sqrt(N[u]*N[v])
    return W

In [33]:
W = UserSimilarity(train_dict)

In [34]:
print type(train_dict[1].items)

<type 'builtin_function_or_method'>


In [35]:
def Recommend(user, train, W, k):
    rank = dict()
    interacted_items = train[user]  # 用户历史item和商品评分数据
    k_users_smi = sorted(W[user].items(), key=lambda x:x[1], reverse=True)[0:k]
    for v, wuv in k_users_smi:  # 找出用户u最相似的前k个用户
        for i, rvi in train[v].items(): # 找出这k个用户的数据
            if i in interacted_items:
                continue
            if i not in rank:
                rank[i] = 0
            rank[i] += wuv * rvi  # 用户u与用户v的相似度*用户u对商品i的相似度
    return rank

In [36]:
def GetRecommendation(user, train, W, N, k):
    rank = Recommend(user, train, W, k)
    return sorted(rank.items(), key=lambda x:x[1], reverse=True)[0:N]

In [37]:
def Precision(train, test, W, N, k):
    hit = 0
    all = 0
    for user in train.keys():
        if user in test:  # 保证测试集存在
            tu = test[user]
            rank = GetRecommendation(user, train, W, N, k)
            for item, pui in rank:
                if item in tu:
                    hit += 1
            all += N
    return hit / (all*1.0)

In [38]:
Precision(train_dict, test_dict, W, 10, k=100)

0.12669159544159544

In [39]:
def Recall(train, test, W, N, k):
    hit = 0
    all = 0
    for user in train.keys():
        if user in test: 
            tu = test[user]
            rank = GetRecommendation(user, train, W, N, k)
            for item, pui in rank:
                if item in tu:
                    hit += 1
            all += len(tu)
    return hit / (all*1.0)

In [40]:
Recall(train_dict, test_dict, W, 10, k=100)

0.14900211513894998

In [46]:
def ItemSimilarity(train):
    # calculate co-rated users between items
    C = dict()
    N = dict()
    for u, items in train.items():
        for i in items:
            if i not in N:
                N[i] = 0
            N[i] += 1
            if i not in C:
                C[i] = dict()
            for j in items:
                if i==j:
                    continue
                if j not in C[i]:
                    C[i][j] = 0
                C[i][j] += 1
    W = dict()
    for i, related_items in C.items():
        if i not in W:
            W[i] = dict()
        for j, cij in related_items.items():
            W[i][j] = cij/math.sqrt(N[i]*N[j])
    return W

In [47]:
W2 = ItemSimilarity(train_dict)

In [56]:
def Recommendation(train, user_id, W, k):
    rank = dict()
    ru = train[user_id]
    for i, pi in ru.items():
        k_item_smi = sorted(W[i].items(), key=lambda x:x[1], reverse=True)[0:k]
        for j, wj in k_item_smi:
            if j in ru:
                continue
            if j not in rank:
                rank[j] = 0
            rank[j] += pi*wj
    return rank

In [57]:
def getRecommendation(train, user_id, W, N, k):
    rank = Recommendation(train, user_id , W, k)
    return sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N]

In [61]:
getRecommendation(train_dict, 1, W2, 10, 10)

[(364, 4.8522944807562745),
 (318, 4.369869824633019),
 (1196, 4.354937499794326),
 (2080, 4.127196653457239),
 (1282, 3.92622862301438),
 (2571, 3.913764325330487),
 (296, 3.529667880170433),
 (2174, 3.4799394567872257),
 (593, 3.4673681400135914),
 (2858, 3.381508172672512)]

In [12]:
from recommendation import ItemItemRecommendation

In [13]:
model = ItemItemRecommendation()

In [14]:
model.itemSimilarity(train_dict)

In [15]:
model.precision(train_dict, test_dict, 10, 100)

KeyboardInterrupt: 