 隐含语义模型用于个性化推荐

In [1]:
from IPython.display import Latex

In [2]:
Latex(r"$Preference(u,i)=r_{ui}=p_{u}^{T}q_{i}=\sum_{f=1}^{F}p_{u,k}q_{i,k}$")

<IPython.core.display.Latex object>

其中p_{u,k}表示第u个用户对第k个隐含变量的关系, q_{i,k}表示第k个隐含变量与物品i之间的关系
如果直接在显性反馈数据上面，这种方法可以得到较好的数据，但是在隐性数据上面，这种方法很难得到较好的方法

In [3]:
import math
import random
def splitData(data, M, k, seed):
    test = []
    train = []
    random.seed(seed)
    for user, item, score in data:
        if random.randint(0,M)==k:
            test.append([user, item, score])
        else:
            train.append([user, item, score])
    return train, test

In [4]:
with open("ml-1m/ratings.dat", "r") as f:
    lines = f.readlines()
lines = map(lambda line:line.strip() , lines)
datas = []
for line in lines:
    d = line.split("::")[0:3]
#     datas.append([int(d[0]), int(d[1]), int(d[2])]) # user_id item_id score
    datas.append([int(d[0]), int(d[1]), 1]) # user_id item_id score
data = datas
M = 20
k = 10
seed = 1
train, test = splitData(data, M, k, seed)

In [5]:
train_dict = {}  # user_id {item_id:score}
test_dict = {}
for user, item, score in train:
    if user not in train_dict:
        train_dict[user] = {}
    train_dict[user][item]=score
for user, item, score in test:
    if user not in test_dict:
        test_dict[user] = {}
    test_dict[user][item]=score

In [6]:
item_popularity = dict()
for user, items in train_dict.items():
    for item in items.keys():
        if item not in item_popularity:
            item_popularity[item] = 0
        item_popularity[item] += 1

In [7]:
items_pool_counts = sorted(item_popularity.items(), key=lambda x:x[1], reverse=True)

In [8]:
items_pool = []
for item, counts in items_pool_counts:
    for i in range(counts):
        items_pool.append(item)  

In [9]:
def RandomSelectNegativeSample(items):  # items是某一个用户的商品列表
    ret = dict()
    for i in items.keys():
        ret[i] = 1
    n = 0
    for i in range(0, len(items)*3):
        item = items_pool[random.randint(0, len(items_pool)-1)]  # 按照商品的流行程度进行选择
        if item in ret:
            continue
        ret[item] = 0
        n += 1
        if n > len(items):
            break
    return ret

In [10]:
random.random()

0.5195477984670157

In [15]:
def InitModel(user_items, F):
    P = dict()
    Q = dict()
    all_user = set()
    all_item = set()
    for user, items in user_items.items():
        all_user.add(user)
        for item in items.keys():
            all_item.add(item)
    for user in all_user:
        if user not in P:
            P[user] = dict()
        for i in range(F):
            P[user][i] = random.random()
    for item in all_item:
        if item not in Q:
            Q[item] = dict()
        for i in range(F):
            Q[item][i] = random.random()
        
    return P, Q

In [19]:
def Predict(P, Q, user, item, F):
    p = P[user]
    q = Q[item]
    predict = 0
    for i in range(F):
        predict += p[i]*q[i]
    return predict

In [21]:
def LatentFactorModel(user_items, F, N, alpha, lambda_):
    [P, Q] = InitModel(user_items, F)
    for step in range(N):
        for user, items in user_items.items():
            samples = RandomSelectNegativeSample(items)
            for item, rui in samples.items():
                eui = rui - Predict(P, Q, user, item, F)
                for f in range(F):
                    P[user][f] += alpha*(eui*Q[item][f]- lambda_*P[user][f])
                    Q[item][f] += alpha*(eui*P[user][f]- lambda_*Q[item][f])
        alpha *= 0.9
    return P, Q

In [22]:
P,Q = LatentFactorModel(train_dict, 100, 10, 0.02, 0.01)

In [23]:
Predict(P, Q, 1, 2, 100)

0.35552530443761465