基于隐含语义模型进行推荐
用隐含语义模型去拟合用户是否对物品产生了相应的行为
- 训练模型
- 保存模型
- 用模型进行预测

In [1]:
import numpy as np
import pandas as pd
import pickle
import json
import os

In [10]:
class DataProcessing(object):
    
    def __init__(self,rating_file='../../data/ch5/ml-1m/rating.csv'):
        self.rating_file = rating_file
        
    def build_rating_matrix(self,file_path='../../data/ch5/activity_matrix.json'):
        """
        """
        print('Build rating matrix...')
        if not os.path.exists(file_path):
            self.ratings = pd.read_csv(self.rating_file)
            self.user_ids = set(list(map(str,self.ratings['UserID'].values)))
            self.movie_ids = set(list(map(str,self.ratings['MovieID'].values)))
            activity_matrix = {}
            for user_id in self.user_ids:
                activity_matrix[user_id] = self.get_samples(user_id)
            json.dump(activity_matrix,open(file_path,'w'))
    
    def get_samples(self,user_id):
        """获得用户user_id的正负各一半的训练集
        @params:
            user_id
        """
        pos_samples = set(self.ratings[self.ratings['UserID']==int(user_id)]['MovieID'].values)
        neg_samples = list(self.movie_ids ^ pos_samples)[:len(pos_samples)] # set不可以inedx
        moives = {}
        for movie in pos_samples:
            moives[str(movie)] = 1
        for movie in neg_samples:
            moives[str(movie)] = 0
        return moives
        

In [12]:
dp = DataProcessing()
dp.build_rating_matrix()

In [17]:
class LFMRec(object):
    
    def __init__(self,nums_class=5,activity_matrix='../../data/ch5/activity_matrix.json',ratings='../../data/ch5/ml-1m/rating.csv'):
        self.nums_class = nums_class
        self.activity_matrix = json.load(open(activity_matrix,'r'))
        self.ratings = pd.read_csv(ratings)
        self._init_model()
        
    def _init_model(self):
        """初始化模型
        初始化QP
        """
        self.movie_ids = list(map(str,set(self.ratings['MovieID'].values)))
        self.user_ids = list(map(str,set(self.ratings['UserID'].values)))
        P = np.random.randn(len(self.user_ids),self.nums_class)
        Q = np.random.randn(len(self.movie_ids),self.nums_class)
        self.P = pd.DataFrame(P,index=self.user_ids,columns=range(self.nums_class))
        self.Q = pd.DataFrame(Q,index=self.movie_ids,columns=range(self.nums_class))
    
    def _predict(self,user,movie):
        """计算user喜欢movie的概率
        @params:
            user
            movie
        @return
        """
        p = self.P.ix[user].values
        q = self.Q.ix[movie].values
        logit = sum([i*j for i,j in zip(p,q)])
        # pred = self._sigmoid(logit)
        return logit        
    
    def _sigmoid(self,x):
        """计算sigmoid
        @pramas
        """
        return 1/(1+np.exp(-x))
    
    def _loss(self,user,movie,y):
        """计算损失函数
        @params:
            user:
            movie:
            y:
        @return
        """
        logit = self._predict(user,movie)
        loss = self._sigmoid(logit)-y
        return loss
    
    def _optimize(self,user,movie,y,lr=0.02,lamda=0.01):
        """进行梯度下降优化
        """
        # 计算梯度，梯度下降
        logit = self._predict(user,movie)
        partial_p = 2*(self._sigmoid(logit)-y)*self._sigmoid(logit)*self._sigmoid(-logit)*self.Q.ix[movie].values + 2*lamda*self.P.ix[user].values
        partial_q = 2*(self._sigmoid(logit)-y)*self._sigmoid(logit)*self._sigmoid(-logit)*self.P.ix[user].values + 2*lamda*self.Q.ix[movie].values
        self.P.ix[user] -= lr*partial_p
        self.Q.ix[movie] -= lr*partial_q
        
        
    def train(self,epochs):
        """Train the model
        @pramas:
            epochs:
        """
        display = 0
        for epoch in range(epochs):
            for user,movies in self.activity_matrix.items():
                movies = np.random.permutation(list(movies.keys()))
                for movie in movies:
                    y = self.activity_matrix[user][movie]
                    if display%10000 == 0:
                        print("dispaly {}, user {}, movie {}, real {}, pred {}, loss {}".format(display,user,movie,y,self._sigmoid(self._predict(user,movie)),abs(self._loss(user,movie,y))))
                    self._optimize(user,movie,y)
                    display += 1
        print('训练结束')
        self.save()
        
    def save(self,path='../../data/ch5/model/lmrec.pkl'):
        """保存模型
        @params:
        """
        pickle.dump((self.P,self.Q),open(path,'wb'))
    
    def load(self,path='../../data/ch5/model/lmrec.pkl'):
        """加载模型
        @parmas
        """
        print("加载模型")
        self.P,self.Q = pickle.load(open(path,'rb'))
        
    def recommend(self,user):
        """给用户user推荐电影
        @params
            user
        """
        self.load()
        have_recommended = self.ratings[self.ratings['UserID']==int(user)]['MovieID'].values
        non_recommended = set(self.movie_ids)-set(map(str,have_recommended))
        result = {}
        for movie in non_recommended:
            result[movie] = self._predict(user,movie)
        return sorted(result.items(),key = lambda x:x[1],reverse=True)
    
    def evaluate(self):
        """对该推荐系统进行评测
        @params
        """
        self.load()
        result = {}
        for user in self.user_ids[:10]:
            result.setdefault(user,0)
            have_recommended = self.ratings[self.ratings['UserID']==int(user)]['MovieID'].values
            for movie in have_recommended:
                result[user] = (self.activity_matrix[user][movie]-self._sigmoid(self._predict(user,movie)))**2
            result[user] /= len(have_recommended)
        return result
            


In [18]:
lfmRec = LFMRec()

In [19]:
lfmRec.train(1)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#

dispaly 0, user 5530, movie 2054, real 1, pred 0.5942335253021271, loss 0.40576647469787286
dispaly 10000, user 5234, movie 348, real 0, pred 0.5495041647324418, loss 0.5495041647324418
dispaly 20000, user 984, movie 150, real 0, pred 0.6147330160692556, loss 0.6147330160692556
dispaly 30000, user 1084, movie 3683, real 0, pred 0.287007614057224, loss 0.287007614057224
dispaly 40000, user 655, movie 2950, real 0, pred 0.5956306330178083, loss 0.5956306330178083
dispaly 50000, user 726, movie 2210, real 0, pred 0.09239660451850029, loss 0.09239660451850029
dispaly 60000, user 1620, movie 3400, real 0, pred 0.8474619985440036, loss 0.8474619985440036
dispaly 70000, user 1641, movie 2329, real 1, pred 0.06235664553762676, loss 0.9376433544623732
dispaly 80000, user 1592, movie 3865, real 1, pred 0.4697724605801293, loss 0.5302275394198708
dispaly 90000, user 718, movie 2006, real 1, pred 0.055526490806734885, loss 0.9444735091932651
dispaly 100000, user 855, movie 3186, real 1, pred 0.092

dispaly 860000, user 1022, movie 1611, real 1, pred 0.546802367282856, loss 0.45319763271714397
dispaly 870000, user 1137, movie 3910, real 1, pred 0.7789304377405117, loss 0.22106956225948826
dispaly 880000, user 5444, movie 2806, real 1, pred 0.05918719647205547, loss 0.9408128035279445
dispaly 890000, user 4513, movie 2974, real 0, pred 0.4844385145282542, loss 0.4844385145282542
dispaly 900000, user 5690, movie 1704, real 1, pred 0.08595591377190166, loss 0.9140440862280983
dispaly 910000, user 3454, movie 2965, real 0, pred 0.9479652961917545, loss 0.9479652961917545
dispaly 920000, user 3611, movie 2108, real 1, pred 0.7974969850084531, loss 0.20250301499154688
dispaly 930000, user 5767, movie 1968, real 1, pred 0.5367018498227234, loss 0.4632981501772766
dispaly 940000, user 4624, movie 440, real 1, pred 0.6226922565717113, loss 0.37730774342828866
dispaly 950000, user 3610, movie 2700, real 0, pred 0.4991395412622221, loss 0.4991395412622221
dispaly 960000, user 2195, movie 414

训练结束


FileNotFoundError: [Errno 2] No such file or directory: '../../data/ch5/model/lmrec.pkl'