<h1>Event Recommendation Engine Challenge<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#导入工具包" data-toc-modified-id="导入工具包-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>导入工具包</a></span></li><li><span><a href="#处理程序" data-toc-modified-id="处理程序-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>处理程序</a></span></li><li><span><a href="#生成并保存处理结果" data-toc-modified-id="生成并保存处理结果-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>生成并保存处理结果</a></span></li><li><span><a href="#程序运行" data-toc-modified-id="程序运行-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>程序运行</a></span></li></ul></div>

# 导入工具包

In [1]:
import pickle
import numpy as np
import scipy.io as sio
import scipy.sparse as ss
from numpy.random import random  
from collections import defaultdict
import scipy.spatial.distance as ssd

# 处理程序

In [4]:
class RecommonderSystem(object):
    def __init__(self):
        # 读入数据做初始化
        # 数据地址
        self.dpath = 'E:/AI/00/data/EventRecommendation/'
        #用户和活动新的索引
        self.userIndex  = pickle.load(open(self.dpath+"PE_userIndex.pkl", 'rb'))
        self.eventIndex = pickle.load(open(self.dpath+"PE_eventIndex.pkl", 'rb'))
        self.n_users = len(self.userIndex)
        self.n_items = len(self.eventIndex)

        #用户-活动关系矩阵R
        #在train_SVD会重新从文件中读取,二者要求的格式不同，来不及统一了:(
        '''getrow(userIndex[]).getcol(eventIndex[])==1,感兴趣'''
        self.userEventScores = sio.mmread(self.dpath+"PE_userEventScores").todense()

        #倒排表(defaultdict)
        ''' {userIndex: eventIndex***}  {eventIndex: userIndex***}  '''
        ##每个用户参加的事件
        self.itemsForUser = pickle.load(open(self.dpath+"PE_eventsForUser.pkl", 'rb'))
        ##事件参加的用户
        self.usersForItem = pickle.load(open(self.dpath+"PE_usersForEvent.pkl", 'rb'))

        #根据活动属性计算出的活动之间的相似度
        '''#getrow(eventIndex[])代表该event与哪些event相似,相似度是多少'''
        self.eventPropSim = sio.mmread(self.dpath+"EV_eventPropSim").todense()
        self.eventContSim = sio.mmread(self.dpath+"EV_eventContSim").todense()

        #根据用户属性计算出的用户之间的相似度
        '''getrow(userIndex[])代表该user与哪些user相似,相似度是多少'''
        self.userSimMatrix = sio.mmread(self.dpath+"US_userSimMatrix").todense()

        #每个用户的朋友的数目
        '''#numFriends[:,userIndex[]]该user拥有的朋友的数量'''
        self.numFriends = sio.mmread(self.dpath+"UF_numFriends")
        #用户的每个朋友参加活动的分数对该用户的影响
        '''#getrow(userIndex[])代表该user有哪些朋友,值代表朋友参加活动的数量'''
        self.userFriends = sio.mmread(self.dpath+"UF_userFriends").todense()

        #活动本身的热度
        '''#getrow(eventIndex[])代表该event有哪些user参加, 值代表参加event的user的数量'''
        self.eventPopularity = sio.mmread(self.dpath+"EA_eventPopularity").todense()
        
        #提前训练模型
        self.init_SVD()
        self.train_SVD(trainfile=self.dpath+'train.csv')

    def init_SVD(self, K=25):
        #初始化模型参数（for 基于模型的协同过滤SVD_CF）
        self.K = K  

        #init parameters
        #bias
        self.bi = np.zeros(self.n_items)  
        self.bu = np.zeros(self.n_users)  

        #the small matrix
        self.P = random((self.n_users,self.K))/10*(np.sqrt(self.K))
        self.Q = random((self.K, self.n_items))/10*(np.sqrt(self.K))  


    def train_SVD(self,trainfile = 'train.csv', steps=100 ,gamma=0.04,Lambda=0.15):
        #训练SVD模型（for 基于模型的协同过滤SVD_CF）
        #gamma：为学习率
        #Lambda：正则参数
        #偷懒了，为了和原来的代码的输入接口一样，直接从训练文件中去读取数据
        print("SVD Train...")
        ftrain = open(trainfile, 'r')
        ftrain.readline()
        self.mu = 0.0
        n_records = 0
        u_ids = []  #每条记录的用户索引
        i_ids = [] #每条记录的item索引
        #用户-Item关系矩阵R（内容同userEventScores相同），临时变量，训练完了R不再需要
        R = np.zeros((self.n_users, self.n_items))

        for line in ftrain:
            cols = line.strip().split(",")
            u = self.userIndex[cols[0]]  #用户
            i = self.eventIndex[cols[1]] #活动

            u_ids.append(u)
            i_ids.append(i)

            R[u,i] = int(cols[4])  #interested
            self.mu += R[u,i] #感兴趣的次数
            n_records += 1    #共有多少条记录

        ftrain.close()
        self.mu /= n_records

        # 请补充完整SVD模型训练过程
        '''*************************************************************************************'''
        #self.P 和self.Q 均已初始化为随机数
        #self.bi和self.bu均已初始化为0
        
        for step in range(steps):
            rmse_sum=0.0  
            #产生随机数
            kku=np.random.permutation(len(u_ids))
            kki=np.random.permutation(len(i_ids))
            for j in range(len(u_ids)):
                #获得随机的u和i
                u = u_ids[kku[j]]
                i = i_ids[kki[j]]
                #损失函数
                e_ui = R[u,i] - self.pred_SVD(u,i)
                rmse_sum += e_ui**2
                #更新公式
                self.bu[u] += gamma*(e_ui-Lambda*self.bu[u])
                self.bi[i] += gamma*(e_ui-Lambda*self.bi[i])
                temp=self.Q[:,i]
                self.Q[:,i] += gamma*(e_ui*self.P[u,:] - Lambda*self.Q[:,i])
                self.P[u,:] += gamma*(e_ui*temp - Lambda*self.P[u,:])
#            gamma=gamma*0.95
            if step%10==0:
                print("rmse in step(%d) is :%f "%(step,np.sqrt(rmse_sum/len(u_ids))))

        print("the final rmse is: ",np.sqrt(rmse_sum/len(u_ids)))
        print("SVD trained ^*^")
            
    def pred_SVD(self, u_id, i_id):
        #根据当前参数，预测用户uid对Item（i_id）的打分        
        ans=self.mu + self.bi[i_id] + self.bu[u_id] + np.dot(self.P[u_id,:],self.Q[:,i_id])  

        #将打分范围控制在0-1之间
        if ans>1:  
            return 1  
        elif ans<0:  
            return 0
        return ans  

    def svdCFReco(self, userId, eventId):
        #基于模型的协同过滤, SVD++/LFM
        u = self.userIndex[userId]
        i = self.eventIndex[eventId]

        return self.pred_SVD(u,i)

    def sim_cal_UserCF(self, uid1, uid2 ):
        #请补充基于用户的协同过滤中的两个用户uid1和uid2之间的相似度
        #（根据两个用户对item打分的相似度）
        '''****************************************************************************************'''
        similarity=0.0
        P=self.itemsForUser[uid1]&self.itemsForUser[uid2]
        if len(P)<1: #如果两user无共同event, 返回simlarity=0.0
            return similarity

        r1=self.userEventScores[uid1,:].sum()/len(self.itemsForUser[uid1])
        r2=self.userEventScores[uid2,:].sum()/len(self.itemsForUser[uid2])
        rpa=0
        rpb=0
        rpab=0
        for p in P:
            ra=self.userEventScores[uid1,p]-r1
            rb=self.userEventScores[uid2,p]-r2
            rpa +=ra**2
            rpb +=rb**2
            rpab+=rpa*rpb
        if rpa*rpb ==0:
            return similarity
        else:
            similarity = rpab/(np.sqrt(rpa)*np.sqrt(rpb))
        return similarity  

    def userCFReco(self, userId, eventId):
        """
        根据User-based协同过滤，得到event的推荐度
        基本的伪代码思路如下：
        for item i
          for every other user v that has a preference for i
            compute similarity s between u and v
            incorporate v's preference for i weighted by s into running average
        return top items ranked by weighted average
        """
        #请补充完整代码
        '''******************************************************************************************'''
        ans = 0.0
        u=self.userIndex[userId]
        i=self.eventIndex[eventId]
        if len(self.itemsForUser[u])>0:
            ave_u=self.userEventScores[u,:].sum()/len(self.itemsForUser[u])
        else:
            ave_u=0
        vs=self.usersForItem[i]#出席该活动的用户有哪些
        sims=0
        
        for v in vs:
            sim=self.sim_cal_UserCF(u,v)
            if len(self.itemsForUser[v])>0:
                ave=self.userEventScores[v,:].sum()/len(self.itemsForUser[v])
            else:
                ave=0
            ans+=sim*(self.userEventScores[v,i]-ave)
            sims+=sim
        
        if sims>0:
            ans=ans/sims
        else:
            ans=0.0
#        print(sims)  
        return ans+ave_u



    def sim_cal_ItemCF(self, iid1, iid2):
        #计算Item i_id1和i_id2之间的相似性
        #请补充完整代码
        '''******************************************************************************************'''
        similarity=0.0
        U=self.usersForItem[iid1]&self.usersForItem[iid2]
        if len(U)<1: #如果两event无共同user, 返回simlarity=0.0
            return similarity

        rua=0
        rub=0
        ruab=0
        for u in U:
            ru=self.userEventScores[u,:].sum()/len(self.itemsForUser[u])
            ra=self.userEventScores[u,iid1]
            rb=self.userEventScores[u,iid2]
            rua +=(ra-ru)**2
            rub +=(rb-ru)**2
            ruab+=(ra-ru)*(rb-ru)
        if rua*rub ==0:
            return similarity
        else:
            similarity = ruab/(np.sqrt(rua)*np.sqrt(rub))
        return similarity  
    
    def eventCFReco(self, userId, eventId):    
        """
        根据基于物品的协同过滤，得到Event的推荐度
        基本的伪代码思路如下：
        for item i 
            for every item j that u has a preference for
                compute similarity s between i and j
                add u's preference for j weighted by s to a running average
        return top items, ranked by weighted average
        """
        #请补充完整代码
        '''*****************************************************************************************'''
        ans = 0.0
        sims=0
        u=self.userIndex[userId]
        i=self.eventIndex[eventId]
        js=self.itemsForUser[u]#该用户出席的活动有哪些
        for j in js:
            sim=self.sim_cal_ItemCF(i,j)
            ans+=sim*self.userEventScores[u,j]
            sims+=sim

        if sims>0:
            ans=ans/sims
        else:
            ans=0.0
#        print(sims)
        return ans
    

    def userReco(self, userId, eventId):
        """
        类似基于User-based协同过滤，只是用户之间的相似度由用户本身的属性得到，
        计算event的推荐度, 基本的伪代码思路如下：
        for item i
          for every other user v that has a preference for i
            compute similarity s between u and v
            incorporate v's preference for i weighted by s into running average
        return top items ranked by weighted average
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]

        vs = self.userEventScores[:, j]
        #[user,event]==1 有兴趣
        sims = self.userSimMatrix[i, :]
        #[user,user]=sim
        prod = sims * vs
        #标量, 内积

        try:
          return prod[0, 0] - self.userEventScores[i, j]
        except IndexError:
          return 0 #该user对该event不感兴趣

    def eventReco(self, userId, eventId):
        """
        类似基于Item-based协同过滤，只是item之间的相似度由item本身的属性得到，
        计算Event的推荐度, 基本的伪代码思路如下：
        for item i 
          for every item j that u has a preference for
            compute similarity s between i and j
            add u's preference for j weighted by s to a running average
        return top items, ranked by weighted average
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self.userEventScores[i, :]
        psim = self.eventPropSim[:, j]
        csim = self.eventContSim[:, j]
        pprod = js * psim
        cprod = js * csim

        pscore = 0
        cscore = 0
        try:
          pscore = pprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
          pass
        try:
          cscore = cprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
          pass
        return pscore, cscore

    def userPop(self, userId):
        """
        基于用户的朋友个数来推断用户的社交程度
        主要的考量是如果用户的朋友非常多，可能会更倾向于参加各种社交活动
        """
        if userId in self.userIndex.keys():
          i = self.userIndex[userId]
          try:
            return self.numFriends[0, i]
          except IndexError:
            return 0
        else:
          return 0

    def friendInfluence(self, userId):
        """
        朋友对用户的影响
        主要考虑用户所有的朋友中，有多少是非常喜欢参加各种社交活动/event的
        用户的朋友圈如果都积极参与各种event，可能会对当前用户有一定的影响
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        return (self.userFriends[i, :].sum(axis=0) / nusers)[0,0]

    def eventPop(self, eventId):
        """
        本活动本身的热度
        主要是通过参与的人数来界定的
        """
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]

# 生成并保存处理结果

In [5]:
def generateRSData(RS, train=True, header=True):
    """
    把前面user-based协同过滤 和 item-based协同过滤，以及各种热度和影响度作为特征组合在一起
    生成新的训练数据，用于分类器分类使用
    """
    dpath = 'E:/AI/00/data/EventRecommendation/'
    fn = "train.csv" if train else "test.csv"
    fin = open(dpath+fn, 'r')
    fout = open(dpath+"RS_" + fn, 'w')
    
    #忽略第一行（列名字）
    fin.readline().strip().split(",")
    
    # write output header
    if header:
        ocolnames = ["invited", "userCF_reco", "evtCF_reco"
                     , "svdCF_reco","user_reco", "evt_p_reco", "evt_c_reco"
                     , "user_pop", "frnd_infl", "evt_pop"]
        if train:
            ocolnames.append("interested")
            ocolnames.append("not_interested")
        fout.write(",".join(ocolnames) + "\n")
    
    ln = 0
    for line in fin:
        ln += 1
        if ln%500 == 0:
            print("%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
            #break;
      
        cols = line.strip().split(",")
        userId = cols[0]
        eventId = cols[1]
        invited = cols[2]
      
        userCF_reco = RS.userCFReco(userId, eventId)
        itemCF_reco = RS.eventCFReco(userId, eventId)
        svdCF_reco = RS.svdCFReco(userId, eventId)
        
        user_reco = RS.userReco(userId, eventId)
        evt_p_reco, evt_c_reco = RS.eventReco(userId, eventId)
        user_pop = RS.userPop(userId)
     
        frnd_infl = RS.friendInfluence(userId)
        evt_pop = RS.eventPop(eventId)
        ocols = [invited, userCF_reco, itemCF_reco, svdCF_reco
                 , user_reco, evt_p_reco,evt_c_reco
                 , user_pop, frnd_infl, evt_pop]
      
        if train:
            ocols.append(cols[4]) # interested
            ocols.append(cols[5]) # not_interested
        fout.write(",".join(map(lambda x: str(x), ocols)) + "\n")
    
    fin.close()
    fout.close()

# 程序运行

In [6]:
RS = RecommonderSystem()
print ("生成训练数据...\n")
generateRSData(RS,train=True,  header=True)

print ("生成预测数据...\n")
generateRSData(RS, train=False, header=True)

SVD Train...
rmse in step(0) is :0.878364 
rmse in step(10) is :0.076376 
rmse in step(20) is :0.065484 
rmse in step(30) is :0.058740 
rmse in step(40) is :0.063818 
rmse in step(50) is :0.062691 
rmse in step(60) is :0.058490 
rmse in step(70) is :0.065820 
rmse in step(80) is :0.061421 
rmse in step(90) is :0.058201 
the final rmse is:  0.0589640874688
SVD trained ^*^
生成训练数据...

train.csv:500 (userId, eventId)=(123290209, 1887085024)
train.csv:1000 (userId, eventId)=(272886293, 199858305)
train.csv:1500 (userId, eventId)=(395305791, 1582270949)
train.csv:2000 (userId, eventId)=(527523423, 3272728211)
train.csv:2500 (userId, eventId)=(651258472, 792632006)
train.csv:3000 (userId, eventId)=(811791433, 524756826)
train.csv:3500 (userId, eventId)=(985547042, 1269035551)
train.csv:4000 (userId, eventId)=(1107615001, 173949238)
train.csv:4500 (userId, eventId)=(1236336671, 3849306291)
train.csv:5000 (userId, eventId)=(1414301782, 2652356640)
train.csv:5500 (userId, eventId)=(1595465532, 9