思想: 给用户推荐与其过去喜欢的物品相似的物品，而这里的相似式基于各个物品的内容来计算的，基于物品的协同过滤中的相似是基于用户行为来计算的。
步骤：
- 构造Item(物品)特征
- 基于物品内容计算每一个具体物品之间的相似度
- 根据用户过去的行为判断用户喜欢物品的偏好

物品的特征
- 结构化属性(特征)：直接拿来用
- 非结构化属性(特征)：需要进行进一步加工处理后才能使用(拆分或者标注)

计算相似度的算法
- 使用余弦相似度来度量用户的偏好与物品特征之间的余弦距离，余弦距离越大表示相似度越高，表示用户越对对该物品感兴趣

用户的喜欢
- 偏好模型

In [27]:
import pandas as pd
import json
import pickle
import numpy as np

In [3]:
class DataProcessing(object):
    
    def __init__(self,path_dir='../../data/ch5/ml-1m/'):
        """
        @params:
            path: the dir of raw data file
        """
        self.path_dir = path_dir 
        self.movie_file = path_dir+'movies.dat'
        self.user_file = path_dir+'users.dat'
        self.rating_file = path_dir+'ratings.dat'
    
    def process(self):
        """The entrance of processing
        """
        self.process_movie()
        self.process_user()
        self.process_rating()
    
    def process_movie(self):
        """process movie data
        """
        print('processing movie data...')
        movie_raw = pd.read_table(self.movie_file,sep='::',engine='python',names=['MovieID','Title','Genres'])
        movie_raw.to_csv(self.path_dir+'movie.csv',index=False)
    
    def process_user(self):
        """process user data
        """
        print('processing user data')
        user_raw = pd.read_table(self.user_file,sep='::',engine='python',names=['UserID','Gender','Age','Occupation','Zip-code'])
        user_raw.to_csv(self.path_dir+'user.csv',index=False)
        
    def process_rating(self):
        """process rating data
        """
        print('processing rating data')
        rating_raw = pd.read_table(self.rating_file,sep='::',engine='python',names=['UserID','MovieID','Rating','Timestamp'])
        rating_raw.to_csv(self.path_dir+'rating.csv',index=False)


In [4]:
process = DataProcessing()
process.process()

processing movie data...
processing user data
processing rating data




In [18]:
class BuildItemFeatures(object):
    """Build item features"""
    def __init__(self,path='../../data/ch5/ml-1m/movie.csv'):
        """
        @params:
            path: the path of the raw movie data
        """
        self.path = path
        
    def build_features(self):
        """build features
        """
        print('buiding...')
        movies = pd.read_csv(self.path)
        movie_ids = set(movies['MovieID'].values)
        self.movie_genres = {}
        all_genres = []
        p_count = 0
        for movie in movie_ids:
            
            genres = movies[movies['MovieID']==movie]['Genres']
            if p_count == 0:
                print(genres.values) # 返回一个list，list中包含各个index处的元素的属性值
                print(genres.values[0]) # 返回上面的list的第一个元素
                print(genres.values[0].split('|')) # 对上面返回的第一个元素进行split
            p_count += 1
            
            genres = genres.values[0].split("|")
            all_genres.extend(genres)
            self.movie_genres.setdefault(str(movie),[]).extend(genres)
        all_genres = list(set(all_genres))
        num_generes = len(all_genres)
        self.item_features = {}
        for movie in movie_ids:
            self.item_features.setdefault(str(movie),[0]*num_generes)
            for genre in self.movie_genres[str(movie)]:
                self.item_features[str(movie)][all_genres.index(genre)] = 1
        json.dump(self.item_features,open('../../data/ch5/item_features.json','w'))
        pickle.dump(all_genres,open('../../data/ch5/all_genres.pkl','wb'))
        json.dump(self.movie_genres,open('../../data/ch5/movie_genres.json','w'))
        print(all_genres)

In [19]:
build_movie_feature = BuildItemFeatures()
build_movie_feature.build_features()

buiding...
["Animation|Children's|Comedy"]
Animation|Children's|Comedy
['Animation', "Children's", 'Comedy']
['Adventure', 'War', 'Romance', 'Action', 'Documentary', 'Musical', 'Comedy', 'Drama', 'Animation', "Children's", 'Crime', 'Horror', 'Film-Noir', 'Mystery', 'Thriller', 'Fantasy', 'Sci-Fi', 'Western']


In [25]:
class BuildUserProfile(object):
    
    def __init__(self,user_path='../../data/ch5/ml-1m/rating.csv',rating_path='../../data/ch5/ml-1m/rating.csv',all_genres='../../data/ch5/all_genres.pkl'):
        """Build user profile
        @pramas:
            path
        """
        self.user_path = user_path
        self.rating_path = rating_path
        self.all_genres = pickle.load(open(all_genres,'rb'))
        self.movie_genres = json.load(open("../../data/ch5/movie_genres.json"))
        
    def build_user_profile(self):
        """
        """
        users = pd.read_csv(self.user_path)
        user_ids = list(map(str,users['UserID'].values))
        self.user_ratings = {}
        for user_id in user_ids:
            self.user_ratings.setdefault(user_id,{})
        first = True
        for line in open(self.rating_path,'r'):
            if first:
                first = False
                continue
            user,item,rate = line.split(',')[:3]
            self.user_ratings[user][item] = int(rate)
        
        self.user_profile = {}
        num_genres = len(self.all_genres)
        for user,items in self.user_ratings.items():
            scores = items.values()
            avg = sum(scores)/len(scores)
            self.user_profile.setdefault(user,[0]*num_genres)
            for i,genre in enumerate(self.all_genres):
                score = 0.0
                num = 0.0
                for item in items:
                    if genre in self.movie_genres[item]:
                        score += self.user_ratings[user][item] - avg
                        num += 1
                if num:
                    self.user_profile[user][i] = score/num
        
        json.dump(self.user_profile,open('../../data/ch5/user_profile.json','w'))
                    
        

In [26]:
build_user_profile = BuildUserProfile()
build_user_profile.build_user_profile()

In [30]:
class CBRec(object):
    """Content based recommendation system"""
    
    def __init__(self,user_profile='../../data/ch5/user_profile.json',movie_features='../../data/ch5/item_features.json'):
        """
        """
        self.user_profile = json.load(open(user_profile,'r'))
        self.movie_features = json.load(open(movie_features,'r'))
        
    def get_none_rec_movie(self,user):
        """获取没有给用户user推荐过的电影
        """
        ratings = pd.read_csv('../../data/ch5/ml-1m/rating.csv')
        have_recommended = ratings[ratings['UserID']==user]['MovieID'].values
        all_movies = set(self.movie_features.keys())
        non_recommended = set(all_movies) - set(have_recommended)
        return non_recommended
    
    def user_movie_similarity(self,user,movie):
        """计算用户对电影的偏好程度，用余弦相似性来表示偏好程度
        @params:
            user:
            movie:
        @return
            cos_sim
        """
        user = str(user)
        movie = str(movie)
        uia = sum(np.array(self.movie_features[movie])*np.array(self.user_profile[user]))
        ia = np.sqrt(np.sum(np.array(self.movie_features[movie])**2))
        ua = np.sqrt(np.sum(np.array(self.user_profile[user])**2))
        cos_sim = uia/(ua*ia)
        return cos_sim
    
    def recommend(self,user):
        movies = self.get_none_rec_movie(user)
        result = {}
        for movie in movies:
            sim = self.user_movie_similarity(user,movie)
            result[movie] = sim
        return sorted(result.items(),key=lambda x:x[1],reverse=True)[:10]
    

In [32]:
recommendation = CBRec()
print(recommendation.recommend(2))

[('599', 0.45997147302634817), ('3311', 0.45997147302634817), ('2921', 0.45997147302634817), ('416', 0.45997147302634817), ('3487', 0.45997147302634817), ('964', 0.45997147302634817), ('3373', 0.45997147302634817), ('553', 0.45997147302634817), ('210', 0.45997147302634817), ('3074', 0.45997147302634817)]
