In [1]:
import pandas as pd
import numpy as np

下面是A B C D四位用户对one 到seven 七件商品的评分表

In [2]:
# User-item rating table: one row per user (A-D), one column per item.
# np.nan marks an item the user has not rated.
data=pd.DataFrame(
    [[4,      np.nan, 5,      3,      5,      np.nan, np.nan],   # A
     [np.nan, 4,      np.nan, 4,      np.nan, 5,      np.nan],   # B
     [2,      np.nan, 2,      np.nan, 1,      np.nan, np.nan],   # C
     [np.nan, 5,      np.nan, 3,      np.nan, 5,      4]],       # D
    index=list('ABCD'),
    columns=['one','two','three','four','five','six','seven'])

In [3]:
data

Unnamed: 0,one,two,three,four,five,six,seven
A,4.0,,5.0,3.0,5.0,,
B,,4.0,,4.0,,5.0,
C,2.0,,2.0,,1.0,,
D,,5.0,,3.0,,5.0,4.0


## **目标**：
#### 1.找到A最相似的其他用户
#### 2.预测A对two商品的评分，做出是否推荐的判断

余弦相似度：通过计算两个向量的夹角余弦值来评估它们的相似度，即 $\cos\theta=\dfrac{A\cdot B}{\lVert A\rVert\,\lVert B\rVert}$

0度角的余弦值是1，而其他任何角度的余弦值都不大于1；并且其最小值是-1

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
data.loc['A'].values

array([ 4., nan,  5.,  3.,  5., nan, nan])

In [6]:
data.loc['A'].fillna(0)

one      4.0
two      0.0
three    5.0
four     3.0
five     5.0
six      0.0
seven    0.0
Name: A, dtype: float64

In [7]:
data.loc['A'].fillna(0).values

array([4., 0., 5., 3., 5., 0., 0.])

In [8]:
data.loc['A'].fillna(0).values.reshape(1,-1)

array([[4., 0., 5., 3., 5., 0., 0.]])

In [9]:
# Cosine similarity between users A and B, with unrated items filled in as 0.
# Selecting with a list (['A']) keeps a one-row frame, so .values is already 2-D.
sim_AB=cosine_similarity(data.loc[['A']].fillna(0).values,
                         data.loc[['B']].fillna(0).values)

In [10]:
sim_AB

array([[0.18353259]])

In [11]:
# Cosine similarity between users A and C, with unrated items filled in as 0.
# Selecting with a list (['A']) keeps a one-row frame, so .values is already 2-D.
sim_AC=cosine_similarity(data.loc[['A']].fillna(0).values,
                         data.loc[['C']].fillna(0).values)

In [12]:
sim_AC  # A and C rate very differently, yet the similarity comes out high: fillna(0) is unsuitable because it turns "not rated" into a rating of 0

array([[0.88527041]])

去中心化(使均值为0)

In [13]:
# Centre each user's ratings on that user's own mean, so every row averages 0.
# mean() skips NaN, and unrated entries stay NaN after the subtraction.
# Vectorised row-wise subtraction replaces a Python lambda applied once per row.
data_center=data.sub(data.mean(axis=1),axis=0)

In [14]:
data_center

Unnamed: 0,one,two,three,four,five,six,seven
A,-0.25,,0.75,-1.25,0.75,,
B,,-0.333333,,-0.333333,,0.666667,
C,0.333333,,0.333333,,-0.666667,,
D,,0.75,,-1.25,,0.75,-0.25


In [15]:
# Cosine similarity between users A and B on the mean-centred ratings.
# Unrated items are filled with 0 only for the similarity computation.
sim_AB=cosine_similarity(data_center.loc[['A']].fillna(0).values,
                         data_center.loc[['B']].fillna(0).values)

In [16]:
sim_AB

array([[0.30772873]])

In [17]:
# Cosine similarity between users A and C on the mean-centred ratings.
# Unrated items are filled with 0 only for the similarity computation.
sim_AC=cosine_similarity(data_center.loc[['A']].fillna(0).values,
                         data_center.loc[['C']].fillna(0).values)

In [18]:
sim_AC

array([[-0.24618298]])

In [19]:
# Cosine similarity between users A and D on the mean-centred ratings.
# Unrated items are filled with 0 only for the similarity computation.
sim_AD=cosine_similarity(data_center.loc[['A']].fillna(0).values,
                         data_center.loc[['D']].fillna(0).values)

In [20]:
sim_AD

array([[0.56818182]])

In [21]:
data

Unnamed: 0,one,two,three,four,five,six,seven
A,4.0,,5.0,3.0,5.0,,
B,,4.0,,4.0,,5.0,
C,2.0,,2.0,,1.0,,
D,,5.0,,3.0,,5.0,4.0


根据 B D 预测A对two的评分

In [23]:
# Predict A's rating of 'two' as the similarity-weighted average of the ratings
# given by B and D, the two users who rated that item.
weighted_votes=sim_AB*data.loc['B','two']+sim_AD*data.loc['D','two']
A_two=weighted_votes/(sim_AB+sim_AD)

In [24]:
A_two

array([[4.64867562]])

**电影评分数据集读取与透视表**

In [25]:
# Column names for users.dat; header=None below tells pandas the file has no header row.
unames=['user_id','gender','age','occupation','zip']
# '::' is a multi-character separator, so the python parsing engine is requested.
# NOTE(review): absolute, machine-specific path; this cell only runs where that folder exists.
users=pd.read_csv(
    r'E:\3cschool\python_shuju_0\movielens\users.dat',
    names=unames,
    header=None,
    sep='::',
    engine='python',
)

In [26]:
users.head(3)

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117


In [27]:
# Column names for ratings.dat; header=None below tells pandas the file has no header row.
rnames=['user_id','movie_id','rating','timestamp']
# With an explicit separator, read_csv parses the file the same way read_table does.
# NOTE(review): absolute, machine-specific path; this cell only runs where that folder exists.
ratings=pd.read_csv(
    r'E:\3cschool\python_shuju_0\movielens\ratings.dat',
    names=rnames,
    header=None,
    sep='::',
    engine='python',
)

In [28]:
ratings.head(3)  #rating :评分

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [29]:
# Column names for movies.dat; header=None below tells pandas the file has no header row.
mnames=['movie_id','title','genres']
# read_csv with an explicit separator, consistent with how users.dat is loaded.
# encoding is pinned so the read does not depend on the platform default.
# NOTE(review): assumes the standard MovieLens 1M movies.dat, whose titles contain
# ISO-8859-1 accented characters and are not valid UTF-8; confirm against your copy.
# NOTE(review): absolute, machine-specific path; this cell only runs where that folder exists.
movies=pd.read_csv(
    r'E:\3cschool\python_shuju_0\movielens\movies.dat',
    names=mnames,
    header=None,
    sep='::',
    engine='python',
    encoding='latin-1',
)

In [30]:
movies.head(3) #title:电影名

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [31]:
#可用merge 横向合并三个数据集

透视表

In [32]:
# User x movie rating matrix: rows are user_id, columns are movie_id, cells hold the rating.
# NOTE(review): this rebinds `data`, replacing the small A-D example table defined earlier.
data=ratings.pivot_table(values='rating',index='user_id',columns='movie_id')

In [33]:
data.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,


In [34]:
data.loc[5,6] #索引取值

2.0

为user_id=1的用户推荐电影

In [35]:
#思路：找出相似度高的用户，再推荐高分的电影

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6040 entries, 1 to 6040
Columns: 3706 entries, 1 to 3952
dtypes: float64(3706)
memory usage: 171.0 MB


In [37]:
#去中心化

In [38]:
# Centre each user's ratings on that user's own mean, so every row averages 0.
# mean() skips NaN, and unrated movies stay NaN after the subtraction.
# One vectorised subtraction replaces a Python lambda called once per user row.
data_center=data.sub(data.mean(axis=1),axis=0)

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
# Similarity of every user to the first row of data_center, in one vectorised call.
# The NaN->0 fill now runs once for the whole matrix rather than twice per iteration.
# This holds one extra dense copy of the matrix in memory while it runs.
centered_filled=data_center.fillna(0).values
sim_to_first=cosine_similarity(centered_filled[:1],centered_filled)   # shape (1, n_users)
# Keep one (1, 1) array per user, the same element layout the per-row loop produced.
sim_cos=[sim_to_first[:,[i]] for i in range(sim_to_first.shape[1])]

In [41]:
# Unwrap each per-user similarity to a plain scalar.
# np.ravel(...)[0] accepts both the (1, 1) arrays from cosine_similarity and scalars
# that were already unwrapped, so re-running this cell no longer raises.
sim_cos=[np.ravel(x)[0] for x in sim_cos]

In [42]:
# Append each user's similarity to the first user row as an extra 'sim' column
# next to the movie ratings; sim_cos must hold one value per row of data.
data=data.assign(sim=sim_cos)

In [43]:
data.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,sim
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,1.0
2,,,,,,,,,,,...,,,,,,,,,,0.032665
3,,,,,,,,,,,...,,,,,,,,,,-0.032267
4,,,,,,,,,,,...,,,,,,,,,,0.016359
5,,,,,,2.0,,,,,...,,,,,,,,,,-0.016774


In [44]:
# Rank users from most to least similar (largest 'sim' first).
data=data.sort_values('sim',ascending=False)

In [45]:
data.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,sim
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,1.0
1337,,,,,,,,,,,...,,,,,,,,,,0.189242
379,,,,,,,,,,,...,,,,,,,,,,0.159893
5404,5.0,,,,,,,,,,...,,,,,,,,,,0.155154
49,5.0,,,,,,,,,,...,,,,,,,,,,0.148455


In [69]:
# Take the five most similar users, excluding the target user (user_id 1) by label.
# This no longer assumes the target sorts to position 0. `data` must already be
# sorted by 'sim' in descending order.
neighbour_ids=data.index[data.index!=1][:5]
data1=data.loc[neighbour_ids].copy()

In [70]:
data1

movie_id,1,2,3,4,5,6,7,8,9,10,...,3944,3945,3946,3947,3948,3949,3950,3951,3952,sim
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1337,,,,,,,,,,,...,,,,,,,,,,0.189242
379,,,,,,,,,,,...,,,,,,,,,,0.159893
5404,5.0,,,,,,,,,,...,,,,,,,,,,0.155154
49,5.0,,,,,,,,,,...,,,,,,,,,,0.148455
2607,,,,,,,,3.0,,,...,,,,,,,,,,0.148105


已找出5个最相似的用户，现在根据这几个用户的评分为user_id=1的用户推荐电影

In [48]:
#Weighted-average template from the toy example (the denominator is the sum of the similarities, not of the ratings): A_two=(sim_AB*data.loc['B','two']+sim_AD*data.loc['D','two'])/(sim_AB+sim_AD)

In [49]:
#相似用户很多电影没有评分，需抛弃这些列

In [71]:
# Drop the columns in which every selected neighbour has NaN (movies none of them rated).
# Rebinding instead of inplace=True leaves the previously displayed frame unmutated.
data1=data1.dropna(axis=1,how='all')

In [72]:
data1

movie_id,1,8,11,13,17,21,25,32,34,36,...,3776,3785,3789,3793,3916,3926,3927,3928,3930,sim
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1337,,,,,,,,,5.0,,...,3.0,,,,5.0,4.0,4.0,4.0,3.0,0.189242
379,,,4.0,,4.0,,,,,,...,,,,,,,,,,0.159893
5404,5.0,,4.0,,,,5.0,2.0,3.0,4.0,...,,,,,,,,,,0.155154
49,5.0,,,4.0,,3.0,,,,,...,,3.0,,3.0,,,,,,0.148455
2607,,3.0,,,,,,,3.0,3.0,...,,,3.0,,,,,,,0.148105


In [52]:
#计算user_id=1用户对其他movie_id=34(第9列) 的评分

In [106]:
# Predict user 1's rating of movie 34 as the similarity-weighted mean of the
# neighbours' ratings, using only the neighbours who actually rated it.
# The movie is addressed by its column label (34) instead of position 8, so the
# result does not depend on which columns survived the dropna step.
has_rating=data1[34].notna()
ratings_=(data1.loc[has_rating,'sim']*data1.loc[has_rating,34]).sum()/data1.loc[has_rating,'sim'].sum()

In [107]:
ratings_

3.7684924105087516

In [108]:
# Denominator check: total similarity of the neighbours that have a rating in the column at position 8.
data1.loc[data1.iloc[:,8].notna(),'sim'].sum()

0.49250065961253164

In [111]:
# Numerator check: sum of similarity x rating over the column at position 8 (NaN products are skipped by sum()).
data1.iloc[:,8].mul(data1['sim']).sum()

1.8559849979203795