# Assignment7

## Task1
计算作业中数据的用户相似矩阵和物品相似矩阵（手算加计算器）  

In [37]:
import pandas as pd
import numpy as np
import math

In [38]:
#### 余弦相似度
def calcuteSimilar(series1,series2):
    '''
    计算余弦相似度
    :param data1: 数据集1 Series
    :param data2: 数据集2 Series
    :return: 相似度
    '''
    unionLen = len(set(series1) & set(series2))
    if unionLen == 0: return 0.0
    product = len(series1) * len(series2)
    similarity = unionLen / math.sqrt(product)
    return similarity

In [39]:
# 用户相似矩阵
user_item = [
    ['a','b','d','f'],
    ['a','c','d','g'],
    ['b','e','h'],
    ['c','d','e','h','g'],
    ['a','e','c']
]

n = len(user_item)
matrix1=np.zeros((n,n))

for i in range(0,n):
    for j in range(0,n):
        if i != j:
            matrix1[i,j]=calcuteSimilar(user_item[i],user_item[j])
matrix1

array([[0.        , 0.5       , 0.28867513, 0.2236068 , 0.28867513],
       [0.5       , 0.        , 0.        , 0.67082039, 0.57735027],
       [0.28867513, 0.        , 0.        , 0.51639778, 0.33333333],
       [0.2236068 , 0.67082039, 0.51639778, 0.        , 0.51639778],
       [0.28867513, 0.57735027, 0.33333333, 0.51639778, 0.        ]])

In [40]:
# 物品相似矩阵
item_user = [
    ['A','B','E'],
    ['A','C'],
    ['B','D','E'],
    ['A','B','D'],
    ['C','D','E'],
    ['A'],
    ['B','D']
]

m = len(item_user)
matrix2=np.zeros((m,m))

for i in range(0,m):
    for j in range(0,m):
        if i != j:
            matrix2[i,j]=calcuteSimilar(item_user[i],item_user[j])
matrix2

array([[0.        , 0.40824829, 0.66666667, 0.66666667, 0.33333333,
        0.57735027, 0.40824829],
       [0.40824829, 0.        , 0.        , 0.40824829, 0.40824829,
        0.70710678, 0.        ],
       [0.66666667, 0.        , 0.        , 0.66666667, 0.66666667,
        0.        , 0.81649658],
       [0.66666667, 0.40824829, 0.66666667, 0.        , 0.33333333,
        0.57735027, 0.81649658],
       [0.33333333, 0.40824829, 0.66666667, 0.33333333, 0.        ,
        0.        , 0.40824829],
       [0.57735027, 0.70710678, 0.        , 0.57735027, 0.        ,
        0.        , 0.        ],
       [0.40824829, 0.        , 0.81649658, 0.81649658, 0.40824829,
        0.        , 0.        ]])

In [41]:
from keras.preprocessing.text import Tokenizer
tok = Tokenizer()
tok.fit_on_texts(user_item)
tok.texts_to_matrix(user_item)

array([[0., 1., 1., 0., 0., 1., 0., 0., 1.],
       [0., 1., 1., 1., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 1., 0., 1., 0.],
       [0., 0., 1., 1., 1., 0., 1., 1., 0.],
       [0., 1., 0., 1., 1., 0., 0., 0., 0.]])

In [42]:
tok.word_index

{'a': 1, 'd': 2, 'c': 3, 'e': 4, 'b': 5, 'g': 6, 'h': 7, 'f': 8}

## Task2
(代码：电影数据）  
1. 使用UserCF给用户1找最相似的十个用户  
2. 使用itemCF找电影101最相似的十部电影  

In [43]:
train =pd.read_csv('./ml-latest-small/ratings.csv')[['userId','movieId']]

In [44]:
# 相似用户
targetuser = train[train['userId'] == 1]['movieId'].astype(str)
simuser_ID = [i for i in set(train['userId']) if i != 1]  
otherUsers = [train[train['userId'] == i]['movieId'].astype(str) for i in simuser_ID]

similarlist = [calcuteSimilar(targetuser,user) for user in otherUsers]
similarSeries = pd.Series(similarlist,index=simuser_ID)
similarSeries.sort_values()[-10:]

577    0.320800
368    0.324379
266    0.327865
45     0.328677
452    0.332593
39     0.334831
469    0.334905
57     0.337032
330    0.338371
313    0.380978
dtype: float64

In [45]:
# 相似电影
targetproduct = train[train['movieId'] == 101]['userId'].astype(str)
simproduct_ID = [i for i in set(train['movieId']) if i != 101]  
otherProducts = [train[train['movieId'] == i]['userId'].astype(str) for i in simproduct_ID]

similarlist = [calcuteSimilar(targetproduct,item) for item in otherProducts]
similarSeries = pd.Series(similarlist,index=simproduct_ID)
similarSeries.sort_values()[-10:]

32584    0.417029
39234    0.417029
2997     0.419130
556      0.425628
1449     0.425628
3181     0.425628
46974    0.425628
2395     0.445823
1235     0.449823
7117     0.466252
dtype: float64

### 在评估用户相似度时考虑打分
用评价过的电影的评分的绝对差值之和乘共同评价过的电影个数的余弦相似度，作为两个用户相似度的评价指标

In [62]:
def calSimilarty(series1,series2):
    unionLen = len(set(series1[['movieId']]) & set(series2[['movieId']]))
    if unionLen == 0: return 0.0
    product = len(series1['movieId']) * len(series2['movieId'])
    similarity = unionLen / math.sqrt(product)
    
    merge_matrix = series1.merge(series2,on='movieId')
    merge_matrix['diff'] = np.abs(merge_matrix['rating_x']-merge_matrix['rating_y'])
    
    return similarity*merge_matrix['diff'].sum()    

In [63]:
# 相似用户
train = pd.read_csv('./ml-latest-small/ratings.csv')[['userId','movieId','rating']]
targetuser = train[train['userId'] == 1][['movieId','rating']]
simuser_ID = [i for i in set(train['userId']) if i != 1]  
otherUsers = [train[train['userId'] == i][['movieId','rating']] for i in simuser_ID]

similarlist = [calSimilarty(targetuser,user) for user in otherUsers]
similarSeries = pd.Series(similarlist,index=simuser_ID)
similarSeries.sort_values()[-10:]

39     0.374223
600    0.392173
19     0.396185
608    0.397421
368    0.406232
68     0.409680
428    0.439697
160    0.456960
217    0.461397
313    0.498476
dtype: float64