## 5.4 類似度の計算

In [3]:
import json
import numpy as np

def euclidean_score(dataset, user1, user2):
    """Return a Euclidean-distance-based similarity score for two users.

    Only items present in both users' rating mappings are compared.

    Args:
        dataset: Mapping of user -> mapping of item -> numeric rating.
        user1: Key of the first user in ``dataset``.
        user2: Key of the second user in ``dataset``.

    Returns:
        0 if the users share no rated items; otherwise
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance between the
        two users' ratings over the shared items.

    Raises:
        TypeError: If ``user1`` or ``user2`` is not a key of ``dataset``.
    """
    # Validate both users up front, in argument order.
    for user in (user1, user2):
        if user not in dataset:
            raise TypeError('Cannot find ' + user + ' in the dataset')

    first_ratings = dataset[user1]
    second_ratings = dataset[user2]

    # Items rated by both users, in the first user's iteration order.
    shared_titles = [title for title in first_ratings if title in second_ratings]
    if not shared_titles:
        return 0

    squared_gaps = [
        np.square(first_ratings[title] - second_ratings[title])
        for title in shared_titles
    ]
    distance = np.sqrt(np.sum(squared_gaps))
    return 1 / (1 + distance)

In [6]:
def pearson_score(dataset, user1, user2):
    """Return the Pearson correlation score between two users' ratings.

    Only items present in both users' rating mappings are considered.

    Args:
        dataset: Mapping of user -> mapping of item -> numeric rating.
        user1: Key of the first user in ``dataset``.
        user2: Key of the second user in ``dataset``.

    Returns:
        0 if the users share no rated items, or if the correlation is
        undefined because the variance term ``Sxx * Syy`` is not strictly
        positive (e.g. one user gave the same rating to every shared item).
        Otherwise ``Sxy / sqrt(Sxx * Syy)``.

    Raises:
        TypeError: If ``user1`` or ``user2`` is not a key of ``dataset``.
    """
    if user1 not in dataset:
        raise TypeError('Cannot find ' + user1 + ' in the dataset')
    if user2 not in dataset:
        raise TypeError('Cannot find ' + user2 + ' in the dataset')

    ratings1 = dataset[user1]
    ratings2 = dataset[user2]

    # Items rated by both users, in the first user's iteration order.
    common_movies = [item for item in ratings1 if item in ratings2]
    num_rating = len(common_movies)

    if num_rating == 0:
        return 0

    # Extract the paired rating vectors once instead of re-indexing the
    # nested mappings for every aggregate below.
    x = [ratings1[item] for item in common_movies]
    y = [ratings2[item] for item in common_movies]

    user1_sum = np.sum(x)
    user2_sum = np.sum(y)

    user1_squared_sum = np.sum([np.square(value) for value in x])
    user2_squared_sum = np.sum([np.square(value) for value in y])

    sum_of_products = np.sum([a * b for a, b in zip(x, y)])

    Sxy = sum_of_products - (user1_sum * user2_sum / num_rating)
    Sxx = user1_squared_sum - np.square(user1_sum) / num_rating
    Syy = user2_squared_sum - np.square(user2_sum) / num_rating

    variance_product = Sxx * Syy

    # The shortcut formulas above can round a true zero variance to a tiny
    # negative number, so an exact `== 0` test is not enough: a negative (or
    # NaN) product would make np.sqrt return NaN. Treat every non-positive
    # product as "correlation undefined".
    if not (variance_product > 0):
        return 0

    return Sxy / np.sqrt(variance_product)

In [10]:
# Load the ratings mapping from the JSON file.
rating_file = './data/ratings.json'

with open(rating_file, 'r') as f:
    data = json.load(f)

# Pair of users to compare.
user1 = 'Julie Hammel'
user2 = 'Chris Duncan'

# Print each label first, then compute and print the matching score.
for label, score_fn in (
    ("Euclidean score:", euclidean_score),
    ("Pearson score:", pearson_score),
):
    print(label)
    print(score_fn(data, user1, user2))

Euclidean score:
0
Pearson score:
0
