In [1]:
import csv
from functools import reduce
import math
import json

In [2]:
class User:
    """A single user of the recommender and everything known about them.

    All state is created per instance in ``__init__``. Declaring the
    dictionaries at class level would make every ``User`` share the very same
    dict objects until each attribute happened to be reassigned.
    """

    def __init__(self):
        self.name = ''               # user identifier (first column of the CSV files)
        self.movies_rates = {}       # movie name -> integer rate
        self.movies_by_places = {}   # movie name -> place value from the context file
        self.movies_by_days = {}     # movie name -> day value from the context file
        self.avg_rate = 0            # mean rate, filled in by calculate_and_set_avg
        self.sim_users = {}          # other User -> similarity score

In [3]:
def parse_movie_names(row):
    """Return the non-empty cells of a header row, stripped of whitespace.

    Cells that are empty after stripping are dropped; the order of the
    remaining names is preserved.
    """
    names = []
    for cell in row:
        cleaned = cell.strip()
        if cleaned != '':
            names.append(cleaned)
    return names

In [4]:
def load_data(filename):
    """Read a user-by-movie CSV table into a nested dictionary.

    The first row is treated as the header and is turned into the list of
    movie names by ``parse_movie_names``. In every following row the first
    cell is the user key and the remaining cells are stripped and paired
    positionally with the movie names.

    Args:
        filename: path of the CSV file to read.

    Returns:
        dict mapping user key -> {movie name: stripped cell text}. An empty
        file yields an empty dict.
    """
    # ``with`` guarantees the handle is closed; newline='' is what the csv
    # module expects so that it can handle line endings itself.
    with open(filename, newline='') as data_file:
        reader = csv.reader(data_file)

        header = next(reader, None)
        if header is None:  # completely empty file
            return {}
        movie_names = parse_movie_names(header)

        users_data = {}
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            values = (value.strip() for value in row[1:])
            users_data[row[0]] = dict(zip(movie_names, values))

    return users_data

In [5]:
def load_users(rates_file='./data.csv',
               days_file='./context_day.csv',
               places_file='./context_place.csv'):
    """Build ``User`` objects from the rates table and the two context tables.

    Args:
        rates_file: CSV with the rates; every cell is converted with ``int``.
        days_file: CSV with the per-movie day value for each user.
        places_file: CSV with the per-movie place value for each user.

    Returns:
        dict mapping user name -> ``User`` with ``name``, ``movies_rates``,
        ``movies_by_days`` and ``movies_by_places`` filled in.

    Raises:
        ValueError: if a cell of the rates table is not an integer.
        KeyError: if a user from the rates table is missing from either
            context table.
    """
    # Build fresh dictionaries of ints rather than rewriting the loaded
    # dictionaries while iterating over them.
    users_rates = {
        user_name: {movie_name: int(rate) for movie_name, rate in movie_rate.items()}
        for user_name, movie_rate in load_data(rates_file).items()
    }
    users_days = load_data(days_file)
    users_places = load_data(places_file)

    users = {}
    for user_name, rates in users_rates.items():
        user = User()
        user.name = user_name
        user.movies_rates = rates
        user.movies_by_days = users_days[user_name]
        user.movies_by_places = users_places[user_name]
        users[user_name] = user

    return users

In [6]:
def calculate_avg_rate(user):
    """Return the mean of the user's rates, ignoring ``-1`` entries.

    Args:
        user: object whose ``movies_rates`` maps movie name -> rate.

    Returns:
        The arithmetic mean of all rates different from ``-1``, or ``0`` when
        there is no such rate (instead of dividing by zero).
    """
    valid_rates = [rate for rate in user.movies_rates.values() if rate != -1]
    if not valid_rates:
        return 0
    return sum(valid_rates) / len(valid_rates)

In [7]:
def calculate_and_set_avg(users):
    """Store the average rate of every user in its ``avg_rate`` attribute.

    ``users`` is a mapping whose values are the user objects; each one is
    passed to ``calculate_avg_rate`` and updated in place.
    """
    for key in users:
        profile = users[key]
        profile.avg_rate = calculate_avg_rate(profile)

In [8]:
def compute_and_set_metrics(users):
    """Fill ``sim_users`` of every user with a similarity to every other user.

    For each ordered pair of users with different names, only movies that
    both rated (rate != -1) are considered. The score is the sum of the rate
    products divided by the product of the two root-sum-of-squares; when that
    product is zero the score is 0. Each user's own entry is stored as 0.
    """
    everyone = list(users.values())

    for owner in everyone:
        similarities = {owner: 0}
        owner.sim_users = similarities

        for other in everyone:
            if other.name == owner.name:
                continue

            dot_product = 0
            owner_squares = 0
            other_squares = 0
            for title, owner_rate in owner.movies_rates.items():
                if owner_rate == -1:
                    continue
                # Looked up only after the owner's rate proved valid, so a
                # missing title on the other side is touched no earlier than
                # strictly necessary.
                other_rate = other.movies_rates[title]
                if other_rate == -1:
                    continue
                dot_product += owner_rate * other_rate
                owner_squares += math.pow(owner_rate, 2)
                other_squares += math.pow(other_rate, 2)

            denominator = math.sqrt(owner_squares) * math.sqrt(other_squares)
            if denominator != 0:
                similarities[other] = dot_product / denominator
            else:
                similarities[other] = 0

In [9]:
def compute_missed_rates(user, k):
    """Predict rates for the movies ``user`` has not rated yet.

    The ``k`` entries of ``user.sim_users`` with the highest similarity are
    used as neighbours. For each movie whose rate is ``-1`` the prediction is

        user.avg_rate + sum(sim * (rate - neighbour.avg_rate)) / sum(sim)

    taken over the neighbours that did rate the movie.

    Args:
        user: object with ``movies_rates``, ``sim_users`` and ``avg_rate``.
        k: how many of the most similar users to consult.

    Returns:
        dict mapping movie name -> predicted rate rounded to 3 decimals. When
        no neighbour rated the movie, or their similarities sum to zero, the
        prediction falls back to ``user.avg_rate`` instead of dividing by zero.
    """
    # sorted() is stable, so users with equal similarity keep their
    # insertion order.
    neighbours = sorted(user.sim_users, key=user.sim_users.get, reverse=True)[:k]

    result = {}
    for movie_name, movie_rate in user.movies_rates.items():
        if movie_rate != -1:
            continue

        weighted_deviation = 0
        similarity_total = 0
        for other_user in neighbours:
            other_rate = other_user.movies_rates[movie_name]
            if other_rate != -1:
                similarity = user.sim_users[other_user]
                weighted_deviation += similarity * (other_rate - other_user.avg_rate)
                similarity_total += similarity

        if similarity_total != 0:
            prediction = user.avg_rate + weighted_deviation / similarity_total
        else:
            # Nothing to learn from the neighbours for this movie.
            prediction = user.avg_rate
        result[movie_name] = round(prediction, 3)
    return result

In [10]:
def calculate_movies_popularity(users, places, days):
    """Count, per movie, how often it matches the given places and days.

    For every user and every movie in that user's ``movies_rates`` the score
    of the movie grows by one if the user's place value for it is in
    ``places`` and by one more if the user's day value for it is in ``days``.

    Returns a dict mapping movie name -> accumulated score.
    """
    popularity = {}
    for viewer in users.values():
        for title in viewer.movies_rates:
            score = popularity.setdefault(title, 0)
            if viewer.movies_by_places[title] in places:
                score += 1
            if viewer.movies_by_days[title] in days:
                score += 1
            popularity[title] = score
    return popularity

In [11]:
def recommend_movie(users, target_user, places=('h',), days=('Sun', 'Sat'),
                    k=4, min_rate=2.0):
    """Pick the most popular unrated movie with a good enough predicted rate.

    Movies are ranked by ``calculate_movies_popularity`` for the given
    ``places`` and ``days``; the first one for which ``compute_missed_rates``
    produced a prediction strictly above ``min_rate`` is returned.

    Args:
        users: mapping of user name -> user object.
        target_user: the user to recommend for.
        places: place values that count towards popularity.
        days: day values that count towards popularity.
        k: number of most similar users used for the predictions.
        min_rate: a movie is recommended only if its predicted rate is
            strictly greater than this value.

    Returns:
        The recommended movie name, or ``None`` when no movie qualifies.
    """
    movies_popularity = calculate_movies_popularity(users, places, days)
    predicted_rates = compute_missed_rates(target_user, k)

    # Most popular first; ties keep the order in which movies were counted.
    ranked_movies = sorted(movies_popularity, key=movies_popularity.get, reverse=True)

    for movie_name in ranked_movies:
        predicted = predicted_rates.get(movie_name)
        if predicted is not None and predicted > min_rate:
            return movie_name
    return None

In [20]:
def print_to_file(recommended_movie, missed_rates, target_user, filename='result.json'):
    """Write the recommendation and the predicted rates to a JSON file.

    Args:
        recommended_movie: name of the recommended movie (or ``None``).
        missed_rates: mapping of movie name -> predicted rate.
        target_user: object whose ``name`` is stored as ``user_name``.
        filename: path of the file to (over)write; defaults to
            ``'result.json'``.

    The file contains an object with the keys ``user_name``, ``predictions``
    (a list of ``{"movie": ..., "rate": ...}`` objects) and
    ``recommended_movie``, indented by two spaces.
    """
    predictions = [
        {'movie': movie_name, 'rate': movie_rate}
        for movie_name, movie_rate in missed_rates.items()
    ]

    result = {
        'user_name': target_user.name,
        'predictions': predictions,
        'recommended_movie': recommended_movie
    }
    with open(filename, 'w') as outfile:
        json.dump(result, outfile, indent=2)

In [13]:
def print_to_console(recommended_movie, missed_rates):
    """Print the recommended movie, then one "<movie> <rate>" line per prediction."""
    print('Рекомендован ', recommended_movie)
    for title in missed_rates:
        print(title, missed_rates[title])

In [21]:
# Whom to recommend for, and how many nearest neighbours to use for the
# predictions that get reported.
TARGET_USER_NAME = 'User 15'
NEIGHBOURS_COUNT = 4

# Load every user and pick the one we are interested in.
users = load_users()
target_user = users[TARGET_USER_NAME]

# Per-user averages and pairwise similarities must exist before predicting.
calculate_and_set_avg(users)
compute_and_set_metrics(users)

# Choose a movie, then compute the predicted rates that are shown and saved.
recommended_movie = recommend_movie(users, target_user)
missed_rates = compute_missed_rates(target_user, NEIGHBOURS_COUNT)

# Report to the notebook output and to the JSON file.
print_to_console(recommended_movie, missed_rates)
print_to_file(recommended_movie, missed_rates, target_user)

Рекомендован  Movie 27
Movie 6 2.898
Movie 11 3.308
Movie 12 3.308
Movie 20 2.299
Movie 27 2.918
Movie 28 2.284
