In [1]:
import pandas as pd
from UserBasedCF import UserBasedCF

# Preprocess

In [2]:
# Load the ratings and collapse them to one row per user: the "movieId" and
# "rating" columns become parallel lists of everything that user rated.
df_ratings = (
    pd.read_csv("../data/ml-latest-small/test_ratings.csv")
    .groupby(["userId"])
    .agg({
        'movieId': lambda values: list(values),
        'rating': lambda values: list(values),
    })
    .reset_index()
)

In [3]:
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")

# Lookup table: movieId -> '"<title>",<genres>' (a ready-to-print CSV fragment).
movies_info = {
    movie["movieId"]: "\"" + movie["title"] + "\"," + movie["genres"]
    for _, movie in df_movies.iterrows()
}

# Suggest movies for a single user

In [None]:
def get_list_movie(uid, df):
    """Return the (movies, ratings) pair stored for user ``uid``.

    ``df`` must have "userId", "movieId" and "rating" columns; the values of
    the first row whose "userId" equals ``uid`` are returned as-is.

    Raises:
        IndexError: if no row matches ``uid``.
    """
    is_user = df["userId"] == uid
    movie_cells = df.loc[is_user, "movieId"].tolist()
    rating_cells = df.loc[is_user, "rating"].tolist()
    return movie_cells[0], rating_cells[0]


def get_same_movies(movies1, movies2):
    """Return the items of ``movies2`` that also occur in ``movies1``.

    The order (and any duplicates) of ``movies2`` is preserved.
    """
    known = frozenset(movies1)
    return [movie for movie in movies2 if movie in known]


def build_dict_rating(movies, ratings):
    """Map each movie to the rating at the same position in ``ratings``.

    If a movie occurs more than once the last occurrence wins. Ratings beyond
    ``len(movies)`` are ignored; a ``ratings`` sequence shorter than
    ``movies`` raises ``IndexError``.
    """
    return {movie: ratings[pos] for pos, movie in enumerate(movies)}


def calc_mean_rating(ratings):
    """Return the arithmetic mean of ``ratings``.

    Raises:
        ZeroDivisionError: if ``ratings`` is empty.
    """
    total = sum(ratings)
    count = len(ratings)
    return total / count


def calc_std(ratings, mean_rating):
    """Return a strictly positive spread of ``ratings`` around ``mean_rating``.

    Args:
        ratings: sequence of numeric ratings.
        mean_rating: the centre to measure deviations from.

    Returns:
        * two or more ratings: the sample standard deviation (n - 1 divisor);
        * exactly one rating: the absolute deviation of that rating;
        * no ratings, or a spread of zero: the neutral scale ``1``.

    The result is used as a divisor when z-scoring, so it must never be zero
    or negative; a zero spread therefore falls back to ``1`` (the same
    fallback already used for empty input).
    """
    count = len(ratings)
    if count < 1:
        return 1

    if count < 2:
        # A single rating has no sample deviation; use its distance from the
        # mean, made non-negative so it is a valid scale.
        spread = abs(ratings[0] - mean_rating)
    else:
        spread = (sum((r - mean_rating)**2 for r in ratings) / (count - 1))**0.5

    # Guard against a zero divisor downstream (single or all-identical ratings).
    return spread if spread > 0 else 1


def calc_z_score(rating, mean_rating, std_rating):
    """Return how many ``std_rating`` units ``rating`` lies from ``mean_rating``.

    When ``std_rating`` is zero the z-score is undefined; ``0.0`` (no
    deviation) is returned instead of raising ``ZeroDivisionError``.
    """
    if std_rating == 0:
        return 0.0
    return (rating - mean_rating) / std_rating


def calc_similarity(u_uid, df, use_discount_sim=False, discount_sim_threshold=5):
    """Lazily yield the similarity between user ``u_uid`` and every other user.

    The similarity is a Pearson-style correlation: each rating is centred on
    its owner's mean over *all* of that user's ratings, and the products /
    squared deviations are summed over the movies both users rated.

    Args:
        u_uid: id of the target user; must be present in ``df["userId"]``.
        df: frame with one row per user and "userId", "movieId", "rating"
            columns, where the last two hold parallel lists.
        use_discount_sim: if true, scale the similarity by
            ``min(n_common, discount_sim_threshold) / discount_sim_threshold``
            so that pairs with few co-rated movies count less.
        discount_sim_threshold: number of co-rated movies at which the
            discount stops applying.

    Yields:
        ``(similarity, n_common, row)`` for each other user that shares at
        least one movie with the target and has a non-zero denominator;
        ``row`` is that user's row of ``df`` as yielded by ``iterrows``.
    """
    u_movies, u_ratings = get_list_movie(u_uid, df)
    u_rating_dict = build_dict_rating(u_movies, u_ratings)
    u_mean_rating = calc_mean_rating(u_ratings)

    for _, v_row in df.iterrows():
        if v_row["userId"] == u_uid:
            continue

        # The neighbour's lists are already on the row being iterated, so
        # read them directly instead of filtering the whole frame again.
        v_movies, v_ratings = v_row["movieId"], v_row["rating"]
        v_rating_dict = build_dict_rating(v_movies, v_ratings)
        v_mean_rating = calc_mean_rating(v_ratings)

        same_movies = get_same_movies(u_movies, v_movies)
        if len(same_movies) <= 0:
            continue

        # Accumulate numerator and both squared norms in one pass over the
        # co-rated movies.
        up_value = 0
        u_sq_sum = 0
        v_sq_sum = 0
        for m in same_movies:
            u_dev = u_rating_dict[m] - u_mean_rating
            v_dev = v_rating_dict[m] - v_mean_rating
            up_value += u_dev * v_dev
            u_sq_sum += u_dev**2
            v_sq_sum += v_dev**2

        denominator = (u_sq_sum**0.5) * (v_sq_sum**0.5)
        if denominator == 0:
            # One of the users has no variation on the shared movies, so the
            # correlation is undefined; skip the pair.
            continue

        similarity_value = up_value / denominator

        if use_discount_sim:
            similarity_value *= min(len(same_movies), discount_sim_threshold) / discount_sim_threshold

        yield similarity_value, len(same_movies), v_row
            
            
def suggest_new_movie(u_uid, df, top_k=50, 
                      use_discount_sim=False, 
                      discount_sim_threshold=5,
                      use_z_score=False
                     ):
    """Predict ratings for user ``u_uid`` from the ``top_k`` most similar users.

    Args:
        u_uid: id of the target user; must be present in ``df["userId"]``.
        df: frame with one row per user and "userId", "movieId", "rating"
            columns, where the last two hold parallel lists. Both the
            similarities and the predictions are computed from this frame.
        top_k: number of positively similar neighbours to use.
        use_discount_sim, discount_sim_threshold: forwarded to
            ``calc_similarity``.
        use_z_score: if false, neighbours contribute their mean-centred
            rating; if true, they contribute a z-score which is rescaled by
            the target user's own spread.

    Returns:
        A list of ``(movie, predicted_score)`` tuples sorted by score,
        highest first, covering every movie rated by a selected neighbour.

    NOTE(review): movies the target user has already rated are not filtered
    out of the result -- confirm whether that is intended.
    """
    # Use the frame passed in, not the notebook-level ``df_ratings`` global.
    similarities = [
        x for x in calc_similarity(u_uid, df, use_discount_sim, discount_sim_threshold)
        if x[0] > 0
    ]
    similarities.sort(key=lambda x: x[0], reverse=True)
    neighbours = similarities[0:top_k]

    u_movies, u_ratings = get_list_movie(u_uid, df)
    u_mean_rating = calc_mean_rating(u_ratings)
    u_std = calc_std(u_ratings, u_mean_rating)

    # Build each neighbour's statistics once, rather than once per candidate
    # movie, and collect the candidate movies in first-seen order.
    all_movies = dict()
    profiles = []
    for sim_u_v, _, v_row in neighbours:
        v_movies, v_ratings = get_list_movie(v_row["userId"], df)
        v_mean_rating = calc_mean_rating(v_ratings)
        v_std = calc_std(v_ratings, v_mean_rating) if use_z_score else None
        profiles.append((
            sim_u_v,
            set(v_row["movieId"]),
            build_dict_rating(v_movies, v_ratings),
            v_mean_rating,
            v_std,
        ))
        for movie in v_row["movieId"]:
            all_movies[movie] = 0

    for movie in all_movies:
        top_value = 0
        down_value = 0

        for sim_u_v, rated, v_rating_dict, v_mean_rating, v_std in profiles:
            if movie not in rated:
                continue
            if use_z_score:
                deviation = calc_z_score(v_rating_dict[movie], v_mean_rating, v_std)
            else:
                deviation = v_rating_dict[movie] - v_mean_rating
            top_value += sim_u_v * deviation
            down_value += sim_u_v

        # Movies without positive total similarity keep the initial score 0.
        if down_value > 0:
            if use_z_score:
                all_movies[movie] = u_mean_rating + u_std * top_value / down_value
            else:
                all_movies[movie] = u_mean_rating + top_value / down_value

    return sorted(all_movies.items(), key=lambda item: item[1], reverse=True)

In [None]:
def show_result(uid, suggest_movies, 
                top_k = 10, 
                df_ratings=df_ratings, 
                df_movies=df_movies,
                file_path="../data/show_result.csv"
               ):
    """Print and save a user's watched movies and the top suggestions.

    Args:
        uid: id of the user; must be present in ``df_ratings["userId"]``.
        suggest_movies: sequence of ``(movie, score)`` pairs, best first.
        top_k: number of suggestions to show.
        df_ratings: per-user ratings frame (defaults to the notebook-level
            frame as it was when this function was defined).
        df_movies: accepted for interface compatibility; not used here.
        file_path: output file, overwritten on every call.

    Each line written is ``<movieId>,<movies_info entry>,<rating or score>``.
    Movie descriptions come from the notebook-level ``movies_info`` dict, so
    every movie id involved must be a key of it (``KeyError`` otherwise).
    """
    # The context manager guarantees the file is flushed and closed, even if
    # a lookup below fails.
    with open(file_path, "w") as f:
        movies, ratings = get_list_movie(uid, df_ratings)
        print("Watched movies: ")
        for m, r in zip(movies, ratings):
            print(movies_info[m], "rate: ", r)
            f.write(str(m) + "," + movies_info[m] + "," + str(r) + "\n")

        print("Suggest movies: ")
        for m, s in suggest_movies[:top_k]:
            print(movies_info[m], "scores: ", s)
            f.write(str(m) + "," + movies_info[m] + "," + str(s) + "\n")

In [None]:
# Users positively correlated with user 3, most similar first.
similarities = [x for x in calc_similarity(3, df_ratings) if x[0] > 0]
similarities.sort(key=lambda x: x[0], reverse=True)

In [None]:
# Dump every (similarity, number of co-rated movies, user row) tuple.
for val in similarities:
    print(val)

In [4]:
# Build the recommender imported from the UserBasedCF module, giving it the
# per-user aggregated ratings and the movies table.
# NOTE(review): presumably this class packages the same logic as the
# notebook-level functions above -- confirm against UserBasedCF.py.
userbased_cf = UserBasedCF(df_ratings, df_movies)

In [5]:
# Users positively correlated with user 3, most similar first.
similarities = [x for x in userbased_cf.calc_similarity(3) if x[0] > 0]
similarities.sort(key=lambda x: x[0], reverse=True)

In [6]:
# Dump every similarity entry; judging by the recorded output each one is a
# (similarity, number of co-rated movies, user row) tuple.
for val in similarities:
    print(val)

(0.9384742644069303, 3, userId                             2
movieId              [1, 2, 4, 5, 6]
rating     [6.0, 7.0, 4.0, 3.0, 4.0]
Name: 1, dtype: object)
(0.8944271909999159, 4, userId                                  1
movieId                [1, 2, 3, 4, 5, 6]
rating     [7.0, 6.0, 7.0, 4.0, 5.0, 4.0]
Name: 0, dtype: object)
