In [137]:
import pandas as pd

In [138]:
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")

In [139]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


# Preprocess

In [140]:
df_ratings = df_ratings.groupby(["userId"]).agg({
    'movieId':lambda x : list(x),
    'rating':lambda x : list(x)
}).reset_index()

In [141]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,"[1, 3, 6, 47, 50, 70, 101, 110, 151, 157, 163,...","[4.0, 4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 5.0, ..."
1,2,"[318, 333, 1704, 3578, 6874, 8798, 46970, 4851...","[3.0, 4.0, 4.5, 4.0, 4.0, 3.5, 4.0, 4.0, 4.5, ..."
2,3,"[31, 527, 647, 688, 720, 849, 914, 1093, 1124,...","[0.5, 0.5, 0.5, 0.5, 0.5, 5.0, 0.5, 0.5, 0.5, ..."
3,4,"[21, 32, 45, 47, 52, 58, 106, 125, 126, 162, 1...","[3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 4.0, 5.0, 1.0, ..."
4,5,"[1, 21, 34, 36, 39, 50, 58, 110, 150, 153, 232...","[4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 5.0, 4.0, 3.0, ..."


# Suggest movie for a single user

In [190]:
def get_list_movie(uid, df):
    cur_df = df[df["userId"] == uid]
    movies = list(cur_df["movieId"])[0]
    rating = list(cur_df["rating"])[0]
    return movies, rating


def get_same_movies(movies1, movies2):
    movies1 = set(movies1)
    result = []
    for x in movies2:
        if x in movies1:
            result.append(x)
    return result


def build_dict_rating(movies, ratings):
    result = dict()
    for i, val in enumerate(movies):
        result[val] = ratings[i]
    return result


def calc_mean_rating(ratings):
    return sum(ratings) / len(ratings)


def calc_similarity(u_uid, df):
    u_movies, u_ratings = get_list_movie(u_uid, df)
    u_rating_dict = build_dict_rating(u_movies, u_ratings)
    u_mean_rating = calc_mean_rating(u_ratings)
    
    for row in df.iterrows():
        if row[1]["userId"] != u_uid:
            v_uid = row[1]["userId"]
            v_movies, v_ratings = get_list_movie(v_uid, df)
            v_rating_dict = build_dict_rating(v_movies, v_ratings)
            v_mean_rating = calc_mean_rating(v_ratings)
            
            same_movies = get_same_movies(u_movies, v_movies)
            
            if len(same_movies) <= 0:
                continue
            
            up_value = 0
            for m in same_movies:
                up_value += (u_rating_dict[m] - u_mean_rating) * (v_rating_dict[m] - v_mean_rating)
                
            down_value1 = 0
            for m in same_movies:
                down_value1 += (u_rating_dict[m] - u_mean_rating)**2
            down_value1 = down_value1**0.5
            
            down_value2 = 0
            for m in same_movies:
                down_value2 += (v_rating_dict[m] - v_mean_rating)**2
            down_value2 = down_value2**0.5
            
            if down_value1 * down_value2 == 0:
                continue
            
            similarity_value = up_value / (down_value1 * down_value2)

            yield similarity_value, len(same_movies), row[1]
            
def suggest_new_movie(u_uid, df, top_k=50):
    similarities = list(x for x in calc_similarity(u_uid, df_ratings) if x[0] > 0)
    similarities = sorted(similarities, key=lambda x:x[0], reverse=True)
    
    all_movies = set()
    for val in similarities[0, top_k]:
        all_movies.add(val[2]["movieId"])
    
    return all_movies

In [191]:
top_similarities = suggest_new_movie(1, df_ratings)

TypeError: list indices must be integers or slices, not tuple

In [178]:
for i, val in enumerate(top_similarities):
    if i < 10:
        print(val)
    else:
        break

userId                                                    77
movieId    [260, 1196, 1198, 1210, 2571, 3578, 3948, 3996...
rating     [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 3.0, 5.0, 2.5, ...
Name: 76, dtype: object
userId                                                    12
movieId    [39, 168, 222, 256, 261, 277, 357, 543, 830, 8...
rating     [4.0, 5.0, 5.0, 5.0, 4.5, 3.0, 3.5, 3.5, 4.0, ...
Name: 11, dtype: object
userId                                                    85
movieId    [53, 314, 515, 527, 538, 1140, 1183, 1185, 120...
rating     [5.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, 5.0, 5.0, ...
Name: 84, dtype: object
userId                                                   253
movieId    [29, 446, 608, 714, 916, 926, 955, 965, 1192, ...
rating     [4.5, 5.0, 5.0, 4.5, 2.5, 4.5, 4.5, 3.5, 4.0, ...
Name: 252, dtype: object
userId                                                   291
movieId    [1, 4896, 4993, 5218, 5816, 7153, 8368, 40815,...
rating     [4.0, 5.0, 3.0, 4.0, 5.0, 3.0, 5.0, 5.