In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn.model_selection

In [2]:
# Load the raw ratings table; the file's own header row is replaced by
# the snake_case column names given here.
ml = pd.read_csv(
    "ml-20m/ratings.csv",
    header=0,
    names=("user_id", "movie_id", "rating", "time"),
    dtype={
        "user_id": np.int32,
        "movie_id": np.int32,
        "rating": np.float32,
        "time": np.int64,
    },
)

# The "time" column is read as integer seconds; convert it to datetimes.
ml["time"] = pd.to_datetime(ml["time"], unit="s")

In [3]:
# One past the largest user / movie ID, so that IDs can be used directly
# as row / column indices of a matrix with this shape.
max_user, max_movie = (
    int(ml[id_column].max() + 1) for id_column in ("user_id", "movie_id")
)

In [4]:
# Movie titles indexed by movie_id (only the first two CSV columns are read).
# The MovieLens 20M files are UTF-8 encoded; decoding them as ISO-8859-1
# never raises but silently garbles accented characters in titles.
names = pd.read_csv(
    "ml-20m/movies.csv",
    header=0,
    encoding="utf-8",
    index_col=0,
    names=("movie_id", "movie_title"),
    usecols=[0, 1],
)

In [5]:
# Per-movie rating count and mean rating, joined onto the title table.
movie_group = ml.groupby("movie_id")
movie_stats = names.join(
    movie_group.size().rename("num_ratings")
).join(
    # Select "rating" before aggregating: averaging the whole frame would
    # also average user_id and the datetime column only to discard them.
    movie_group["rating"].mean().rename("avg_rating")
)

In [6]:
# Hold out 25% of the ratings as a test split; the fixed random_state
# makes the split repeatable.
ml_train, ml_test = sklearn.model_selection.train_test_split(
    ml,
    test_size=0.25,
    random_state=123456789,
)

In [7]:
def df2mat(df, shape=None):
    """Convert a ratings DataFrame into a sparse user x movie matrix.

    Parameters
    ----------
    df : DataFrame
        Must provide "user_id", "movie_id" and "rating" columns; the IDs
        are used directly as row / column indices.
    shape : tuple of (int, int), optional
        Shape of the resulting matrix. Defaults to the module-level
        ``(max_user, max_movie)`` when omitted.

    Returns
    -------
    (matrix, mask)
        ``matrix`` is a float32 CSC matrix of ratings; ``mask`` is the
        sparse boolean matrix ``matrix > 0`` marking the positive entries.
    """
    if shape is None:
        # Fall back to the notebook-wide dimensions.
        shape = (max_user, max_movie)
    # Hand scipy plain ndarrays rather than pandas Series.
    values = df["rating"].to_numpy()
    rows = df["user_id"].to_numpy()
    cols = df["movie_id"].to_numpy()
    m = scipy.sparse.coo_matrix((values, (rows, cols)),
                                shape=shape,
                                dtype=np.float32).tocsc()
    return m, m > 0

# Build the sparse rating matrix and its mask for each split.
(ml_mat_train, ml_mask_train), (ml_mat_test, ml_mask_test) = (
    df2mat(ml_train),
    df2mat(ml_test),
)

In [8]:
# User whose ratings are inspected in the cells below.
target_user = 28812
# Attach titles to every training rating made by that user.
names.merge(
    ml_train.loc[ml_train["user_id"] == target_user],
    left_index=True,
    right_on="movie_id",
)

Unnamed: 0,movie_title,user_id,movie_id,rating,time
4229885,Heat (1995),28812,6,4.0,1996-09-23 02:11:00
4229886,GoldenEye (1995),28812,10,5.0,1996-09-23 02:03:57
4229887,Ace Ventura: When Nature Calls (1995),28812,19,4.0,1996-09-23 02:05:59
4229889,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),28812,32,5.0,1996-09-23 02:06:49
4229891,Clueless (1995),28812,39,4.0,1996-09-23 02:06:22
4229892,"Usual Suspects, The (1995)",28812,50,3.0,1996-09-23 02:05:59
4229893,Braveheart (1995),28812,110,5.0,1996-09-23 02:03:57
4229894,"Birdcage, The (1996)",28812,141,5.0,1996-09-23 02:08:39
4229895,Apollo 13 (1995),28812,150,5.0,1996-09-23 02:11:00
4229896,Rob Roy (1995),28812,151,4.0,1996-09-23 02:11:00


In [9]:
# movie_id whose training ratings are paired, user by user, with the
# ratings of the movies target_user has rated in the cells below.
target_movie = 317

In [10]:
# For every movie the target user rated, collect all training ratings of
# that movie, then keep only the users who also rated target_movie.
# Resulting columns: movie_id_x / rating_x (the co-rated movie),
# user_id, and rating_y (that user's rating of target_movie).
users_df = (
    ml_train.loc[ml_train["user_id"] == target_user, ["movie_id"]]
    .merge(ml_train, on="movie_id")
    .loc[:, ["movie_id", "user_id", "rating"]]
    .merge(ml_train.loc[ml_train["movie_id"] == target_movie], on="user_id")
    .drop(columns=["movie_id_y", "time"])
)

In [11]:
# Per-row deviation: rating_y minus rating_x.
users_df = users_df.assign(
    rating_dev=lambda pairs: pairs["rating_y"] - pairs["rating_x"]
)

In [12]:
# Average deviation per co-rated movie. Select the column before
# aggregating so only "rating_dev" is averaged, not every column.
rating_dev = users_df.groupby("movie_id_x")["rating_dev"].mean()
names.join(rating_dev, how="inner").sort_values("rating_dev")

Unnamed: 0,movie_title,rating_dev
527,Schindler's List (1993),-1.389430
50,"Usual Suspects, The (1995)",-1.348821
110,Braveheart (1995),-1.077912
1036,Die Hard (1988),-1.013024
356,Forrest Gump (1994),-0.986838
457,"Fugitive, The (1993)",-0.948168
296,Pulp Fiction (1994),-0.930971
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),-0.892933
589,Terminator 2: Judgment Day (1991),-0.860920
6,Heat (1995),-0.853178


In [13]:
# Target user's training ratings, each shifted by the average deviation
# of the movie that was rated (rating_adj = rating + rating_dev).
df = (
    ml_train.loc[ml_train["user_id"] == target_user]
    .join(rating_dev, on="movie_id")
    .assign(rating_adj=lambda rated: rated["rating"] + rated["rating_dev"])
    .loc[:, ["user_id", "movie_id", "rating", "rating_adj"]]
)
df.join(names, on="movie_id").sort_values("movie_title")

Unnamed: 0,user_id,movie_id,rating,rating_adj,movie_title
4229920,28812,344,3.0,3.134933,Ace Ventura: Pet Detective (1994)
4229887,28812,19,4.0,4.481400,Ace Ventura: When Nature Calls (1995)
4229948,28812,588,4.0,3.427220,Aladdin (1992)
4229895,28812,150,5.0,4.176845,Apollo 13 (1995)
4229951,28812,592,4.0,3.694755,Batman (1989)
4229897,28812,153,4.0,4.119300,Batman Forever (1995)
4229953,28812,595,4.0,3.443493,Beauty and the Beast (1991)
4229931,28812,420,5.0,5.348524,Beverly Hills Cop III (1994)
4229894,28812,141,5.0,4.478425,"Birdcage, The (1996)"
4229893,28812,110,5.0,3.922088,Braveheart (1995)


In [14]:
# Unweighted estimate: plain average of the adjusted ratings
# (mean() skips any NaN entries).
df.loc[:, "rating_adj"].mean()

4.080007076263428

In [17]:
# Number of user pairs behind each movie's average deviation. Select the
# column before counting so only "rating_dev" is counted, not every column.
num_ratings = users_df.groupby("movie_id_x")["rating_dev"].count().rename("num_ratings")
names.join(num_ratings, how="inner").sort_values("num_ratings")

Unnamed: 0,movie_title,num_ratings
786,Eraser (1996),2007
151,Rob Roy (1995),2304
6,Heat (1995),2360
315,"Specialist, The (1994)",2425
282,Nell (1994),2620
236,French Kiss (1995),2863
1036,Die Hard (1988),2956
173,Judge Dredd (1995),3107
474,In the Line of Fire (1993),3141
225,Disclosure (1994),3144


In [18]:
# Drop columns left over from a previous run of this cell first: joining
# "num_ratings" onto a frame that already has it raises a column-overlap
# error, which made the cell fail when executed twice.
df = (
    df.drop(columns=["num_ratings", "rating_weighted"], errors="ignore")
    .join(num_ratings, on="movie_id")
)
# Weight each adjusted rating by the number of user pairs supporting it.
df = df.assign(rating_weighted=df["rating_adj"] * df["num_ratings"])

In [19]:
# Weighted estimate: sum of weighted adjusted ratings over the sum of weights.
df.rating_weighted.sum() / df.num_ratings.sum()

4.0317084636356775