In [1]:
import pandas as pd
from typing import List
import numpy as np

Objective: find ratings of users for each movie

## Genres

In [2]:
movie_feats = pd.read_csv("../dataset/movie/ml-latest-small/movies.csv", header=0, index_col=0)
movie_feats.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [3]:
# encode genres using one hot vector
genres_set = set()
for e in movie_feats['genres']:
    tokens = e.strip().split('|')
    for token in tokens:
        genres_set.add(token)
genre2code = { x:i for i,x in enumerate(list(genres_set)) }
genre2code

def encode_genre(genres: str):
    tokens = genres.strip().split("|")
    res = np.zeros(len(genres_set))
    for token in tokens:
        idx = genre2code[token]
        res[idx] = 1
    return res

def encode_wrapper(row):
    return encode_genre(row['genres'])

movie_feats['genre_one_hot'] = movie_feats.apply(encode_wrapper, axis=1) 
movie_feats.head()

Unnamed: 0_level_0,title,genres,genre_one_hot
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
2,Jumanji (1995),Adventure|Children|Fantasy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
3,Grumpier Old Men (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
5,Father of the Bride Part II (1995),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."


In [4]:
print("Total number of genres:", len(genres_set))

Total number of genres: 20


In [5]:
movie_feats.loc[5]  # use loc to access row by movieId

title                           Father of the Bride Part II (1995)
genres                                                      Comedy
genre_one_hot    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
Name: 5, dtype: object

## Ratings

In [6]:
ratings = pd.read_csv("../dataset/movie/ml-latest-small/ratings.csv")
ratings.head()

# multiindex
# ratings.loc[(47, 1)]   # use bracket instead of parenthesis to use .loc method

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()
print("Size of users:", len(users))
print("Size of movies:", len(movies))

user_map = {e:i for i, e in enumerate(users)}
movie_map = {e:i for i, e in enumerate(movies)}

Size of users: 610
Size of movies: 9724


In [8]:
m = len(users)
n = len(movies)
M = np.zeros(shape=(m, n))
for _, row in ratings.iterrows():
    user_id, movie_id, rating, _ = row.values
    i, j = user_map[user_id], movie_map[movie_id]
    assert 0 <= rating <= 10
    M[i, j] = rating

In [9]:
with open("../dataset/movie/user_by_movies_small_rating.npy", "wb") as wrt:
    np.save(wrt, M)

In [10]:
# data series
average_ratings = ratings.groupby('movieId').mean()['rating']
average_ratings.head()

movieId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: rating, dtype: float64

## Tags

In [11]:
tags_df = pd.read_csv("../dataset/movie/ml-latest-small/tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [12]:
tags = tags_df['tag'].unique()
print("Number of tags:", len(tags))

Number of tags: 1589


In [13]:
tags_df[tags_df['movieId'] == 3]

Unnamed: 0,userId,movieId,tag,timestamp
561,289,3,moldy,1143424860
562,289,3,old,1143424860


In [14]:
movie_tags_df = pd.DataFrame( { 'all_tags': tags_df[['movieId', 'tag']].groupby('movieId')['tag'].apply("|".join) } )
movie_tags_df.head()

Unnamed: 0_level_0,all_tags
movieId,Unnamed: 1_level_1
1,pixar|pixar|fun
2,fantasy|magic board game|Robin Williams|game
3,moldy|old
5,pregnancy|remake
7,remake


In [16]:
tag2code = {x:i for i,x in enumerate(tags)}

def encode_tag(all_tags: str):
    tokens = all_tags.strip().split("|")
    res = np.zeros(len(tags))
    for token in tokens:
        idx = tag2code[token]
        res[idx] = np.random.random()
    return res

def encode_wrapper_tag(x):
    return encode_tag(x['all_tags'])

movie_tags_df['all_tags_one_hot'] = movie_tags_df.apply(encode_wrapper_tag, axis=1) 
movie_tags_df.head()

Unnamed: 0_level_0,all_tags,all_tags_one_hot
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,pixar|pixar|fun,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,fantasy|magic board game|Robin Williams|game,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,moldy|old,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,pregnancy|remake,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,remake,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [17]:
tag2code['pixar']

439

In [18]:
movie_tags_df.loc[1]['all_tags_one_hot'][439]

0.926214831922351

## Ground Set

In [117]:
# ground_set = list(movie_feats.index)

# k = len(ratings['movieId'].unique())
# print(k, k <= len(ground_set))
# print("Observation: there exists some movies without ratings.")

In [19]:
cand_a = set(movie_feats.index)
cand_b = set(ratings['movieId'].unique())
cand_c = set(movie_tags_df.index)
ground_set = sorted(list(cand_a.intersection(cand_b.intersection(cand_c))))
print("Size of ground set:", len(ground_set))

Size of ground set: 1554


# Objective and Cost

In [20]:
def shares_genre(left: int, right: int) -> bool:
    a = movie_feats.loc[left]['genre_one_hot']
    b = movie_feats.loc[right]['genre_one_hot']
    return np.logical_and(a, b).any()

print("True", shares_genre(1, 2))
print("False", shares_genre(2, 3))

True True
False False


In [21]:
def compute_similarity(left: int, right: int) -> float: 
    """
    parameters:
    - left: left movie id
    - right: right movie id

    returns:
    - similarity score between 2 movies
    """
    a = movie_tags_df.loc[left]['all_tags_one_hot']
    b = movie_tags_df.loc[right]['all_tags_one_hot']
    x = np.minimum(a, b)
    return np.sqrt((x * x).sum())

compute_similarity(1,3)

0.0

In [22]:
def objective(s: List[int], alpha: float = 0.5, beta: float = 0.5, llambda: float = 3, mu: float = 7) -> float:
    A = 0.
    for movie_id in s:
        avg_rating = average_ratings[movie_id]
        A += avg_rating
    A = alpha * A

    B = 0.
    for i in s:
        for j in ground_set:   # global variable
            similarity = compute_similarity(i, j) # weights is similarity, between i, j
            B += similarity
    B = beta * B

    C = 0.
    for i in s:
        for j in s:
            similarity = compute_similarity(i, j) # between i, j
            if shares_genre(i, j):
                t = llambda + mu
            else:
                t = llambda
            C += t * similarity
    C = beta * C

    return A + B - C

S = [1,2,3]
objective(S)

-4.325477194261731

Each movie is assigned a cost sampled uniformly from [0,1]

In [128]:
total_budget = 0.1  # from 0.01 to 0.1
cost = {
    i: np.random.random() for i in ground_set
}