In [1]:
!pip install pandas numpy tensorflow



In [2]:
!ls

main.ipynb  ratings.dat			title.basics.tsv   users.dat
movies.dat  recommander_system_project	title.ratings.tsv


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

2024-07-08 18:03:30.059636: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-08 18:03:30.061944: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-08 18:03:30.068425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 18:03:30.080612: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 18:03:30.080631: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 18:03:30.089709: I tensorflow/core/platform/cpu_feature_guard.cc:

# Load an preprocess datasets

In [4]:
# parse movies from movielens

movie_data = {'id': [], 'title': [], 'genres': []}

with open('movies.dat', 'r', encoding = "ISO-8859-1") as movies:
    for line in movies:
        data = line.rstrip().split('::')
        movie_data['id'].append(data[0])
        movie_data['title'].append(data[1])
        movie_data['genres'].append(data[2])
        
movie_data = pd.DataFrame(data=movie_data)

In [5]:
movie_data['genres'] = movie_data['genres'].apply(lambda x: x.split('|'))

In [6]:
movie_data

Unnamed: 0,id,title,genres
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
3878,3948,Meet the Parents (2000),[Comedy]
3879,3949,Requiem for a Dream (2000),[Drama]
3880,3950,Tigerland (2000),[Drama]
3881,3951,Two Family House (2000),[Drama]


In [7]:
genres = ["Action","Adventure","Animation","Children's","Comedy","Crime","Documentary",
          "Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi",
          "Thriller","War","Western"]

for g in genres:
    movie_data.insert(len(movie_data.columns),g,0)


In [8]:
for i in range(len(movie_data)):
    data = movie_data.iloc[i]
    for g in data['genres']:
        movie_data.loc[i, g] = 1

In [9]:
movie_data

Unnamed: 0,id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]",0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama]",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),[Comedy],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),[Drama],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),[Drama],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),[Drama],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
movie_data.drop(columns=['genres'], inplace=True)

In [11]:
# load IMDb data

imdb_movies = pd.read_csv('title.basics.tsv', sep='\t', na_values='\\N', low_memory=False)

In [12]:
# select only movies

imdb_movies = imdb_movies[imdb_movies['titleType'] == 'movie'].copy()

In [13]:
# drop useless columns

imdb_movies.drop(columns=['endYear','titleType', 'originalTitle'], inplace=True)

In [14]:
# specify types

imdb_movies.tconst = imdb_movies.tconst.astype(str)
imdb_movies.primaryTitle = imdb_movies.primaryTitle.astype(str)
imdb_movies.runtimeMinutes = imdb_movies.runtimeMinutes.astype(float)
imdb_movies.genres = imdb_movies.genres.astype(str)

In [15]:
# process the title to match the two datasets

imdb_movies['title_lc'] = imdb_movies['primaryTitle'].apply(lambda x: x.lower())

movie_data['title_lc'] = movie_data['title'].apply(lambda x: x[:-6].rstrip().lower())

In [16]:
imdb_movies.drop_duplicates(subset='title_lc', inplace=True)
imdb_movies.dropna(inplace=True)

movie_data.drop_duplicates(subset='title_lc', inplace=True)

In [17]:
# merge movies from MovieLens and IMDb
# this will be used as our movie feature matrix

merged = movie_data.merge(imdb_movies, on='title_lc')

In [18]:
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Thriller,War,Western,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,...,0,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0,"Adventure,Animation,Comedy"
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0,"Adventure,Comedy,Family"
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0,"Comedy,Romance"
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0,"Comedy,Family,Romance"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,3946,Get Carter (2000),1,0,0,0,0,0,0,1,...,1,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0,"Action,Crime,Thriller"
2543,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0,Comedy
2544,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0,Drama
2545,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0,"Drama,War"


In [19]:
# include genres from IMDb datasets

merged.genres = merged.genres.apply(lambda x: np.array(x.split(',')))
imdb_genres = merged.genres.values.flatten()
new_genres = np.unique(np.array([x for l in imdb_genres for x in l]))

In [20]:
for g in new_genres:
    if g not in movie_data.drop(columns=['id','title', 'title_lc']).columns.values:
        merged.insert(20,g,0)
        
for i in range(len(merged)):
    data = merged.iloc[i]
    for d in data.genres:
        merged.loc[i,d] = 1

In [21]:
merged.drop(columns='genres', inplace=True)

In [22]:
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,History,Family,Biography,Adult,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0
1,2,Jumanji (1995),0,1,0,1,1,0,0,0,...,0,1,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,1,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,3946,Get Carter (2000),1,0,0,0,0,1,0,1,...,0,0,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0
2543,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0
2544,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0
2545,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0


In [23]:
# parse ratings from IMDb to add as features

imdb_ratings = pd.read_csv('title.ratings.tsv', sep='\t', na_values='\\N')

imdb_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2062
1,tt0000002,5.6,279
2,tt0000003,6.5,2030
3,tt0000004,5.4,180
4,tt0000005,6.2,2797
...,...,...,...
1452676,tt9916730,7.0,12
1452677,tt9916766,7.1,23
1452678,tt9916778,7.2,37
1452679,tt9916840,7.2,10


In [24]:
imdb_ratings.tconst = imdb_ratings.tconst.astype(str)

In [25]:
merged = merged.merge(imdb_ratings, on='tconst')
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Biography,Adult,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,averageRating,numVotes
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0,8.3,1076040
1,2,Jumanji (1995),0,1,0,1,1,0,0,0,...,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0,7.1,380028
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0,6.6,29870
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0,6.0,12309
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0,6.1,41937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,3946,Get Carter (2000),1,0,0,0,0,1,0,1,...,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0,7.3,37005
2514,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0,6.7,243
2515,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0,8.3,904200
2516,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0,6.9,43499


In [26]:
merged.id = merged.id.astype(int)
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518 entries, 0 to 2517
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2518 non-null   int64  
 1   title           2518 non-null   object 
 2   Action          2518 non-null   int64  
 3   Adventure       2518 non-null   int64  
 4   Animation       2518 non-null   int64  
 5   Children's      2518 non-null   int64  
 6   Comedy          2518 non-null   int64  
 7   Crime           2518 non-null   int64  
 8   Documentary     2518 non-null   int64  
 9   Drama           2518 non-null   int64  
 10  Fantasy         2518 non-null   int64  
 11  Film-Noir       2518 non-null   int64  
 12  Horror          2518 non-null   int64  
 13  Musical         2518 non-null   int64  
 14  Mystery         2518 non-null   int64  
 15  Romance         2518 non-null   int64  
 16  Sci-Fi          2518 non-null   int64  
 17  Thriller        2518 non-null   i

In [27]:
# parse movie Lens ratings

ratings_data = {'uid': [], 'mid': [], 'rating': [], 'timestamp': []}

with open('ratings.dat', 'r') as ratings:
    for line in ratings:
        data = line.rstrip().split('::')
        ratings_data['uid'].append(data[0])
        ratings_data['mid'].append(data[1])
        ratings_data['rating'].append(data[2])
        ratings_data['timestamp'].append(data[3])
        
ratings_data = pd.DataFrame(data=ratings_data)


In [28]:
ratings_data

Unnamed: 0,uid,mid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [29]:
ratings_data.drop(columns='timestamp', inplace=True)

In [30]:
ratings_data.uid = ratings_data.uid.astype(int)
ratings_data.mid = ratings_data.mid.astype(int)
ratings_data.rating = ratings_data.rating.astype(float)

In [31]:
# keep only ratings from movies present in the set 

ratings_data = ratings_data.merge(merged, right_on='id', left_on='mid')[['uid','mid','rating']]

In [32]:
new_uids = ratings_data.groupby('uid').count()

In [33]:
# after removing the movies, keep only users with more than 10 reviews
new_uids = new_uids[new_uids['rating'] >= 10]['rating']

In [34]:
ratings_data = ratings_data[ratings_data['uid'].isin(new_uids.index)]

In [35]:
# remove movies without ratings 

merged = merged[merged['id'].isin(ratings_data['mid'].values)].copy()
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Biography,Adult,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,averageRating,numVotes
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0,8.3,1076040
1,2,Jumanji (1995),0,1,0,1,1,0,0,0,...,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0,7.1,380028
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0,6.6,29870
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0,6.0,12309
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0,6.1,41937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,3946,Get Carter (2000),1,0,0,0,0,1,0,1,...,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0,7.3,37005
2514,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0,6.7,243
2515,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0,8.3,904200
2516,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0,6.9,43499


In [36]:
ratings_data

Unnamed: 0,uid,mid,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,1287,5.0
...,...,...,...
703141,6040,1090,3.0
703142,6040,1091,1.0
703143,6040,562,5.0
703144,6040,1096,4.0


In [37]:
# function to find `test_size` couples with more than `threshold` movie ratings in common

def find_test_couples(ratings_data, best_users, test_size = 20, threshold = 20):
    couples = []
    added = []
    masks = []

    pivot_ratings = ratings_data.pivot(index='mid', columns='uid', values='rating').fillna(0)
    
    pivot_index = pivot_ratings.columns.values

    rating_mask = np.where(pivot_ratings.values == 0, pivot_ratings.values, 1)

    for i in range(len(best_users)):
        for j in range(len(best_users)):
            if i == j or best_users[i] in added or best_users[j] in added:
                continue
            u_i_mask = rating_mask[:, np.where(pivot_index == best_users[i])[0][0]]
            u_j_mask = rating_mask[:, np.where(pivot_index == best_users[j])[0][0]]
            common_mask = np.multiply(u_i_mask, u_j_mask) 
            common = common_mask.sum()

            if common >= threshold:
                couples.append((best_users[i], best_users[j]))
                masks.append(common_mask)
                added.append(best_users[i])
                added.append(best_users[j])
                break
        if len(couples) >= test_size:
            return couples, masks
    return couples, masks

In [38]:
# function to mask ratings from test set, we find couples such that they have more than 2 * `nb_masked` movie ratings in common

def ratings_train_test_split(ratings_data, test_size=20, nb_masked=10):
    # select 1000 users with the most ratings
    best_users = ratings_data.groupby('uid')['rating'].count().sort_values(ascending=False).index[:1000]

    test_couples, test_masks = find_test_couples(ratings_data, best_users, test_size=test_size, threshold=nb_masked*2)

    data = ratings_data.pivot(index='mid', columns='uid', values='rating').fillna(0)

    test_couples_idx = [(np.where(data.columns.values == c1)[0][0], np.where(data.columns.values == c2)[0][0]) for c1, c2 in test_couples]

    test_couple_ratings = []
    test_couple_masked_movies = []

    for i, c in enumerate(test_couples):
        u1, u2 = c
        mask = test_masks[i]

        idxs = np.where(mask == 1)[0]
        masked_idxs = np.random.choice(idxs, size=nb_masked, replace=False)

        u1_ratings = data.loc[:,u1].values
        u1_masked_ratings = np.array([u1_ratings[j] for j in masked_idxs])
        u2_ratings = data.loc[:,u2].values
        u2_masked_ratings = np.array([u2_ratings[j] for j in masked_idxs])

        couple_ratings = (u1_masked_ratings + u2_masked_ratings) / 2
        test_couple_ratings.append(couple_ratings)
        test_couple_masked_movies.append(masked_idxs)

        for r in masked_idxs:
            data.iloc[r].loc[u1] = 0.
            data.iloc[r].loc[u2] = 0.


    return data, data.values, test_couples, test_couples_idx, test_couple_ratings, test_couple_masked_movies


train_df, train_data, test_couples, test_couples_idx, test_ratings, test_movies_idx = ratings_train_test_split(ratings_data, test_size=100, nb_masked=100)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data.iloc[r].loc[u1] = 0.
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, becau

In [39]:
## prepare data for learning recommendations

# prepare X: movie feature matrix

movie_id_table = merged.id.values
X = merged.drop(columns=['id', 'title', 'title_lc', 'tconst', 'primaryTitle']).copy()
for c in X.columns:
    X[c] = X[c].astype(float)
    # normalize
    cmax = X[c].max()
    cmin = X[c].min()
    if cmax > 1:
        X[c] = (X[c] - cmin) / (cmax - cmin)
        
X = X.values

In [40]:
def normalizeRatings(Y, R):
    Ymean = (np.sum(Y * R, axis=1) / (np.sum(R, axis=1) + 1e-12)).reshape(-1, 1)
    
    Ynorm = Y - np.multiply(Ymean, R)
    
    return (Ynorm, Ymean)

In [41]:
# prepare Y: ratings for users per movie

Y = train_data
Y

array([[5., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [42]:
# prepare R: binary-valued indicator matrix

R = np.where(Y == 0, Y, 1)
R

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
# normalize ratings 

Ynorm, Ymean = normalizeRatings(Y, R)
Ynorm

array([[ 0.86038481,  0.        ,  0.        , ...,  0.        ,
         0.        , -1.13961519],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [44]:
# create loss function for collaborative filtering learning

def cofi_cost_func(X, W, b, Y, R, lambda_): # MSE cost function for collaborative filtering
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_ / 2) * (tf.reduce_sum(W**2)) # adapted the function because X is not learnable
    
    return J

In [45]:
num_movies, num_users = Ynorm.shape
num_features = X.shape[1]

In [46]:
# initialize W and b randomly

W = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name="W")
b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name="b")

# instantiate optimizer
optim = tf.keras.optimizers.Adam(learning_rate=1e-1)

2024-07-08 18:03:57.875652: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-08 18:03:57.876044: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [47]:
X.shape, W.shape, b.shape, Ynorm.shape, R.shape

((2419, 30),
 TensorShape([6036, 30]),
 TensorShape([1, 6036]),
 (2419, 6036),
 (2419, 6036))

In [48]:
# Learn the Variables W and b

steps = 300
lambda_ = 0.1
for i in range(steps):
    with tf.GradientTape() as tape:
        loss = cofi_cost_func(X, W, b, Ynorm, R, lambda_)
        
    grads = tape.gradient(loss, [W,b])
    
    optim.apply_gradients(zip(grads, [W, b]))
    
    if i % 20 == 19:
        print('Step {}: {:.4f}'.format(i, loss))

Step 19: 349662.8803
Step 39: 238706.3066
Step 59: 216324.3389
Step 79: 209700.1638
Step 99: 206690.0849
Step 119: 204905.3679
Step 139: 203700.0438
Step 159: 202828.1851
Step 179: 202170.3004
Step 199: 201659.4948
Step 219: 201253.8752
Step 239: 200926.4857
Step 259: 200658.7237
Step 279: 200437.1818
Step 299: 200252.1586


In [49]:
# predict ratings for all users

p = np.matmul(X, np.transpose(W.numpy()) + b.numpy())

# restore
pm = p + Ymean

# rescale between 1 and 5

pmin = pm.min()
pmax = pm.max()
pnorm = ((pm - pmin) / (pmax - pmin)) * 4 + 1

# compute predictions for non-rated movies
predictions = pd.DataFrame(data=(pnorm * (R - 1) * -1))
predictions 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6026,6027,6028,6029,6030,6031,6032,6033,6034,6035
0,-0.000000,3.292728,3.218838,3.492739,3.184441,-0.000000,2.855102,-0.000000,-0.000000,-0.000000,...,2.649762,-0.000000,2.558454,2.994671,-0.000000,2.968335,2.947094,2.845975,3.192169,-0.000000
1,2.936154,3.328463,3.223050,3.514591,3.043966,3.123692,2.792011,2.749196,3.103655,-0.000000,...,2.588407,2.520179,2.440655,2.940932,2.597363,2.846756,2.907179,2.831078,3.144597,2.706482
2,2.942829,3.101815,3.049714,3.232777,2.980261,3.089812,2.850049,2.913082,3.005592,3.114362,...,2.826161,2.830262,2.743356,2.953804,-0.000000,2.906624,2.934207,2.874669,3.059114,2.811822
3,2.934113,3.134268,3.056630,3.250443,2.950959,3.075203,2.815895,-0.000000,3.017562,3.141636,...,2.767158,2.751365,2.698503,2.921443,-0.000000,-0.000000,2.903457,2.896937,3.083137,2.779395
4,2.843083,3.230967,3.088278,3.347574,2.982743,3.137187,2.813368,2.879094,3.022708,3.159809,...,2.765795,2.770300,2.667432,2.952668,-0.000000,2.881393,2.954038,2.850812,3.073293,2.726030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414,2.884295,3.115195,3.150290,3.305309,2.872758,3.047276,2.817812,2.824188,3.002456,3.076548,...,2.581238,2.522057,2.546066,2.882685,2.586044,2.828587,2.816274,2.758312,3.092293,2.743958
2415,2.998472,3.078658,3.099986,3.185736,3.026471,3.089342,2.961337,2.956817,-0.000000,-0.000000,...,2.869168,2.931081,2.840796,2.970018,2.867107,2.959914,2.970359,2.903513,3.064926,2.915399
2416,3.068854,3.155151,3.141957,3.231513,3.026596,3.067572,2.963003,3.006382,3.107451,3.134824,...,2.846540,2.884061,2.893779,3.016814,2.872937,3.009858,3.007354,2.982920,3.132316,2.947427
2417,2.989467,3.131685,3.170885,3.182798,2.982974,3.139280,2.945408,2.899785,3.097429,3.122127,...,2.686092,2.827912,2.744791,3.003757,2.811660,2.958542,2.986226,2.903322,3.104069,2.876149


In [50]:
predictions = predictions.values

In [51]:
# evaluate on test set

mse = 0

for i, c in enumerate(test_couples_idx):
    ui1, ui2 = c
    target = test_ratings[i]
    masked_movies = test_movies_idx[i]

    u1_preds = np.array([predictions[j, ui1] for j in masked_movies])
    u2_preds = np.array([predictions[j, ui2] for j in masked_movies])

    couple_preds = (u1_preds + u2_preds) / 2

    mse += ((target - couple_preds)**2).sum() / len(target)

mse /= len(test_couples_idx)

print(mse)

0.9073271887163423


In [52]:
# return max rating and list of movie idxs with that rating 

def fav_movie(couple_ratings):
    m = 0
    favs = []

    for i, r in enumerate(couple_ratings):
        if r > m:
            m = r
            favs = [i]
        elif r == m:
            favs.append(i)
    return m, np.array(favs)

In [53]:
# check if one of item from list1 is in list2

def double_isin(list1, list2):
    for i in list1:
        if i in list2:
            return 1
    return 0

In [54]:
# compare predictions with test set for recommended movie

fav_rating_error = 0
fav_movie_acc = 0
verbose = True
for i, c in enumerate(test_couples_idx):
    ui1, ui2 = c
    target = test_ratings[i]
    masked_movies = test_movies_idx[i]

    fav_rating, fav_movies = fav_movie(target)

    u1_preds = np.array([predictions[j, ui1] for j in masked_movies])
    u2_preds = np.array([predictions[j, ui2] for j in masked_movies])

    couple_preds = (u1_preds + u2_preds) / 2
    pred_rating, pred_movies = fav_movie(couple_preds)
    fav_rating_error += (fav_rating - pred_rating)**2
    found = double_isin(fav_movies, pred_movies)
    fav_movie_acc += found

    if verbose:
        print('Couple {}: Favorite movie ratings: target={} | pred={} \n Favorite movie(s):\n target={}\n pred={}\n Found={}\n'.format(
            i, fav_rating, pred_rating, fav_movies, pred_movies, found == 1))

fav_rating_error /= len(test_couples_idx)
fav_movie_acc /= len(test_couples_idx)

print(f'Favorite movie rating MSE: {fav_rating_error}, Accuracy: {fav_movie_acc}')

Couple 0: Favorite movie ratings: target=5.0 | pred=3.1466669429746776 
 Favorite movie(s):
 target=[22 33 75 81 91]
 pred=[19]
 Found=False

Couple 1: Favorite movie ratings: target=5.0 | pred=3.274650380678274 
 Favorite movie(s):
 target=[ 1  5 22 23 80 93 97]
 pred=[99]
 Found=False

Couple 2: Favorite movie ratings: target=5.0 | pred=3.0146810977151848 
 Favorite movie(s):
 target=[22 38 56]
 pred=[79]
 Found=False

Couple 3: Favorite movie ratings: target=4.5 | pred=3.2322185893134074 
 Favorite movie(s):
 target=[43 52]
 pred=[8]
 Found=False

Couple 4: Favorite movie ratings: target=5.0 | pred=2.9951339997780444 
 Favorite movie(s):
 target=[10 67]
 pred=[29]
 Found=False

Couple 5: Favorite movie ratings: target=5.0 | pred=2.9698199515805017 
 Favorite movie(s):
 target=[42 44 47 48 55 74 79 80 82 93]
 pred=[18]
 Found=False

Couple 6: Favorite movie ratings: target=5.0 | pred=2.9558680568505147 
 Favorite movie(s):
 target=[1]
 pred=[24]
 Found=False

Couple 7: Favorite movie