In [1]:
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install scipy
!pip install tensorflow



In [2]:
!ls

main.ipynb  ratings.dat			title.basics.tsv   users.dat
movies.dat  recommander_system_project	title.ratings.tsv


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [4]:
# parse users from movielens

user_data = {'id': [], 'gender': [], 'age': [], 'occupation': [], 'zip': []}

with open('users.dat', 'r') as users:
    for line in users:
        data = line.rstrip().split('::')
        user_data['id'].append(data[0])
        user_data['gender'].append(data[1])
        user_data['age'].append(data[2])
        user_data['occupation'].append(data[3])
        user_data['zip'].append(data[4])
        
user_data = pd.DataFrame(data=user_data)
        

In [7]:
occupations = ["other","academic/educator","artist","clerical/admin","college/grad student",
               "customer service","doctor/health care","executive/managerial","farmer","homemaker",
               "K-12 student","lawyer","programmer","retired","sales/marketing","scientist","self-employed",
               "technician/engineer","tradesman/craftsman","unemployed","writer"]

In [8]:
user_data['occupation'] = user_data['occupation'].apply(lambda x: occupations[int(x)])

In [9]:
user_data

Unnamed: 0,id,gender,age,occupation,zip
0,1,F,1,K-12 student,48067
1,2,M,56,self-employed,70072
2,3,M,25,scientist,55117
3,4,M,45,executive/managerial,02460
4,5,M,25,writer,55455
...,...,...,...,...,...
6035,6036,F,25,scientist,32603
6036,6037,F,45,academic/educator,76006
6037,6038,F,56,academic/educator,14706
6038,6039,F,45,other,01060


In [10]:
# parse movies from movielens

movie_data = {'id': [], 'title': [], 'genres': []}

with open('movies.dat', 'r', encoding = "ISO-8859-1") as movies:
    for line in movies:
        data = line.rstrip().split('::')
        movie_data['id'].append(data[0])
        movie_data['title'].append(data[1])
        movie_data['genres'].append(data[2])
        
movie_data = pd.DataFrame(data=movie_data)

In [12]:
movie_data['genres'] = movie_data['genres'].apply(lambda x: x.split('|'))

In [13]:
movie_data

Unnamed: 0,id,title,genres
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
3878,3948,Meet the Parents (2000),[Comedy]
3879,3949,Requiem for a Dream (2000),[Drama]
3880,3950,Tigerland (2000),[Drama]
3881,3951,Two Family House (2000),[Drama]


In [14]:
genres = ["Action","Adventure","Animation","Children's","Comedy","Crime","Documentary",
          "Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi",
          "Thriller","War","Western"]

for g in genres:
    movie_data.insert(len(movie_data.columns),g,0)


In [16]:
for i in range(len(movie_data)):
    data = movie_data.iloc[i]
    for g in data['genres']:
        movie_data.loc[i, g] = 1

In [17]:
movie_data

Unnamed: 0,id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),"[Animation, Children's, Comedy]",0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]",0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama]",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),[Comedy],0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),[Drama],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),[Drama],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),[Drama],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
movie_data.drop(columns=['genres'], inplace=True)

In [19]:
# load IMDb data

imdb_movies = pd.read_csv('title.basics.tsv', sep='\t', na_values='\\N', low_memory=False)

In [21]:
# select only movies

imdb_movies = imdb_movies[imdb_movies['titleType'] == 'movie'].copy()

In [23]:
# drop useless columns

imdb_movies.drop(columns=['endYear','titleType', 'originalTitle'], inplace=True)

In [25]:
# specify types

imdb_movies.tconst = imdb_movies.tconst.astype(str)
imdb_movies.primaryTitle = imdb_movies.primaryTitle.astype(str)
imdb_movies.runtimeMinutes = imdb_movies.runtimeMinutes.astype(float)
imdb_movies.genres = imdb_movies.genres.astype(str)

In [26]:
# process the title to match the two datasets

imdb_movies['title_lc'] = imdb_movies['primaryTitle'].apply(lambda x: x.lower())

movie_data['title_lc'] = movie_data['title'].apply(lambda x: x[:-6].rstrip().lower())

In [28]:
imdb_movies.drop_duplicates(subset='title_lc', inplace=True)
imdb_movies.dropna(inplace=True)

movie_data.drop_duplicates(subset='title_lc', inplace=True)

In [30]:
merged = movie_data.merge(imdb_movies, on='title_lc')

In [31]:
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Thriller,War,Western,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,...,0,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0,"Adventure,Animation,Comedy"
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0,"Adventure,Comedy,Family"
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0,"Comedy,Romance"
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0,"Comedy,Family,Romance"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,3946,Get Carter (2000),1,0,0,0,0,0,0,1,...,1,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0,"Action,Crime,Thriller"
2543,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0,Comedy
2544,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0,Drama
2545,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0,"Drama,War"


In [32]:
# include genres from IMDb datasets

merged.genres = merged.genres.apply(lambda x: np.array(x.split(',')))
imdb_genres = merged.genres.values.flatten()
new_genres = np.unique(np.array([x for l in imdb_genres for x in l]))

In [35]:
for g in new_genres:
    if g not in movie_data.drop(columns=['id','title', 'title_lc']).columns.values:
        merged.insert(20,g,0)
        
for i in range(len(merged)):
    data = merged.iloc[i]
    for d in data.genres:
        merged.loc[i,d] = 1

In [36]:
merged.drop(columns='genres', inplace=True)

In [37]:
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,History,Family,Biography,Adult,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0
1,2,Jumanji (1995),0,1,0,1,1,0,0,0,...,0,1,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,1,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,3946,Get Carter (2000),1,0,0,0,0,1,0,1,...,0,0,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0
2543,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0
2544,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0
2545,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0


In [38]:
# parse ratings from IMDb to add as features

imdb_ratings = pd.read_csv('title.ratings.tsv', sep='\t', na_values='\\N')

imdb_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2062
1,tt0000002,5.6,279
2,tt0000003,6.5,2030
3,tt0000004,5.4,180
4,tt0000005,6.2,2797
...,...,...,...
1452676,tt9916730,7.0,12
1452677,tt9916766,7.1,23
1452678,tt9916778,7.2,37
1452679,tt9916840,7.2,10


In [40]:
imdb_ratings.tconst = imdb_ratings.tconst.astype(str)

In [41]:
merged = merged.merge(imdb_ratings, on='tconst')
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Biography,Adult,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,averageRating,numVotes
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0,8.3,1076040
1,2,Jumanji (1995),0,1,0,1,1,0,0,0,...,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0,7.1,380028
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0,6.6,29870
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0,6.0,12309
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0,6.1,41937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,3946,Get Carter (2000),1,0,0,0,0,1,0,1,...,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0,7.3,37005
2514,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0,6.7,243
2515,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0,8.3,904200
2516,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0,6.9,43499


In [44]:
merged.id = merged.id.astype(int)
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518 entries, 0 to 2517
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2518 non-null   int64  
 1   title           2518 non-null   object 
 2   Action          2518 non-null   int64  
 3   Adventure       2518 non-null   int64  
 4   Animation       2518 non-null   int64  
 5   Children's      2518 non-null   int64  
 6   Comedy          2518 non-null   int64  
 7   Crime           2518 non-null   int64  
 8   Documentary     2518 non-null   int64  
 9   Drama           2518 non-null   int64  
 10  Fantasy         2518 non-null   int64  
 11  Film-Noir       2518 non-null   int64  
 12  Horror          2518 non-null   int64  
 13  Musical         2518 non-null   int64  
 14  Mystery         2518 non-null   int64  
 15  Romance         2518 non-null   int64  
 16  Sci-Fi          2518 non-null   int64  
 17  Thriller        2518 non-null   i

In [46]:
# parse movie Lens ratings

ratings_data = {'uid': [], 'mid': [], 'rating': [], 'timestamp': []}

with open('ratings.dat', 'r') as ratings:
    for line in ratings:
        data = line.rstrip().split('::')
        ratings_data['uid'].append(data[0])
        ratings_data['mid'].append(data[1])
        ratings_data['rating'].append(data[2])
        ratings_data['timestamp'].append(data[3])
        
ratings_data = pd.DataFrame(data=ratings_data)


In [47]:
ratings_data

Unnamed: 0,uid,mid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [49]:
ratings_data.drop(columns='timestamp', inplace=True)

In [50]:
ratings_data.uid = ratings_data.uid.astype(int)
ratings_data.mid = ratings_data.mid.astype(int)
ratings_data.rating = ratings_data.rating.astype(float)

In [52]:
# keep only ratings from movies present in the set 

ratings_data = ratings_data.merge(merged, right_on='id', left_on='mid')[['uid','mid','rating']]

Unnamed: 0,uid,mid,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,1287,5.0
...,...,...,...
703141,6040,1090,3.0
703142,6040,1091,1.0
703143,6040,562,5.0
703144,6040,1096,4.0


In [53]:
new_uids = ratings_data.groupby('uid').count()

Unnamed: 0_level_0,mid,rating
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,40,40
2,91,91
3,33,33
4,16,16
5,133,133
...,...,...
6036,600,600
6037,135,135
6038,13,13
6039,82,82


In [54]:
# after removing the movies, keep only users with more than 10 reviews
new_uids = new_uids[new_uids['rating'] >= 10]['rating']

uid
1        40
2        91
3        33
4        16
5       133
       ... 
6036    600
6037    135
6038     13
6039     82
6040    228
Name: rating, Length: 6034, dtype: int64

In [55]:
ratings_data = ratings_data[ratings_data['uid'].isin(new_uids.index)]

Unnamed: 0,uid,mid,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,1287,5.0
...,...,...,...
703141,6040,1090,3.0
703142,6040,1091,1.0
703143,6040,562,5.0
703144,6040,1096,4.0


In [58]:
# remove movies without ratings 

merged = merged[merged['id'].isin(ratings_data['mid'].values)].copy()
merged

Unnamed: 0,id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Biography,Adult,title_lc,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,averageRating,numVotes
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,toy story,tt0114709,Toy Story,0.0,1995.0,81.0,8.3,1076040
1,2,Jumanji (1995),0,1,0,1,1,0,0,0,...,0,0,jumanji,tt0113497,Jumanji,0.0,1995.0,104.0,7.1,380028
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,grumpier old men,tt0113228,Grumpier Old Men,0.0,1995.0,101.0,6.6,29870
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,waiting to exhale,tt0114885,Waiting to Exhale,0.0,1995.0,124.0,6.0,12309
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,father of the bride part ii,tt0113041,Father of the Bride Part II,0.0,1995.0,106.0,6.1,41937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2513,3946,Get Carter (2000),1,0,0,0,0,1,0,1,...,0,0,get carter,tt0067128,Get Carter,0.0,1971.0,112.0,7.3,37005
2514,3948,Meet the Parents (2000),0,0,0,0,1,0,0,0,...,0,0,meet the parents,tt0104844,Meet the Parents,0.0,1992.0,75.0,6.7,243
2515,3949,Requiem for a Dream (2000),0,0,0,0,0,0,0,1,...,0,0,requiem for a dream,tt0180093,Requiem for a Dream,0.0,2000.0,102.0,8.3,904200
2516,3950,Tigerland (2000),0,0,0,0,0,0,0,1,...,0,0,tigerland,tt0170691,Tigerland,0.0,2000.0,101.0,6.9,43499


In [119]:
ratings_data

Unnamed: 0,uid,mid,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,1287,5.0
...,...,...,...
703141,6040,1090,3.0
703142,6040,1091,1.0
703143,6040,562,5.0
703144,6040,1096,4.0


In [129]:
# create test set

# select 500 users with the more ratings

best_users = ratings_data.groupby('uid')['rating'].count().sort_values(ascending=False).index[:500]

best_users

Index([4169, 1680, 4277, 1941, 1181,  889, 2063, 3618, 1150, 5795,
       ...
       4161, 3313, 2436, 4950,  476, 1758, 3469,  796, 3716, 3847],
      dtype='int64', name='uid', length=500)

In [133]:
ratings_data

Unnamed: 0,uid,mid,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,1287,5.0
...,...,...,...
703141,6040,1090,3.0
703142,6040,1091,1.0
703143,6040,562,5.0
703144,6040,1096,4.0


In [137]:
ratings_data.groupby('uid')['rating'].count().sort_values(ascending=False).head(100)

uid
4169    1527
1680    1295
4277    1165
1941    1120
1181    1039
        ... 
5256     594
5333     593
4238     592
1733     591
4085     591
Name: rating, Length: 100, dtype: int64

In [130]:
ratings_data.pivot(index='mid', columns='uid', values='rating').fillna(0)

uid,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
def find_test_couples(ratings_data, best_users, test_size = 20, threshold = 20):
    couples = []
    added = []

    pivot_ratings = ratings_data.pivot(index='mid', columns='uid', values='rating').fillna(0)
    
    pivot_index = pivot_ratings.columns.values

    rating_mask = np.where(pivot_ratings.values == 0, pivot_ratings.values, 1)

    for i in range(len(best_users)):
        for j in range(len(best_users)):
            if i == j or best_users[i] in added or best_users[j] in added:
                continue
            u_i_mask = rating_mask[:, np.where(pivot_index == best_users[i])[0][0]]
            u_j_mask = rating_mask[:, np.where(pivot_index == best_users[j])[0][0]]
            common = np.multiply(u_i_mask, u_j_mask).sum()

            if common >= threshold:
                couples.append((best_users[i], best_users[j]))
                added.append(best_users[i])
                added.append(best_users[j])
                break
        if len(couples) >= test_size:
            return couples
    return couples
            
couples = find_test_couples(ratings_data, best_users)
print(couples, len(couples))

[(4169, 1680), (4277, 1941), (1181, 889), (2063, 3618), (1150, 5795), (1980, 4344), (3391, 1449), (1015, 2909), (4227, 4510), (5831, 4508), (424, 3808), (3841, 5367), (549, 1088), (1285, 3224), (3539, 4543), (1448, 4725), (3032, 524), (1010, 752), (678, 3526), (5643, 2116)] 20


In [59]:
## prepare data for learning recommendations

# prepare X: movie feature matrix

movie_id_table = merged.id.values
X = merged.drop(columns=['id', 'title', 'title_lc', 'tconst', 'primaryTitle']).copy()
for c in X.columns:
    X[c] = X[c].astype(float)
    # normalize
    cmax = X[c].max()
    cmin = X[c].min()
    if cmax > 1:
        X[c] = (X[c] - cmin) / (cmax - cmin)
        
X = X.values       

array([[0.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.58914729e-01, 8.97058824e-01, 4.58952450e-01],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        2.48062016e-01, 7.20588235e-01, 1.62087013e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.36434109e-01, 6.47058824e-01, 1.27364166e-02],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.40310078e-01, 8.97058824e-01, 3.85658661e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.36434109e-01, 6.91176471e-01, 1.85495047e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.63565891e-01, 7.35294118e-01, 7.35752944e-04]])

In [60]:
def normalizeRatings(Y, R):
    Ymean = (np.sum(Y * R, axis=1) / (np.sum(R, axis=1) + 1e-12)).reshape(-1, 1)
    
    Ynorm = Y - np.multiply(Ymean, R)
    
    return (Ynorm, Ymean)

In [62]:
# prepare Y: ratings for users per movie

Y = ratings_data.pivot(index='mid', columns='uid', values='rating').fillna(0).values
Y

array([[5., 0., 0., ..., 0., 0., 3.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
# prepare R: binary-valued indicator matrix

R = np.where(Y == 0, Y, 1)
R

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
# normalize ratings 

Ynorm, Ymean = normalizeRatings(Y, R)
Ynorm

array([[ 0.85315359,  0.        ,  0.        , ...,  0.        ,
         0.        , -1.14684641],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [67]:
# create loss function for collaborative filtering learning

def cofi_cost_func(X, W, b, Y, R, lambda_): # MSE cost function for collaborative filtering
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_ / 2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    
    return J

In [68]:
num_movies, num_users = Ynorm.shape
num_features = X.shape[1]

In [69]:
# initialize W and b randomly

W = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name="W")
b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name="b")

# instantiate optimizer
optim = tf.keras.optimizers.Adam(learning_rate=1e-2)

2024-07-05 16:05:58.908143: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-05 16:05:58.908821: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [70]:
X.shape, W.shape, b.shape, Ynorm.shape, R.shape

((2419, 30),
 TensorShape([6034, 30]),
 TensorShape([1, 6034]),
 (2419, 6034),
 (2419, 6034))

In [71]:
steps = 300
lambda_ = 1
for i in range(steps):
    with tf.GradientTape() as tape:
        loss = cofi_cost_func(X, W, b, Ynorm, R, lambda_)
        
    grads = tape.gradient(loss, [W,b])
    
    optim.apply_gradients(zip(grads, [W, b]))
    
    if i % 20 == 19:
        print('Step {}: {:.4f}'.format(i, loss))

Step 19: 1396833.0381
Step 39: 949022.7614
Step 59: 721620.6488
Step 79: 588609.7979
Step 99: 501922.3430
Step 119: 441476.3872
Step 139: 397521.8693
Step 159: 364644.3819
Step 179: 339537.1663
Step 199: 320047.3063
Step 219: 304710.0255
Step 239: 292496.8169
Step 259: 282668.2332
Step 279: 274682.6615
Step 299: 268137.2627


In [72]:
W2 = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name="W")
b2 = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name="b")

# instantiate optimizer
optim2 = tf.keras.optimizers.Adam(learning_rate=1e-2)


steps = 300
lambda_ = 0.1
for i in range(steps):
    with tf.GradientTape() as tape:
        loss = cofi_cost_func(X, W2, b2, Ynorm, R, lambda_)
        
    grads = tape.gradient(loss, [W2,b2])
    
    optim2.apply_gradients(zip(grads, [W2, b2]))
    
    if i % 20 == 19:
        print('Step {}: {:.4f}'.format(i, loss))

Step 19: 1304899.8195
Step 39: 879828.1480
Step 59: 667331.5294
Step 79: 544243.3391
Step 99: 464500.5328
Step 119: 409091.2097
Step 139: 368898.5760
Step 159: 338889.6415
Step 179: 315996.4614
Step 199: 298227.1673
Step 219: 284232.2521
Step 239: 273068.4103
Step 259: 264060.3350
Step 279: 256715.3798
Step 299: 250668.5128


In [73]:
W2, b2

(<tf.Variable 'W:0' shape=(6034, 30) dtype=float64, numpy=
 array([[ 3.30506880e-02,  1.66680343e-01,  5.22064370e-01, ...,
         -8.08693206e-02, -1.27081618e+00,  6.12872307e-02],
        [ 2.32402943e-01,  6.28720248e-02, -3.73756796e-08, ...,
          7.49770706e-01, -1.30104991e-01,  5.18713638e-01],
        [ 4.36404463e-01, -8.94791481e-02, -8.32894889e-01, ...,
          1.77898786e+00, -1.00965892e+00, -7.75162127e-01],
        ...,
        [-2.56651348e-01,  2.03765344e-01, -1.16700566e-03, ...,
         -1.55649319e+00, -1.81300117e+00, -6.71695065e-01],
        [ 1.24758515e-01, -1.38149304e-01, -9.40008530e-02, ...,
          3.43402827e-01, -9.17095259e-01,  8.56271635e-01],
        [ 2.35216354e-01, -6.20464149e-01, -6.60172749e-01, ...,
         -1.70561806e+00, -8.92191230e-02, -1.35359597e+00]])>,
 <tf.Variable 'b:0' shape=(1, 6034) dtype=float64, numpy=
 array([[ 0.57810324, -0.09357555,  0.86794409, ...,  1.19756772,
          0.45445084,  0.84341636]])>)

In [83]:
# predict ratings for all users

p = np.matmul(X, np.transpose(W2.numpy()) + b2.numpy())

# restore
pm = p + Ymean

# rescale between 0.1 and 5

pmin = pm.min()
pmax = pm.max()
pnorm = ((pm - pmin) / (pmax - pmin)) * 5

# compute predictions for non-rated movies
predictions = pd.DataFrame(data=(pnorm * (R - 1) * -1))
predictions 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6024,6025,6026,6027,6028,6029,6030,6031,6032,6033
0,-0.000000,2.428573,2.769872,2.778518,3.083008,-0.000000,2.862742,-0.000000,-0.000000,-0.000000,...,1.857420,-0.000000,1.767216,2.565391,-0.000000,2.949344,1.884378,3.055251,2.688597,-0.000000
1,2.682711,2.375409,2.840659,2.795625,2.884285,2.876171,2.853720,2.174358,2.723936,-0.000000,...,1.742774,1.398735,1.608880,2.516844,2.075698,2.814801,1.756238,3.098637,2.602473,2.809479
2,2.520572,2.338497,2.536635,2.568404,2.543823,2.652837,2.547632,2.322810,2.362305,2.457455,...,2.077204,1.948551,1.957853,2.353558,-0.000000,2.548809,2.053652,2.734443,2.467087,2.508584
3,2.625856,2.305434,2.604553,2.554556,2.601422,2.708418,2.598106,-0.000000,2.431779,2.435775,...,1.955603,1.757405,1.840260,2.518913,-0.000000,-0.000000,1.925830,2.837197,2.506506,2.568484
4,2.554603,2.415237,2.625303,2.721232,2.625749,2.777077,2.613338,2.287095,2.387216,2.444763,...,1.980833,1.720445,1.821894,2.394361,-0.000000,2.613085,1.995039,2.852832,2.495391,2.574512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414,2.508348,2.197307,2.609544,2.510151,2.566354,2.647909,2.678580,2.210616,2.447696,2.369468,...,1.711146,1.428304,1.647017,2.496111,1.984010,2.571404,1.721916,2.751276,2.485529,2.716454
2415,2.482452,2.385699,2.589941,2.541549,2.541259,2.574205,2.584705,2.371702,-0.000000,-0.000000,...,2.159689,2.172736,2.142688,2.411864,2.275972,2.519404,2.195903,2.623164,2.485145,2.628242
2416,2.604081,2.452087,2.663497,2.610546,2.569390,2.605249,2.674960,2.427535,2.648929,2.529488,...,2.088653,2.102320,2.168875,2.637462,2.332062,2.704532,2.213809,2.641670,2.600349,2.689495
2417,2.664882,2.369488,2.771138,2.453103,2.472214,2.731261,2.712973,2.307797,2.609629,2.445270,...,1.867440,1.972806,1.952194,2.690512,2.307129,2.632198,2.124989,2.738333,2.543589,2.708787


In [84]:
predicted_ratings = predictions.stack().reset_index(name='rating').rename(columns={'level_0': 'p_movie_id', 'level_1': 'p_user_id'})
predicted_ratings['movie_id'] = predicted_ratings.p_movie_id.apply(lambda x: movie_id_table[x])
predicted_ratings

Unnamed: 0,p_movie_id,p_user_id,rating,movie_id
0,0,0,-0.000000,1
1,0,1,2.428573,1
2,0,2,2.769872,1
3,0,3,2.778518,1
4,0,4,3.083008,1
...,...,...,...,...
14596241,2418,6029,2.681795,3951
14596242,2418,6030,2.030217,3951
14596243,2418,6031,2.946403,3951
14596244,2418,6032,2.608785,3951


In [97]:
# select two users to compute couple scores

u1 = 0
u2 = 1

# create mask to select only movies that none of them has seen

mask = ((R[:,u1] + R[:,u2]) - 1) * -1
mask

array([-0.,  1.,  1., ...,  1.,  1.,  1.])

In [98]:
# select predicted values for both of users

p_u1 = np.multiply(predictions[u1].copy().values, mask) 
p_u2 = np.multiply(predictions[u2].copy().values, mask)

In [101]:
# compute score for couple by taking average of predictions

couple_score = (p_u1 + p_u2) / 2
couple_score

array([0.        , 2.5290597 , 2.42953464, ..., 2.52808415, 2.51718467,
       2.56752701])

In [103]:
p_couple_movie = couple_score.argmax()

In [105]:
movie_id_table[p_couple_movie]

3607

In [107]:
movie_data[movie_data['id'] == str(movie_id_table[p_couple_movie])]['title']

3538    One Little Indian (1973)
Name: title, dtype: object