# MovieLens and Recommender System

### System I: Recommendation Based on Genres

In [1]:
import numpy as np
import pandas as pd
import streamlit as st


In [2]:
import warnings
warnings.filterwarnings('ignore')

#### Load the data

In [3]:
ratings = pd.read_csv('ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']

movies = pd.read_csv('movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

users = pd.read_csv(f'users.dat', sep='::', engine = 'python', header = None)
users.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zipcode']
    

In [None]:
rating_merged = ratings.merge(movies, left_on = 'MovieID', right_on = 'MovieID')
#rating_merged


### How recommendations are generated by Genre?

* We are going to recommend movies based on highly-rated by users.

* `How to handle movies with multiple genres?` We are denormalizing genre + Movie ID. By doing so, that Movie becomes a recommendation candidate / recall set of both the Genres.

* `A scenario to be addressed:` How do we come up with a score that has Rating & Number of Ratings embedded inside.

* Our idea is, to keep the Rating schema simple. Add a dimnishing factor. And this can shrink the Movies with less ratings more. And perform less shrinkage on movies with more ratings.

* Here is a scheme, we are coming with: 
    * (`avg_rating_of_the_movie * rating_count_of_the_movie` + `min_rating_of_all_movies * avg_rating_count_of_all_movies`) / (`rating_count_of_the_movie + avg_rating_count_of_all_movies`)


#### Weighted Rating computed for each Movie

In [None]:
movie_rating = rating_merged[['MovieID', 'Rating']].groupby("MovieID").agg(['mean', 'count']).droplevel(0, axis=1).reset_index()

movie_rating.rename(columns={"mean": "Rating", "count": "Rating_count"}, inplace=True)


avg_rating_count = movie_rating['Rating_count'].mean() 
#avg_rating = (movie_rating['Rating'] * movie_rating['Rating_count']).sum() / movie_rating['Rating_count'].sum()
#avg_rating = 2.5
avg_rating = movie_rating['Rating'].min()

movie_rating['Weighted_Rating'] = (movie_rating['Rating'] * movie_rating['Rating_count'] + avg_rating * avg_rating_count)  / (movie_rating['Rating_count'] + avg_rating_count)



#### Data frame is built that has Movies with Genres & Weighted Rating

In [None]:
movie_with_rating = movies.join(movie_rating.set_index('MovieID'), how='left', on="MovieID")

movie_with_rating['Weighted_Rating'].fillna(value=avg_rating, inplace=True)

#movie_with_rating.sort_values(by='Rating_count', ascending=False)[0:30]

In [None]:
genre_movie_ratings = movie_with_rating.copy()
genre_movie_ratings['Genres'] = genre_movie_ratings['Genres'].str.split('|')
genre_movie_ratings = genre_movie_ratings.explode('Genres')
#genre_movie_ratings

### Find movie by genre

In [None]:
def get_all_genre():
    genres = genre_movie_ratings['Genres'].unique()
    
    return genres

In [None]:
def find_top_movies_by_genre(genre, n=10):
    top_movies = genre_movie_ratings[genre_movie_ratings['Genres'] == genre]


    top_movies = top_movies.sort_values(by='Weighted_Rating', ascending=False)
    
    top_movies = top_movies[0:n]
    return top_movies



In [None]:
get_all_genre()

In [None]:
find_top_movies_by_genre(genre='Drama', n=10)

### System II: Recommendation Based on IBCF

In [None]:
def get_random_movie_set(n=10):
    movie_set = movies.sample(n)
    return movie_set

get_random_movie_set()

#### Building Similarity Matrix

In [None]:
rating_matrix = ratings.pivot_table(index="UserID", columns="MovieID", values="Rating")


In [4]:
rating_matrix = pd.read_csv('Rmat.csv', sep=',')
rating_matrix

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,5.0,,,,,,,,,,...,,,,,,,,,,
u10,5.0,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,5.0,,,,,,,,,,...,,,,,,,,,,
u1001,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,4.0,,,,,,,,,,...,,,,,,,,,,3.0
u997,4.0,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


In [None]:
#rating_matrix.subtract(rating_matrix_1)
rating_matrix = rating_matrix_1
#rating_matrix.reset_index().shape

In [5]:

normalized_rating_matrix = rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows')
#np.sum(np.abs(normalized_rating_matrix.sum(axis=1)))
normalized_rating_matrix

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,0.811321,,,,,,,,,,...,,,,,,,,,,
u10,0.885287,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,0.869048,,,,,,,,,,...,,,,,,,,,,
u1001,0.347480,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,0.064189,,,,,,,,,,...,,,,,,,,,,-0.935811
u997,0.066667,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


In [6]:
cardinality_df = (~normalized_rating_matrix.isna()).astype('int')
cardinality_df = cardinality_df.T
cardinality_matrix = cardinality_df @ cardinality_df.T

In [7]:
cardinality_matrix

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,2077,538,82,10,5,84,61,110,58,155,...,26,33,98,13,5,254,161,17,55,168
m10,538,888,61,7,1,69,73,69,35,96,...,14,39,68,14,2,97,163,7,41,117
m100,82,61,128,7,0,35,16,11,15,15,...,5,13,34,6,5,52,36,8,18,38
m1000,10,7,7,20,0,8,2,1,1,4,...,2,1,4,4,0,6,7,1,5,13
m1002,5,1,0,0,8,0,0,0,0,1,...,0,0,1,0,0,4,0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,254,97,52,6,4,28,15,17,21,32,...,17,10,62,5,7,450,40,20,19,74
m996,161,163,36,7,0,32,49,25,15,37,...,4,28,34,9,2,40,256,6,27,54
m997,17,7,8,1,2,6,4,3,6,7,...,3,5,10,1,3,20,6,28,5,12
m998,55,41,18,5,0,18,14,10,5,15,...,2,12,15,7,1,19,27,5,93,26


In [8]:
#tmp_normalized_rating_matrix = normalized_rating_matrix.iloc[:, np.array([0, 1, 2])]
#normalized_rating_matrix = normalized_rating_matrix_copy.copy()

normalized_rating_matrix = normalized_rating_matrix.T

normalized_rating_matrix = normalized_rating_matrix.fillna(0)

nr = normalized_rating_matrix @ normalized_rating_matrix.T

# squared_normalized_rating_matrix = (normalized_rating_matrix * normalized_rating_matrix).sum(axis=1)
# squared_normalized_rating_matrix = squared_normalized_rating_matrix.to_numpy()
# dr = squared_normalized_rating_matrix[:, np.newaxis] * squared_normalized_rating_matrix[np.newaxis, :]

# cosine_distance = nr/dr
# S = (1 + cosine_distance)/2
# S


In [None]:
nr = normalized_rating_matrix @ normalized_rating_matrix.T
nr

In [9]:
squared_normalized_rating_matrix = ((normalized_rating_matrix**2) @ (normalized_rating_matrix!=0).T)
squared_normalized_rating_matrix = squared_normalized_rating_matrix.apply(np.vectorize(np.sqrt))
dr = squared_normalized_rating_matrix * squared_normalized_rating_matrix.T
dr


Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,1871.891032,481.565710,96.566395,10.957691,3.477780,74.995205,88.422339,173.977590,57.766520,178.793497,...,22.112591,50.052667,83.242518,21.391616,4.393274,227.704900,193.629200,12.596150,63.222283,163.423415
m10,481.565710,598.200863,45.174307,7.955042,0.230846,42.409774,69.329457,86.236839,32.830076,73.146688,...,16.286048,37.108371,46.743177,14.464802,0.117601,78.816251,147.989362,3.738282,35.546649,94.427950
m100,96.566395,45.174307,114.872873,6.318550,0.000000,30.583450,19.497153,14.626793,13.381206,8.301652,...,2.426787,9.707075,28.064333,2.875587,5.260325,51.440602,30.001040,6.687956,14.067279,26.804093
m1000,10.957691,7.955042,6.318550,27.458205,0.000000,5.025072,0.340745,0.201028,0.317686,1.555847,...,0.260613,0.356381,6.579619,1.855511,0.000000,4.712979,9.583789,0.077676,8.974744,14.708633
m1002,3.477780,0.230846,0.000000,0.000000,7.546158,0.000000,0.000000,0.000000,0.000000,0.035402,...,0.000000,0.000000,2.230509,0.000000,0.000000,3.604345,0.000000,1.754205,0.000000,1.101652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,227.704900,78.816251,51.440602,4.712979,3.604345,27.383256,20.620758,24.786587,19.958524,35.654654,...,6.299996,14.580884,49.026087,6.952145,3.797143,364.785389,58.223248,14.459647,20.418602,62.275465
m996,193.629200,147.989362,30.001040,9.583789,0.000000,25.193010,66.879271,35.295959,16.549241,24.066097,...,1.955870,27.145553,35.846602,16.041091,0.298619,58.223248,343.955767,4.322783,33.830257,42.334510
m997,12.596150,3.738282,6.687956,0.077676,1.754205,2.759161,3.138366,4.541824,3.853255,6.653916,...,1.853593,1.263986,9.397914,0.153003,1.712675,14.459647,4.322783,17.943322,1.855654,6.676834
m998,63.222283,35.546649,14.067279,8.974744,0.000000,13.373792,20.496043,13.627495,2.320365,17.971608,...,0.771360,12.443936,17.344639,10.127480,0.116647,20.418602,33.830257,1.855654,109.730235,29.307658


In [None]:
squared_normalized_rating_matrix.iloc[0:10,0:10]

In [None]:
squared_normalized_rating_matrix.T.iloc[0:10,0:10]

In [None]:
(squared_normalized_rating_matrix * squared_normalized_rating_matrix.T).iloc[0:10,0:10]

In [None]:
(squared_normalized_rating_matrix.values * squared_normalized_rating_matrix.T.values)[0:10,0:10]

In [10]:
nr_copy = nr.copy()
nr_copy[cardinality_matrix<3] = None

dr_copy = dr.copy()
dr_copy[cardinality_matrix<3] = None

S = (1 + nr_copy/dr_copy)/2

#S[S>=0.99] = None
np.fill_diagonal(S.values, np.nan)
S[cardinality_matrix<3] = None
#S[cardinality_matrix<3] = None

#S[S.rank(axis=1, method='max', ascending=False)>30] = None

# cosine_distance = nr/dr
# S = (1 + cosine_distance)/2
# S
-np.sort(-S.to_numpy())[0, 1:40]
np.argsort(-S.to_numpy())[0, 1:40]

array([1520, 1319, 3012, 3462, 2755, 2380, 1134, 3277, 2379, 2597, 2193,
       1527, 3040, 2130,  884, 3382,  663, 1829,  293, 2429, 1826, 3231,
        649,  851,  144, 2983, 3533, 3302, 3547, 1339, 2859, 3574,  154,
       1969,  510, 3384,   64, 2541, 3452])

In [None]:
idx = np.array(["m1", "m10", "m100", "m1510", "m260", "m3212"])
S.loc[idx, idx]

In [None]:
-np.sort(-S.to_numpy())[:, 0:20]

#np.argsort(-S.to_numpy())[:, 0:5]

In [None]:
print(np.nonzero(normalized_rating_matrix.iloc[1]))
print(np.nonzero(normalized_rating_matrix.iloc[0]))

np.nonzero(normalized_rating_matrix.iloc[0] * normalized_rating_matrix.iloc[1])

In [None]:
#idx = np.array([   0, 51, 2381, 2394,  302, 2417])
idx = np.array([1520, 1319, 3012, 3462, 2755, 2380, 1134, 3277, 2379, 2597, 2193,
       1527, 3040, 2130,  884, 3382,  663, 1829,  293, 2429, 1826, 3231,
        649,  851,  144, 2983, 3533, 3302, 3547, 2343, 2472, 2473, 2474,
       2475, 2476, 2477, 2478, 2479, 2481])
for i in idx:
    j = 0
    print(f"i={i}, j={j}")
    print(f"nr.iloc[j, i]={nr.iloc[j, i]}")
    print(f"dr.iloc[j, i]={dr.iloc[j, i]}")
    print(f"dr.iloc[i, j]={dr.iloc[i, j]}")
    print(f"dr_v1[j, i]={dr_v1[j, i]}")
    print(f"dr_v1[i, j]={dr_v1[i, j]}")
    print(f"cardinality_matrix.iloc[i, j]={cardinality_matrix.iloc[i, j]}")
    print(f"S.iloc[i, j]={S.iloc[i, j]}")
    print(f"Sv1.iloc[i, j]={Sv1.iloc[i, j]}")
    

    

In [11]:
squared_normalized_rating_matrix = (normalized_rating_matrix * normalized_rating_matrix).sum(axis=1)
squared_normalized_rating_matrix = squared_normalized_rating_matrix.to_numpy()
squared_normalized_rating_matrix = np.sqrt(squared_normalized_rating_matrix)
dr_v1 = squared_normalized_rating_matrix[:, np.newaxis] * squared_normalized_rating_matrix[np.newaxis, :]
dr_v1

array([[1871.8910318 , 1058.19035618,  463.71273514, ...,  183.27013682,
         453.21412545,  670.21385121],
       [1058.19035618,  598.2008626 ,  262.13937458, ...,  103.60362225,
         256.20445244,  378.87559793],
       [ 463.71273514,  262.13937458,  114.87287298, ...,   45.40045065,
         112.27211314,  166.02819972],
       ...,
       [ 183.27013682,  103.60362225,   45.40045065, ...,   17.94332174,
          44.37256943,   65.61823425],
       [ 453.21412545,  256.20445244,  112.27211314, ...,   44.37256943,
         109.73023537,  162.26926636],
       [ 670.21385121,  378.87559793,  166.02819972, ...,   65.61823425,
         162.26926636,  239.96407842]])

In [None]:
Sv1 = (1 + nr/dr_v1)/2
Sv1[cardinality_matrix<3] = None

#-np.sort(-Sv1.to_numpy())[0, 1:40]

In [28]:
nr_copy = nr.copy()
#nr_copy[cardinality_matrix<3] = None

dr_copy = dr_v1.copy()
#dr_copy[cardinality_matrix<3] = None

Sv1 = (1 + nr_copy/dr_copy)/2

#S[S>=0.99] = None
np.fill_diagonal(Sv1.values, np.nan)
Sv1[cardinality_matrix<3] = None
#S[cardinality_matrix<3] = None

Sv1[Sv1.rank(axis=1, method='max', ascending=False)>30] = None

# cosine_distance = nr/dr
# S = (1 + cosine_distance)/2
# S
-np.sort(-Sv1.to_numpy())[0, 1:40]
np.argsort(-Sv1.to_numpy())[0, 1:40]

array([ 193, 1374, 2264,  191, 1640,  272, 3300,  192,  147, 2750, 1814,
       1610, 1025, 3233, 3308, 1861,  295, 2667, 3452, 3156, 3620, 3306,
         41,  100,  502, 1982, 2489, 1852, 3301, 2746, 2456, 2457, 2458,
       2459, 2460, 2461, 2462, 2463, 2464])

In [31]:
-np.sort(-Sv1.to_numpy())[0, 0:40]

array([0.75147949, 0.65539515, 0.65080828, 0.64379376, 0.64283326,
       0.64165849, 0.63965299, 0.63634907, 0.62895691, 0.62839951,
       0.62796197, 0.62578031, 0.62576515, 0.62456486, 0.62400165,
       0.62284069, 0.62114054, 0.62096599, 0.61813232, 0.617427  ,
       0.6161459 , 0.61598478, 0.61396232, 0.60967869, 0.60914057,
       0.60812021, 0.60700737, 0.60619923, 0.60613752, 0.60441589,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan])

In [64]:
#np.argsort(-Sv1.to_numpy())[0:10, 1:40]
#Sv1.rank(axis=1, ascending=False)
nr_copy.iloc[0, np.argsort(-S_v2.to_numpy())[0, 1:30]]
#nr_copy.iloc[np.argsort(-Sv1.to_numpy())[0:10, 1:40]]

m53        1.745781
m2487      0.052747
m2304      0.916612
m3880      3.028440
m755       2.526754
m3644      2.700837
m3293      3.520463
m2127      1.800423
m567       4.164278
m3292      4.180320
m3495      6.029996
m3114    893.960037
m2494     14.262462
m3905      6.934317
m3056      2.341297
m1901      2.213432
m669       2.491559
m1664      0.590179
m2776      5.094869
m129       3.267992
m3338     14.315405
m2773      3.158057
m525       6.792400
m1651      5.836284
m1872      0.899733
m1144      4.345282
m3853      2.849124
m831       6.173367
m59        4.445937
Name: m1, dtype: float64

In [16]:
idx = np.array(["m1", "m10", "m100", "m1510", "m260", "m3212"])
Sv1.loc[idx, idx]

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.505509,0.477509,,0.641658,
m10,0.505509,,0.508178,,0.516948,
m100,0.477509,0.508178,,,0.471287,
m1510,,,,,,
m260,0.641658,0.516948,0.471287,,,
m3212,,,,,,


In [27]:
#idx = np.array(["m1", "m10", "m100", "m1510", "m260", "m3212"])
Sv1.iloc[0, np.array([193, 1374])]
rating_matrix.iloc[:, np.array([0, 193, 1374])]

Unnamed: 0,m1,m1198,m2355
u1,5.0,,5.0
u10,5.0,5.0,4.0
u100,,4.0,
u1000,5.0,5.0,5.0
u1001,4.0,4.0,4.0
...,...,...,...
u995,,,
u996,4.0,5.0,3.0
u997,4.0,,
u998,,,


In [None]:
idx = np.array(["m1", "m10", "m100", "m1510", "m260", "m3212"])
Sv1.loc[idx, idx]

In [None]:
S[cardinality_matrix<3] = None
S

In [None]:
S_tmp = S.copy()
#(-S_tmp).to_numpy().sort(axis=1)
x = S_tmp.to_numpy()
-np.sort(-x, axis=1)

In [None]:
-np.sort(-x, axis=1)

In [None]:
np.argsort(-S[S.rank(axis=1, method='max', ascending=False) <= 30].to_numpy())[:, 0:5]

In [None]:
-np.sort(-S[S.rank(axis=1, method='max', ascending=False) <= 30].to_numpy())[:, 0:5]

In [None]:
S[S.rank(axis=1, method='max', ascending=False)>30] = None
S

In [None]:
-np.sort(-S.to_numpy())[:, 0:10]

In [None]:
print((~S.isna()).sum(axis=1))
#S.to_numpy().sort(axis=1)

In [57]:
def build_similarity_matrix():
    #rating_matrix = ratings.pivot_table(index="UserID", columns="MovieID", values="Rating")
    rating_matrix = pd.read_csv('Rmat.csv', sep=',')

    normalized_rating_matrix = rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows')

    cardinality_df = (~normalized_rating_matrix.isna()).astype('int')
    cardinality_df = cardinality_df.T
    cardinality_matrix = cardinality_df @ cardinality_df.T
    
    normalized_rating_matrix = normalized_rating_matrix.T
    normalized_rating_matrix = normalized_rating_matrix.fillna(0)

    nr = normalized_rating_matrix @ normalized_rating_matrix.T

    squared_normalized_rating_matrix = (normalized_rating_matrix * normalized_rating_matrix).sum(axis=1)
    squared_normalized_rating_matrix = squared_normalized_rating_matrix.to_numpy()
    squared_normalized_rating_matrix = np.sqrt(squared_normalized_rating_matrix)
    dr = squared_normalized_rating_matrix[:, np.newaxis] * squared_normalized_rating_matrix[np.newaxis, :]

    cosine_distance = nr/dr
    S = (1 + cosine_distance)/2
    
    S[cardinality_matrix<3] = None
    
    S[S.rank(axis=1, ascending=False)>30] = None
    return S

In [None]:
%% time
rating_matrix = pd.read_csv('Rmat.csv', sep=',')

normalized_rating_matrix = rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows')

cardinality_df = (~normalized_rating_matrix.isna()).astype('int')
cardinality_df = cardinality_df.T
cardinality_matrix = cardinality_df @ cardinality_df.T    

nr = normalized_rating_matrix @ normalized_rating_matrix.T

squared_normalized_rating_matrix = (normalized_rating_matrix * normalized_rating_matrix).sum(axis=1)
squared_normalized_rating_matrix = squared_normalized_rating_matrix.to_numpy()
squared_normalized_rating_matrix = np.sqrt(squared_normalized_rating_matrix)
dr_v1 = squared_normalized_rating_matrix[:, np.newaxis] * squared_normalized_rating_matrix[np.newaxis, :]


S_v1 = (1 + nr/dr_v1)/2

#S[S>=0.99] = None
np.fill_diagonal(S_v1.values, np.nan)
S_v1[cardinality_matrix<3] = None
#S[cardinality_matrix<3] = None

#S[S.rank(axis=1, method='max', ascending=False)>30] = None

# cosine_distance = nr/dr
# S = (1 + cosine_distance)/2
# S
# -np.sort(-Sv1.to_numpy())[0, 1:40]
# np.argsort(-Sv1.to_numpy())[0, 1:40]
S_v1

In [None]:
%%time

rating_matrix = pd.read_csv('Rmat.csv', sep=',')

normalized_rating_matrix = rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows')

cardinality_df = (~normalized_rating_matrix.isna()).astype('int')
cardinality_df = cardinality_df.T
cardinality_matrix = cardinality_df @ cardinality_df.T

normalized_rating_matrix = normalized_rating_matrix.T
normalized_rating_matrix = normalized_rating_matrix.fillna(0)

nr = normalized_rating_matrix @ normalized_rating_matrix.T

squared_normalized_rating_matrix = ((normalized_rating_matrix**2) @ (normalized_rating_matrix!=0).T)
squared_normalized_rating_matrix = squared_normalized_rating_matrix.apply(np.vectorize(np.sqrt))
dr = squared_normalized_rating_matrix * squared_normalized_rating_matrix.T

nr[cardinality_matrix<3] = None
dr[cardinality_matrix<3] = None

S = (1 + nr/dr)/2

np.fill_diagonal(S.values, np.nan)

S[cardinality_matrix<3] = None
S[S.rank(axis=1, method='max', ascending=False)>30] = None

S_v1 = S
S_v1

In [None]:
idx = np.array(["m1", "m10", "m100", "m1510", "m260", "m3212"])
S_v1.loc[idx, idx]

In [58]:
S_v1 = build_similarity_matrix()

In [61]:

S_v1

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,1.0,,,,,,,,,,...,,,,,,,,,,
m10,,1.0,,,,,,,,,...,,,,,,,,,,
m100,,,1.0,,,0.601094,,,,,...,,,,,0.574447,,,,,
m1000,,,,1.0,,,,,,,...,,,,,,,,,,
m1002,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,,,,,,,,,,,...,,,,,,1.0,,,,
m996,,,,,,,,,,,...,,,,,,,1.0,,,
m997,,,,,,,,,,,...,,,,,,,,1.0,,
m998,,,,,,,0.556721,,,,...,,,,0.563471,,,,,1.0,


In [59]:
def build_similarity_matrix_v2():
    #rating_matrix = ratings.pivot_table(index="UserID", columns="MovieID", values="Rating")
    rating_matrix = pd.read_csv('Rmat.csv', sep=',')

    normalized_rating_matrix = rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows')

    cardinality_df = (~normalized_rating_matrix.isna()).astype('int')
    cardinality_df = cardinality_df.T
    cardinality_matrix = cardinality_df @ cardinality_df.T
    
    normalized_rating_matrix = normalized_rating_matrix.T
    normalized_rating_matrix = normalized_rating_matrix.fillna(0)

    nr = normalized_rating_matrix @ normalized_rating_matrix.T
    print(nr)

    squared_normalized_rating_matrix = ((normalized_rating_matrix**2) @ (normalized_rating_matrix!=0).T)
    squared_normalized_rating_matrix = squared_normalized_rating_matrix.apply(np.vectorize(np.sqrt))
    dr = squared_normalized_rating_matrix * squared_normalized_rating_matrix.T
    print(dr)

    cosine_distance = nr/dr
    S = (1 + cosine_distance)/2
    print(S)
    
    S[cardinality_matrix<3] = None
    print(S)
    
    S[S.rank(axis=1, ascending=False)>30] = None
    print(S)
    return S

In [60]:
S_v2 = build_similarity_matrix_v2()

                m1         m10        m100      m1000     m1002      m1003  \
m1     1871.891032   11.659220  -20.858351   5.032585 -0.659048 -23.344176   
m10      11.659220  598.200863    4.287791  -0.151596 -0.230846   9.413530   
m100    -20.858351    4.287791  114.872873  -0.215273  0.000000  20.587791   
m1000     5.032585   -0.151596   -0.215273  27.458205  0.000000  -3.208358   
m1002    -0.659048   -0.230846    0.000000   0.000000  7.546158   0.000000   
...            ...         ...         ...        ...       ...        ...   
m994     83.717218   -7.177903  -19.882489  -2.582458  1.605851 -13.596448   
m996    -81.071540   14.060294    7.762697   3.228513  0.000000  14.656748   
m997      0.353780    1.261541   -3.082135   0.077676 -1.753780  -0.739329   
m998    -14.696421   -3.676269   -0.593002   4.044669  0.000000  -1.470914   
m999    -27.943623   19.038870    6.047802   5.311984  0.897822  -6.446042   

           m1004      m1005      m1006      m1007  ...       m9

             m1       m10      m100     m1000     m1002     m1003     m1004  \
m1     1.000000  0.512106  0.392000  0.729637  0.405249  0.344362  0.193479   
m10    0.512106  1.000000  0.547458  0.490472       NaN  0.610983  0.423742   
m100   0.392000  0.547458  1.000000  0.482965       NaN  0.836584  0.629538   
m1000  0.729637  0.490472  0.482965  1.000000       NaN  0.180765       NaN   
m1002  0.405249       NaN       NaN       NaN  1.000000       NaN       NaN   
...         ...       ...       ...       ...       ...       ...       ...   
m994   0.683828  0.454464  0.306743  0.226027  0.722766  0.251738  0.227186   
m996   0.290653  0.547504  0.629374  0.668436       NaN  0.790889  0.711965   
m997   0.514043  0.668733  0.269576       NaN       NaN  0.366023  0.932724   
m998   0.383772  0.448290  0.478923  0.725336       NaN  0.445008  0.843772   
m999   0.414505  0.600812  0.612815  0.680574       NaN  0.379650  0.440523   

          m1005     m1006     m1007  ...       m99 

In [62]:
S_v2

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,1.0,,,,,,,,,,...,,,,,,,,,,
m10,,1.0,,,,,,,,,...,,,,,,,,,,
m100,,,1.0,,,,,,,,...,,,,,,,,,,
m1000,,,,1.0,,,,,,,...,,,,,,,,,,
m1002,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,,,,,,,,,,,...,,,,,,1.0,,,,
m996,,,,,,,,,,,...,,,,,,,1.0,,,
m997,,,,,,,,,,,...,,,,,,,,1.0,,
m998,,,,,,,,,,,...,,,,,,,,,1.0,


In [None]:
def display_similarity_matrix(S, movieID):
    #S = S_copy.copy()
    movieID = "m" + str(movieID)
    movie_ratings = S.loc[movieID].dropna()
    movie_ratings = movie_ratings.sort_values(ascending=False)
    movie_ratings = movie_ratings[0:30]
    
    print(f"Similarity Matrix for Movie={movieID}")
    print("------------------------------------------------------------------")
    #print(movies[movies["MovieID"] == movieID])
    print(movies[movies["MovieID_idx"] == movieID])
    print("------------------------------------------------------------------")
    
    #movie_rating_details = movies.merge(movie_ratings, left_on="MovieID", right_on="MovieID", how="right")
    movie_rating_details = movies.merge(movie_ratings, left_on="MovieID_idx", right_on="MovieID", how="right")
    movie_rating_details.rename(columns={1 : "Rating"}, inplace=True)
    #movie_rating_details.drop(columns=['Genres'], inplace=True) 
    #movie_rating_details = movie_rating_details['MovieID', 'Rating']
    print(movie_rating_details.columns)
    print(movie_rating_details)
    #print(movie_ratings)
    #return movie_ratings

movies["MovieID_idx"] = "m" + movies["MovieID"].astype(str)
movies

S_v1.loc["m10"].dropna().sort_values(ascending=False)

In [None]:

display_similarity_matrix(S_v1, 1)


In [None]:
display_similarity_matrix(S_v2, 1)


#### myIBCF

In [None]:

def myIBCF():
    

In [None]:
S

In [None]:
S.iloc[0:10, 30:40]

In [None]:
for i in np.arange(0, 30):
    print(f"i={i}, {np.sum(np.isnan(np.sort(-S.iloc[:, :], axis=1)[:,i]))}")

In [None]:
np.sum((np.sort(-S, axis=1)[:,3700]) == np.nan)

In [None]:
np.argsort(-S, axis=1)

In [None]:
S_copy = S.copy()

In [None]:
S = S_copy.copy()

#S.to_numpy()[:, np.argsort(-S.to_numpy(), axis=1) >= 30] = None
np.argsort(-S.to_numpy(), axis=1) >= 30

#S.to_numpy()[:, 0] = None
S
#np.sum(np.argsort(-S.to_numpy(), axis=1) >= 30, axis=1)
#S[np.argsort(S, axis=1) < S.shape[0] - 30] = None

#S[np.argsort(S, axis=1) >= 50] = None

#np.sum(np.argsort(-S, axis=1) >= 30, axis=1)
#S[np.argsort(-S, axis=1) >= 30] = None
#np.sum(~S.isna(), axis=1)
#np.argsort(-S, axis=1)

#S.argsort()

#S.to_numpy()[np.argsort(-S) >= 30]
(np.argsort(-S) >= 30).shape

#S.to_numpy()[np.argsort(-S) >= 30]=None
#S

S[np.argsort(-S) >= 30] = None

In [None]:
(~S.isna()).sum(axis=1)

In [None]:
type(np.argsort(-S) >= 30)
type(cardinality_matrix)

In [None]:
#S = S_copy.copy()
#print((S.iloc[0]))

print(S.iloc[0:2][0:30])
print(-np.sort(-S.iloc[0])[0:30])

print(np.argsort(-S.iloc[0])[0:30])

print(S.iloc[0, np.argsort(-S.iloc[0])[0:30]])
print(S.to_numpy()[0, np.argsort(-S.iloc[0]) > 30])

S.to_numpy()[0:2, np.argsort(-S.iloc[0:2]) >= 30] = None

#np.argsort(-S.iloc[0]) > 30
#np.take_along_axis(S.iloc[0], np.argsort(-S.iloc[0])[0:30], axis=0)

#S.iloc[0, 2816]
#np.sum(np.argsort(S.iloc[0, :]))

In [None]:
np.sum(~np.isnan(S.to_numpy()[0]))

In [None]:
S[np.argsort(-S, axis=1) >= 30] = None

In [None]:
#(S!=0).sum(axis=1)

tmp = np.sort(-S)
tmp[:, 20]

In [None]:
S = S_copy.copy()
S = S.iloc[0:10, 0:10]
#np.sum(~S.isna())

S

In [None]:
np.sort(-S, axis=1)

In [None]:
np.argsort(-S, axis=1)

In [None]:
S.sum(axis=1)
#np.argsort(S, axis=1)

(~S.isna()).sum(axis=1)
#S[np.argsort(S.iloc[0:10, 0:10], axis=0) >= 5] = None
S[np.argsort(S, axis=1) >= 5] = None
#np.argsort(S.iloc[0:10, 0:10], axis=0) >= 5
#S.shape
S

In [None]:
rating_matrix = ratings.pivot_table(index="userID", columns="itemID", values="rating")

rating_matrix = rating_matrix.iloc[0:10, 0:15]
#(rating_matrix - rating_matrix.mean(axis=1)).sum(axis=1)

print(rating_matrix)

print(rating_matrix.mean(axis=1))

print(rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows'))
print(rating_matrix.subtract(rating_matrix.mean(axis=1), axis='rows').sum(axis=1))
#np.array([1, 2, 3, 4, 5, 6, 7, 8])

#print(rating_matrix.subtract(np.arange(0, 10), axis='rows'))

In [None]:
np.sum(rating_matrix.count(axis=1))
(rating_matrix.count())

In [None]:
data = {
   "value": range(12),
   "variable": ["A"] * 3 + ["B"] * 3 + ["C"] * 3 + ["D"] * 3,
   "date": pd.to_datetime(["2020-01-03", "2020-01-04", "2020-01-05"] * 4)
}


df = pd.DataFrame(data)
df

In [None]:
df.pivot(index="date", columns="variable", values="value")

In [None]:
df.pivot(index="date", columns="variable", values="value").reset_index()