# User-User Collaborative Filtering in Recommendation System

# Libraries

In [2]:
import os
import math
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
from keras import backend as K

# 

# Reading the datasets

In [3]:
dataset = pd.read_csv('data/Dataset.csv')
movie_id_mapping = pd.read_csv('data/Movie_Id_Titles.csv')

users = sorted(dataset['user_id'].unique())
movies_len = dataset['item_id'].nunique()
_matrix = {}

for user in users: _matrix[user] = [None]*movies_len

# 

# Creating the user movie rating matrix
Each row will represent a distinct user and each column will represent a distinct movie. The rating given by the user for a particular movie will be the value of the matrix. If the rating for a specific movie by a specific user is unavailable, the value will be None or null. This matrix can be saved for improving the recommendation engine's performance and efficiency.

In [4]:
users = sorted(dataset['user_id'].unique())
movies_len = dataset['item_id'].nunique()
_matrix = {}

for user in users: 
    _matrix[user] = [None]*movies_len
    
for user in users:
    _movies = dataset[(dataset['user_id']==user)][['item_id','rating']]
    for i in _movies['item_id']: _matrix[user][i-1] = list(_movies[_movies['item_id']==i]['rating'])[0]
        
pd.DataFrame([_matrix[user] for user in users],users).to_csv('matrix.csv',index=False)

del _matrix

matrix = pd.read_csv('./matrix.csv')
matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


# 

# Normalizing the user movie matrix values
The rating value will be normalized by subtracting the mean value given by a user for all the movies he/she has ever rated. The null values will also be filled using the same mean that we subtracted.

In [5]:
def normalize_and_fillna(x):
    _mean = x.mean()
    x = x-_mean
    x = x.fillna(_mean)
    return x
    
matrix = matrix.apply(normalize_and_fillna, axis=1)

# 

# Calculating similarity between 2 vectors using cosine
Creating a function to calculate the degree between 2 vectors using cosine. If smaller the degree, higher the similarity.

cos(theta) = (a . b) / |a|.|b|

In [6]:
def calc_cos_data(a,b,print_data=False):
    mod_a = sum([i**2 for i in a])**0.5
    mod_b = sum([i**2 for i in b])**0.5

    numerator = sum([a[i]*b[i] for i in range(2)])
    denom = mod_a*mod_b

    cos_theta = numerator/denom
    theta = np.arccos(cos_theta)
    degree = math.degrees(theta)
    
    if print_data:
        print('cos theta: ' + str(round(cos_theta,5)))
        print('cosine similarity: '+str(round(1-cos_theta,5)))
        print('theta: ' + str(theta) +' rad')
        print('degree: ' + str(degree) +' degree')
    else: return degree

# 

# Calculating the similarity matrix between users
Each row and column will represent a distinct user and the degree of similarity will be the value present in the matrix. The smaller the degree, higher the similarity. This matrix can be saved for improving the recommendation engine's performance and efficiency.

In [7]:
similarity_matrix = []

for user1 in users:
    vals = [calc_cos_data(matrix.iloc[user1],matrix.iloc[user2]) for user2 in users]
    similarity_matrix.append(vals)

np.array(similarity_matrix).shape

pd.DataFrame(similarity_matrix).to_csv('user_user_similarity_matrix.csv', index=False)

similar_users_matrix = pd.read_csv('user_user_similarity_matrix.csv')

# 

# Making the recommendations
To recommend movies for a user "u", we will do the following sequentially.

Find n number of similar users using the similarity matrix [smaller the value, higher the similarity].
Find movies that have been rated greater than or equal to a specific rating value "v" using the user-movie rating matrix.
Find the common movies in the whole set, which will be the movies to recommend.
Remove the movies which the user "u" has already watched [the user has watched the movie if he has rated it] and recommend them.

In [198]:
number_of_similar_users = 5
idx = 1
values = similar_users_matrix.iloc[idx]

similar_users = set(sorted(range(len(values)), key=lambda i: values[i])[:number_of_similar_users])
similar_users.discard(idx)
similar_users = list(similar_users)

print(f'similar users for user {idx}: {similar_users}')

matrix = pd.read_csv('./matrix.csv')

vals = [list(matrix.iloc[i].values) for i in similar_users]

good_ratings = {}

for j in range(5):
    user = similar_users[j]
    i = vals[j]
    v = [index for index in range(len(i)) if i[index]==5]
    good_ratings[user] = set(v)

common_movies = set()

for i in range(5):
    if len(common_movies) == 0: common_movies = common_movies.union(good_ratings[similar_users[i]])
    else: common_movies = common_movies.intersection((good_ratings[similar_users[i]]))

unseen_movies = []
_temp = list(matrix.iloc[idx].isnull())

for i in range(len(_temp)):
    if _temp[i]==True: unseen_movies.append(i)

unseen_movies = set(unseen_movies)

common_movies = common_movies.intersection(unseen_movies)
recommendations = pd.DataFrame({'item_id':list(common_movies)})
recommendations

similar users for user 1: [774, 551, 466, 405, 22]


Unnamed: 0,item_id
0,402


# 

# Creating a general function to perform the recommendation
You can generate the recommendations by simply calling the recommend function by passing in the following:

user_index: the user for whom we have to make the recommendation
number_of_similar_users: the number of similar users to use for making the recommendation
min_rating: the minimum rating required for a movie to be consider as liked by a user

In [200]:
#  function to find n similar users for user index using the similar users matrix file
def find_similar_users(user_index, number_of_similar_users=5):
    idx = user_index
    values = similar_users_matrix.iloc[idx]

    similar_users = set(sorted(range(len(values)), key=lambda i: values[i])[:number_of_similar_users])
    similar_users.discard(idx)
    similar_users = list(similar_users)
    return similar_users


#  find top rated movies by users
def find_good_rated_movies(users, min_rating=5):
    vals = [list(matrix.iloc[i].values) for i in users]
    good_ratings = {}

    for j in range(len(users)):
        user = users[j]
        i = vals[j]
        v = [index for index in range(len(i)) if i[index]>=min_rating]
        good_ratings[user] = set(v)
    return good_ratings

def find_common_movies_list(user_index, similar_users, good_ratings):
    idx = user_index
    common_movies = set()

    for i in range(len(similar_users)):
        if len(common_movies) == 0: common_movies = common_movies.union(good_ratings[similar_users[i]])
        else: common_movies = common_movies.intersection((good_ratings[similar_users[i]]))

    unseen_movies = []
    _temp = list(matrix.iloc[idx].isnull())

    for i in range(len(_temp)):
        if _temp[i]==True: unseen_movies.append(i)

    unseen_movies = set(unseen_movies)

    common_movies = common_movies.intersection(unseen_movies)
    recommendations = pd.DataFrame({'item_id':list(common_movies)})
    recommendations = recommendations.merge(movie_id_mapping,how='left',on='item_id')
    return recommendations

def recommend(user_index, number_of_similar_users=5, minimum_good_rating=5):
    print(f'user: {user_index}')
    similar_users = find_similar_users(user_index,number_of_similar_users)
    print(f'similar users: {similar_users}')
    good_ratings = find_good_rated_movies(similar_users,minimum_good_rating)
    recommendations = find_common_movies_list(user_index,similar_users,good_ratings)
    return recommendations

# 

# Predictions!

In [220]:
user_idx = 276
number_of_similar_users = 10
min_rating = 4

data = recommend(user_idx,number_of_similar_users,min_rating)
data.head(10)

user: 276
similar users: [90, 7, 328, 524, 269, 334, 181, 758, 85, 474]


Unnamed: 0,item_id,title
0,480,North by Northwest (1959)
1,513,"Third Man, The (1949)"
2,482,Some Like It Hot (1959)
3,133,Gone with the Wind (1939)
4,198,Nikita (La Femme Nikita) (1990)
5,169,"Wrong Trousers, The (1993)"
6,653,Touch of Evil (1958)
7,526,Ben-Hur (1959)
8,508,"People vs. Larry Flynt, The (1996)"
9,656,M (1931)


# 

# 

# Item-based Collaborative Filtering in Recommendation System

# Importing Libraries & Datasets

In [153]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

rating = pd.read_csv("data/Dataset.csv")
movie = pd.read_csv("data/Movie_Id_Titles.csv")
df = pd.merge(movie, rating, on='item_id')

# Average Rating and Number of Ratings + Pivot Table!
ratings = pd.DataFrame(df.groupby('title')['rating'].mean())
ratings['rating_numbers'] = pd.DataFrame(df.groupby('title')['rating'].count())
movieRate = df.pivot_table(index='user_id', columns='title', values='rating')
movieRate.fillna(0, inplace=True)

# Recommendation system
def recommendMovies(name , min_rating_count = 50):
    user_rating = movieRate[name]
    similar_movies = movieRate.corrwith(user_rating)
    corr_movies = pd.DataFrame(similar_movies, columns=['Correlation'])
    corr_movies.dropna(inplace=True)
    corr_movies = corr_movies.join(ratings['rating_numbers'], how='left', lsuffix='_left', rsuffix='_right')
    final = corr_movies[corr_movies['rating_numbers']>min_rating_count].sort_values('Correlation', ascending=False)
    dfff = final.merge(df,how='left',on='title')
    dfff = dfff.groupby(['title', 'Correlation', 'rating_numbers', 'item_id']).size().to_frame()[0].to_frame().reset_index()
    dfff = dfff[['title','Correlation','rating_numbers','item_id']].sort_values(by=['Correlation'], ascending=False)
    return dfff.head(20)

# 

# `FINAL` 

# Merging IB-CF & GCN

In [148]:
def both(fav_movie):
    df = recommendMovies(fav_movie)
    ser1 = df['title']
    ser2 = df['item_id']

    def predict(user_id):
        prediction = []
        for x in (ser2):
            a = model.predict([(user_id, x)])
            prediction.append(a)
        return prediction

    top = predict(0)

    # combine the Series and list into a DataFrame
    df = pd.DataFrame({'Title': ser1, 'Your Predicted Rating': top})
    df = df.reset_index(drop=True)
    df = df.sort_values(by=['Your Predicted Rating'], ascending=False)
    return df

# 

In [156]:
both('Game, The (1997)')



Unnamed: 0,Title,Your Predicted Rating
16,Good Will Hunting (1997),[[4.1326284]]
10,L.A. Confidential (1997),[[4.060441]]
12,Titanic (1997),[[4.035684]]
11,Contact (1997),[[3.646411]]
5,Air Force One (1997),[[3.511013]]
0,"Game, The (1997)",[[3.46806]]
4,"Devil's Advocate, The (1997)",[[3.3981512]]
3,"Edge, The (1997)",[[3.3469694]]
6,Kiss the Girls (1997),[[3.3126547]]
14,Cop Land (1997),[[3.3033464]]


# 

# Thank you!