# User_ratings_based Model Algorithm

The model compares a targeted user from the database, DB, (two csv tables) to a randomly_picked group of users, then is choosing the most similar one, in terms of his ratings to a similar movies. 

The algorithm is picking a different group of users from the DB every time (in case user is not happy with the results).

***DB is long, takes about 1 min to get prediction for a new targeted user

In [1]:
# Dependencies
# import matplotlib.pyplot as plt
# %matplotlib inline

import os
import pandas as pd
import numpy as np
import math
#import tensorflow as tf
from math import pow, sqrt
#from sklearn.model_selection import train_test_split

#os.environ['KMP_DUPLICATE_LIB_OK']='True'

#from tensorflow import keras
import scipy.stats
import scipy.spatial
import scipy.stats as st
#from sklearn.model_selection import KFold
import random
from sklearn.metrics import mean_squared_error


In [2]:
# Reading movies dataset into a pandas dataframe object.
movies = pd.read_csv("data/ml-latest/movies.csv", low_memory=False, encoding='utf-8')
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Reading ratings dataset into a pandas dataframe object.
ratings = pd.read_csv("data/ml-latest/ratings.csv", low_memory=False, encoding='utf-8')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [4]:
# Getting number of users and movies from the dataset.
user_ids = ratings.userId.unique().tolist()
movie_ids = ratings.movieId.unique().tolist()
#tag_ids = tags.movieId.unique().tolist()
print('Number of Uniq-Users: {}'.format(len(user_ids)))
print('Number of Movies: {}'.format(len(movie_ids)))
#print('Number of Movies Tags: {}'.format(len(tag_ids)))

Number of Uniq-Users: 283228
Number of Movies: 53889


In [5]:
ratings['userId'].astype('int')

0                1
1                1
2                1
3                1
4                1
             ...  
27753439    283228
27753440    283228
27753441    283228
27753442    283228
27753443    283228
Name: userId, Length: 27753444, dtype: int64

In [6]:
ratings = ratings.sort_values(["userId"], ascending=False)


In [7]:
del ratings['timestamp']
ratings.head()

Unnamed: 0,userId,movieId,rating
27753443,283228,54286,4.5
27753358,283228,942,4.5
27753360,283228,947,5.0
27753361,283228,950,5.0
27753362,283228,955,5.0


In [8]:
ratings_short = ratings[:1000000]

print(len(ratings_short))

1000000


In [9]:
ratings_short.head()

Unnamed: 0,userId,movieId,rating
27753443,283228,54286,4.5
27753358,283228,942,4.5
27753360,283228,947,5.0
27753361,283228,950,5.0
27753362,283228,955,5.0


In [10]:
min_uid = min(ratings_short['userId'])
max_uid = max(ratings_short['userId'])           
print(min_uid, max_uid)

273134 283228


In [11]:
print(len(ratings_short['userId'].unique().tolist()), len(ratings_short['userId']))

10095 1000000


In [12]:
ratings_short.to_csv("output/ratings_short.csv")

In [125]:
ratings_short = pd.read_csv("output/ratings_short.csv")
del ratings_short["Unnamed: 0"]
ratings_short.head()

Unnamed: 0,userId,movieId,rating
0,283228,54286,4.5
1,283228,942,4.5
2,283228,947,5.0
3,283228,950,5.0
4,283228,955,5.0


In [16]:
# Getting number of users and movies from the dataset.
user_ids = ratings_short.userId.unique().tolist()
movie_ids = ratings_short.movieId.unique().tolist()
#tag_ids = tags.movieId.unique().tolist()
print('Number of Uniq-Users: {}'.format(len(user_ids)))
print('Number of Movies: {}'.format(len(movie_ids)))
#print('Number of Movies Tags: {}'.format(len(tag_ids)))


Number of Uniq-Users: 10095
Number of Movies: 21285


In [6]:
# Separting movie title and year part using split function.
split_values = movies['title'].str.split("(", n = 1, expand = True)
split_values.head()

Unnamed: 0,0,1
0,Toy Story,1995)
1,Jumanji,1995)
2,Grumpier Old Men,1995)
3,Waiting to Exhale,1995)
4,Father of the Bride Part II,1995)


In [7]:
# setting 'title' values to title part.
movies.title = split_values[0]

In [8]:
# creating 'release_year' column.
movies['release_year'] = split_values[1]


In [9]:
# Cleaning the release_year series.
movies['release_year'] = movies.release_year.str.replace(')','')


In [10]:
movies.head()


Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [11]:
splited_years = movies['release_year'].str.split("(", n = 1, expand = True)
#splited_years.fillna(0)
splited_years.head()


Unnamed: 0,0,1
0,1995,
1,1995,
2,1995,
3,1995,
4,1995,


In [12]:
splited_years[1] = splited_years[1].replace('nan', np.nan).fillna(0)
splited_years.head()


Unnamed: 0,0,1
0,1995,0
1,1995,0
2,1995,0
3,1995,0
4,1995,0


In [13]:
splited_years[1][29]


'1995'

In [14]:
splited_years[0][29]


'Yao a yao yao dao waipo qiao '

In [15]:
print(splited_years[1][27])


0


In [16]:
print(splited_years[0][27])


1995


In [17]:
for i in range(len(splited_years)):
    if (splited_years[1][i] !=0):
        splited_years[0][i]=splited_years[1][i]


In [18]:
splited_years[0][29]


'1995'

In [20]:
movies['release_year_cleaned'] = splited_years[0]
movies.head()


Unnamed: 0,movieId,title,genres,release_year,release_year_cleaned
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995,1995
2,3,Grumpier Old Men,Comedy|Romance,1995,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,1995
4,5,Father of the Bride Part II,Comedy,1995,1995


In [21]:
del movies['release_year']
#movies.head(100)


In [22]:
#movies.to_csv("output/movies_encoded_v3.csv")


In [23]:
movies = pd.read_csv("output/movies_encoded_v3.csv")
del movies['Unnamed: 0']
movies.head()


Unnamed: 0,movieId,title,genres,release_year_cleaned
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [17]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# Getting the rating given by a user to a movie.
def get_rating_(userid, movieid):
    return (ratings_short.loc[(ratings_short.userId==userid) & (ratings_short.movieId == movieid),'rating'].iloc[0])


In [19]:
# Getting the list of all movie ids the specified user has rated.
def get_movieids_(userid):
    return (ratings_short.loc[(ratings_short.userId==userid),'movieId'].tolist())


In [20]:
# Getting the movie titles against the movie id.
def get_movie_title_(movieid):
    return (movies.loc[(movies.movieId == movieid),'title'].iloc[0])


In [21]:
def distance_similarity_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity        score is to be calculated.
    '''
    # Count of movies watched by both the users.
    both_watch_count = 0
    for element in ratings_short.loc[ratings_short.userId==user1,'movieId'].tolist():
        if element in ratings_short.loc[ratings_short.userId==user2,'movieId'].tolist():
            both_watch_count += 1
    if both_watch_count == 0 :
        return 0
    
    # Calculating distance based similarity between both the users.
    distance = []
    for element in ratings_short.loc[ratings_short.userId==user1,'movieId'].tolist():
        if element in ratings_short.loc[ratings_short.userId==user2,'movieId'].tolist():
            rating1 = get_rating_(user1,element)
            rating2 = get_rating_(user2,element)
            distance.append(pow(rating1 - rating2, 2))
    total_distance = sum(distance)
    
    # Adding one to the denominator to avoid divide by zero error.
    return 1/(1+sqrt(total_distance))



In [301]:
#print('Distance based similarity between user ids 101 & 3107: {} '.format(distance_similarity_score(101,3107)))


Distance based similarity between user ids 101 & 3107: 0.11787845042814334 


In [22]:
def pearson_correlation_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity score is to be calculated.
    '''
    # A list of movies watched by both the users.
    both_watch_count = []
    
    # Finding movies watched by both the users.
    for element in ratings_short.loc[ratings_short.userId==user1,'movieId'].tolist():
        if element in ratings_short.loc[ratings_short.userId==user2,'movieId'].tolist():
            both_watch_count.append(element)
    
    # Returning '0' correlation for bo common movies.
    if len(both_watch_count) == 0 :
        return 0
    
    # Calculating Co-Variances.
    rating_sum_1 = sum([get_rating_(user1,element) for element in both_watch_count])
    rating_sum_2 = sum([get_rating_(user2,element) for element in both_watch_count])
    rating_squared_sum_1 = sum([pow(get_rating_(user1,element),2) for element in both_watch_count])
    rating_squared_sum_2 = sum([pow(get_rating_(user2,element),2) for element in both_watch_count])
    product_sum_rating = sum([get_rating_(user1,element) * get_rating_(user2,element) for element in both_watch_count])
    
    # Returning pearson correlation between both the users.
    numerator = product_sum_rating - ((rating_sum_1 * rating_sum_2) / len(both_watch_count))
    denominator = sqrt((rating_squared_sum_1 - pow(rating_sum_1,2) / len(both_watch_count)) * (rating_squared_sum_2 - pow(rating_sum_2,2) / len(both_watch_count)))
    
    # Handling 'Divide by Zero' error.
    if denominator == 0:
        return 0
    return numerator/denominator


In [303]:
#print('Pearson Corelation between user ids 101 & 3107: {}'.format(pearson_correlation_score(101,3107)))


Pearson Corelation between user ids 101 & 3107: -0.11417114183449117


In [91]:
def most_similar_users_(user1, number_of_users, index, metric='pearson'):
    '''
    user1 : Targeted User
    number_of_users : number of most similar users you want to user1.
    metric : metric to be used to calculate inter-user similarity score. ('pearson' or else)
    '''
    # Getting distinct user ids.
    user_ids = ratings_short.userId.unique().tolist()
    
    sid=index
    eid=index+10
    
    print(sid, eid)
    
    # Getting similarity score between targeted and every other suer in the list(or subset of the list).
    if(metric == 'pearson'):
        similarity_score = [(pearson_correlation_score(user1,nth_user),nth_user) for nth_user in user_ids[sid:eid] if nth_user != user1]
    else:
        similarity_score = [(distance_similarity_score(user1,nth_user),nth_user) for nth_user in user_ids[sid:eid] if nth_user != user1]
    
    # Sorting in descending order.
    similarity_score.sort()
    similarity_score.reverse()
    
    # Returning the top most 'number_of_users' similar users. 
    return similarity_score[:number_of_users]


In [92]:
print(most_similar_users_(283228, 2, 283133))

283133 283143
[]


In [36]:
sim_users_ids[0][1]


107

In [37]:
sindex = sim_users_ids[0][1]
print(sindex)


107


In [93]:
def get_most_similar_users_(userid, index, user_ids):
#    user_ids = ratings.userId.unique().tolist()
    aint = []
    for a in user_ids:
        aint.append(int(a))
        
    min_uid = min(aint)
    
    ratings_suser_ids = []
    llength = min_uid + len(user_ids)
#    min_uid = min(user_ids)
    print("starting user random index : ", min_uid, llength)
    
    sin = random.randint(0, llength-10)
    print("picked: ", sin)
#    user_ids = ratings.userId.unique().tolist()[sin:sin+index]
    
    for i in range(sin,sin+index): 
#        print("i between sin and sin+index ", i)
        
        print(f"at index {i} range between {sin} and {sin+index}")
        
        sim_users_ids = most_similar_users_(userid, 2, i)
        
        score = sim_users_ids[0][0]
        sindex = sim_users_ids[0][1]
        print("sim_score and it's user_id ", score, sindex)
        
        ratings_suser_ids.append(sindex)
        
#        ratings_suser_ids.append(user_ids[sindex]-1)
    
    return ratings_suser_ids


In [94]:
sin =277336
index=1*10
range(sin,sin+index,10)

range(277336, 277346, 10)

In [95]:
for i in range(sin,sin+index):
    print(i)

277336
277337
277338
277339
277340
277341
277342
277343
277344
277345


In [40]:
reccom_users = []
reccom_movies = []

In [96]:
def get_recommendation_(userid, steps):
    
    user_ids = ratings_short.userId.unique().tolist()
#    new_user_ids = df.userId.unique().tolist()
 
    user_list = get_most_similar_users_(userid, steps*10, user_ids)
        
    total = {}
    similariy_sum = {}
    
    for user in user_list:
        
        # not comparing the user to itself (obviously!)
        if user == userid:
            continue
        
        # Getting similarity score between the users.
        score = pearson_correlation_score(userid,user)
        print("main_function score ", score)
        
        # not considering users having zero or less similarity score.
        if score <= 0:
            continue
        
        # Getting weighted similarity score and sum of similarities between both the users.
        for movieid in get_movieids_(user):
            # Only considering not watched/rated movies
            if movieid not in get_movieids_(userid) or get_rating_(userid,movieid) == 0:
       #         print("entered 'if' under main_ ")
                total[movieid] = 0
                total[movieid] += get_rating_(user,movieid) * score
                similariy_sum[movieid] = 0
                similariy_sum[movieid] += score
    
    # Normalizing ratings
    ranking = [(tot/similariy_sum[movieid],movieid) for movieid,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    
    # Getting movie titles against the movie ids.
   
    recommendations = [get_movie_title_(movieid) for score,movieid in ranking]
#     if len(reccomendaions) ==0:
#          print("try one more time (or rate more movies)")
#     else:
#         print("try one more time (or rate more movies)")
    return recommendations[:10]


In [42]:
# Getting the movie id against the movie title.
def get_movie_id_(movietitle):
    return (movies.loc[(movies.title == movietitle),'movieId'].iloc[0])

In [43]:
new_movieId = get_movie_id_('Indiana Jones and the Last Crusade (1989)')
print("new id is: ", new_movieId)

new id is:  1291


In [49]:
movie_list_ids = ratings_short.movieId.unique().tolist()
movie_list =[]
for movie_id in movie_list_ids:
    movie_list.append(get_movie_title_(movie_id))
    

In [57]:
l = len(movie_list)    
print(len(movie_list), movie_list[l-67], movie_list[l-78], movie_list[l-22])

21285 I Killed John Lennon (2005) Helter Skelter (2012) Almighty Thor (2011)


In [106]:
num_of_movies = int(input("Enter the unumber of movies you're rating "))

new_movieId = []
new_ratings = []
n = num_of_movies
# print(n, type(n))

for i in range(n):
    movie_title = input("Enter the movie name: ")
    new_rating=float(input("Enter it's rating: "))
    
    new_movieId.append(get_movie_id_(movie_title))
    new_ratings.append(new_rating)
    

Enter the unumber of movies you're rating 3
Enter the movie name: Almighty Thor (2011)
Enter it's rating: 2.8
Enter the movie name: Helter Skelter (2012)
Enter it's rating: 3.7
Enter the movie name: I Killed John Lennon (2005)
Enter it's rating: 4.5


In [118]:
print(new_movieId)


[88349, 140239, 181437]


In [119]:
print(new_ratings)

[2.8, 3.7, 4.5]


In [109]:
movie_list = new_movieId
ratings_list = new_ratings

In [114]:
# new_df = pd.DataFrame()

In [115]:
last_user_id = ratings_short.userId[0]
last_user_id

0    283228
0    283229
Name: userId, dtype: int64

In [116]:
new_user_id = ratings_short.userId[0] +1
userin= new_user_id
print(userin)

0    283229
0    283230
Name: userId, dtype: int64


In [124]:
ratings_short.head()

Unnamed: 0,userId,movieId,rating
0,283228,54286,4.5
1,283228,942,4.5
2,283228,947,5.0
3,283228,950,5.0
4,283228,955,5.0


In [127]:
userin = ratings_short["userId"][0]+1
userin

283229

In [128]:
new_df = pd.DataFrame({"userId":userin, "movieId":new_movieId, "rating": new_ratings})
new_df.tail()

Unnamed: 0,userId,movieId,rating
0,283229,88349,2.8
1,283229,140239,3.7
2,283229,181437,4.5


In [129]:
ratings_short = ratings_short.append(new_df)
ratings_short.tail()

Unnamed: 0,userId,movieId,rating
999998,273134,3753,2.0
999999,273134,3948,1.0
0,283229,88349,2.8
1,283229,140239,3.7
2,283229,181437,4.5


In [130]:
len(ratings_short)

1000003

In [131]:
len(ratings_short.userId.unique().tolist())

10096

In [132]:
print(userin)


283229


In [97]:
#userin = int(input("Enter your user id (printed on the 'left') for a reccomendation: "))
recc_list = get_recommendation_(userin, 1)

#print(f"Found reccomendations for user {userin} with 1 set at random start: {recc_list}")

if len(recc_list) ==0:
    print("try one more time (or rate more movies)")
else:
    print(f"Found reccomendations for user {userin} with 1 set at random start: {recc_list}")
    

starting user random index :  273134 283230
picked:  175587
at index 175587 range between 175587 and 175597
175587 175597


IndexError: list index out of range

In [87]:
 print(most_similar_users_(userin, 2, 283133))

283133 283143
[]


In [88]:
sim_users_ids = most_similar_users_(userin, 2, 283133)
        
print(sim_users_ids[0][0]) 
print(sim_users_ids[0][1])

283133 283143


IndexError: list index out of range

In [35]:

# reccom_users.append(userin)

# for rec in recc_list:
#     reccom_movies.append(rec)
    

In [36]:
# print(reccom_users)
# print("--"*20)
# print(reccom_movies)

[8, 8, 4598]
----------------------------------------
['Alien³ (a.k.a. Alien 3) (1992)', 'Shining, The (1980)', 'Monty Python and the Holy Grail (1975)', "Monty Python's Life of Brian (1979)", 'Silence of the Lambs, The (1991)', 'Dances with Wolves (1990)', 'Mask, The (1994)', 'Once Were Warriors (1994)', 'Rob Roy (1995)', 'Braveheart (1995)', 'Alien³ (a.k.a. Alien 3) (1992)', 'Shining, The (1980)', 'Monty Python and the Holy Grail (1975)', "Monty Python's Life of Brian (1979)", 'Silence of the Lambs, The (1991)', 'Dances with Wolves (1990)', 'Mask, The (1994)', 'Once Were Warriors (1994)', 'Rob Roy (1995)', 'Braveheart (1995)', 'Dunkirk (2017)', 'Arrival (2016)', 'Casino Royale (2006)', 'Incredibles, The (2004)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode IV - A New Hope (1977)', 'Batman Begins (2005)', 'Indiana Jones 

In [46]:
# Getting the list of all user_ids the specified movie has been rated by.
def get_userids_(movieid):
    return (ratings.loc[(ratings.movieId==movieid),'userId'].tolist())


In [47]:
# Getting the movie title with specified movieid.
def get_movietitle_(movieid):
    return (movies.loc[(movies.movieId==movieid),'title'].iloc[0])
