# User_ratings_based Model Algorithm

The model compares a targeted user from the database, DB, (two csv tables) to a randomly_picked group of users, then is choosing the most similar one, in terms of his ratings to a similar movies. 

The algorithm is picking a different group of users from the DB every time (in case user is not happy with the results).

***DB is long, takes about 1 min to get prediction for a new targeted user

In [1]:
# Dependencies

import os
import pandas as pd
import numpy as np
import math
from math import pow, sqrt

import scipy.stats
import scipy.spatial
import scipy.stats as st
import random
from sklearn.metrics import mean_squared_error


In [148]:
# Reading movies dataset into a pandas dataframe object.
movies = pd.read_csv("data/ml-latest/movies.csv", low_memory=False, encoding='latin-1')
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [149]:
len(movies)

58098

In [150]:
len(movies['title'].unique())

58020

In [151]:
movies_clean = pd.read_csv("data/ml-latest/tk_movies.csv", low_memory=False, encoding='latin-1')
movies_clean.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,year,genre,country,director,writer,production_company,actors,description,avg_vote,keywords,title_year,combined_features,img_url
0,0,0,Hypocrites,1915,Drama,USA,LoisWeber,LoisWeber,Hobart Bosworth Productions,CourtenayFoote MyrtleStedman HerbertStanding A...,The parallel stories of a modern preacher and ...,6.6,parallel stories modern preacher medieval monk...,Hypocrites (1915),Hypocrites LoisWeber LoisWeber 1915 Drama Dra...,https://m.media-amazon.com/images/M/MV5BMjA5NT...
1,1,1,Tillie's Punctured Romance,1914,Comedy,USA,MackSennett CharlesBennett,,Keystone Film Company,MarieDressler CharlesChaplin MabelNormand Mack...,A con man from the city dupes a wealthy countr...,6.3,con man city dupes wealthy country girl marriage.,Tillie's Punctured Romance (1914),Tillie's Punctured Romance MackSennett Charles...,https://m.media-amazon.com/images/M/MV5BMTc5MT...
2,2,2,The Cheat,1915,"Drama, Romance",USA,CecilB.DeMille,HectorTurnbull JeanieMacpherson,Jesse L. Lasky Feature Play Company,FannieWard SessueHayakawa JackDean JamesNeill ...,"A venal, spoiled stockbroker's wife impulsivel...",6.6,"venal, spoiled stockbroker's wife impulsively ...",The Cheat (1915),The Cheat CecilB.DeMille CecilB.DeMille 1915 ...,https://m.media-amazon.com/images/M/MV5BMjEzMj...
3,3,3,"20,000 Leagues Under the Sea",1916,"Action, Adventure, Sci-Fi",USA,StuartPaton,JulesVerne,Williamson Submarine Film Corporation,DanHanlon EdnaPendleton CurtisBenton AllenHolu...,A French professor and his daughter accompany ...,6.2,French professor daughter accompany Captain Ne...,"20,000 Leagues Under the Sea (1916)","20,000 Leagues Under the Sea StuartPaton Stuar...",https://m.media-amazon.com/images/M/MV5BODYxOT...
4,4,4,Shoulder Arms,1918,"Comedy, War",USA,CharlesChaplin,CharlesChaplin,Charles Chaplin Productions,EdnaPurviance CharlesChaplin SydChaplin LoyalU...,Charlie is a boot camp private who has a dream...,7.3,Charlie boot camp private dream hero goes dari...,Shoulder Arms (1918),Shoulder Arms CharlesChaplin CharlesChaplin 1...,https://m.media-amazon.com/images/M/MV5BODQ0Mz...


In [152]:
len(movies_clean)

14067

In [153]:
movies_links = movies_clean[['title_year', 'img_url']]
movies_links.head()

Unnamed: 0,title_year,img_url
0,Hypocrites (1915),https://m.media-amazon.com/images/M/MV5BMjA5NT...
1,Tillie's Punctured Romance (1914),https://m.media-amazon.com/images/M/MV5BMTc5MT...
2,The Cheat (1915),https://m.media-amazon.com/images/M/MV5BMjEzMj...
3,"20,000 Leagues Under the Sea (1916)",https://m.media-amazon.com/images/M/MV5BODYxOT...
4,Shoulder Arms (1918),https://m.media-amazon.com/images/M/MV5BODQ0Mz...


In [154]:
movies_links = movies_links.rename(columns={'title_year':'title'})
movies_links.head()

Unnamed: 0,title,img_url
0,Hypocrites (1915),https://m.media-amazon.com/images/M/MV5BMjA5NT...
1,Tillie's Punctured Romance (1914),https://m.media-amazon.com/images/M/MV5BMTc5MT...
2,The Cheat (1915),https://m.media-amazon.com/images/M/MV5BMjEzMj...
3,"20,000 Leagues Under the Sea (1916)",https://m.media-amazon.com/images/M/MV5BODYxOT...
4,Shoulder Arms (1918),https://m.media-amazon.com/images/M/MV5BODQ0Mz...


In [155]:
len(movies_links)

14067

In [156]:
movies_tot = pd.merge(movies, movies_links, how='left', on='title')
movies_tot.head()

Unnamed: 0,movieId,title,genres,img_url
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men (1995),Comedy|Romance,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II (1995),Comedy,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [157]:
len(movies_tot)

58100

In [158]:
movies_tot.tail()

Unnamed: 0,movieId,title,genres,img_url
58095,193876,The Great Glinka (1946),(no genres listed),
58096,193878,Les tribulations d'une caissiÃ¨re (2011),Comedy,
58097,193880,Her Name Was Mumu (2016),Drama,
58098,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,
58099,193886,Leal (2018),Action|Crime|Drama,


In [160]:
movies_tot = movies_tot.fillna(0)
movies_tot.tail()

Unnamed: 0,movieId,title,genres,img_url
58095,193876,The Great Glinka (1946),(no genres listed),0
58096,193878,Les tribulations d'une caissiÃ¨re (2011),Comedy,0
58097,193880,Her Name Was Mumu (2016),Drama,0
58098,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,0
58099,193886,Leal (2018),Action|Crime|Drama,0


In [161]:
len(movies_tot)

58100

In [162]:
movies_tot.to_csv('output/movies_urls.csv')

In [14]:
movies = pd.read_csv("output/movies_urls.csv",low_memory=False, encoding='latin-1')
del movies['Unnamed: 0']
movies.head()

Unnamed: 0,movieId,title,genres,img_url
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men (1995),Comedy|Romance,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II (1995),Comedy,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [28]:
len(movies)

58100

In [163]:
# Reading ratings dataset into a pandas dataframe object.
ratings = pd.read_csv("output/ratings_short.csv", low_memory=False, encoding='latin-1')
del ratings["Unnamed: 0"]
ratings.head()

Unnamed: 0,userId,movieId,rating
0,283228,54286,4.5
1,283228,3578,5.0
2,283228,44761,4.5
3,283228,34405,4.5
4,283228,8712,4.5


In [164]:
len(ratings)

1000000

In [165]:
# Getting number of users and movies from the dataset.
user_ids = ratings.userId.unique().tolist()
movie_ids = ratings.movieId.unique().tolist()

print('Number of Uniq-Users: {}'.format(len(user_ids)))
print('Number of Movies: {}'.format(len(movie_ids)))

Number of Uniq-Users: 10095
Number of Movies: 21285


In [166]:
ratings['userId'].astype('int')

0         283228
1         283228
2         283228
3         283228
4         283228
           ...  
999995    273134
999996    273134
999997    273134
999998    273134
999999    273134
Name: userId, Length: 1000000, dtype: int64

In [167]:
min_uid = min(ratings['userId'])
max_uid = max(ratings['userId'])           
print(min_uid, max_uid)

273134 283228


In [168]:
print(len(ratings['userId'].unique().tolist()), len(ratings['userId']))

10095 1000000


In [169]:
inna_list = ['Futurama', 'Matrix', 'Shrek', 'Fifth Element', 'Minority Report', 'Mars Attack', 'Amelie']
inna_rating = [5, 4.7, 4.8, 4.9, 4.6, 4.3, 4.1]

In [170]:
titles = movies['title'].unique().tolist()
actual_titles=[]

for title in titles:
    for i in range(len(inna_list)):
        if inna_list[i] in title:
            actual_titles.append(title)
actual_titles

['Mars Attacks! (1996)',
 'Fifth Element, The (1997)',
 'Matrix, The (1999)',
 'Shrek (2001)',
 "Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le) (2001)",
 'Minority Report (2002)',
 'Matrix Reloaded, The (2003)',
 'Matrix Revolutions, The (2003)',
 'Shrek 2 (2004)',
 'Shrek the Third (2007)',
 "Futurama: Bender's Big Score (2007)",
 'Futurama: The Beast with a Billion Backs (2008)',
 "Futurama: Bender's Game (2008)",
 'Shrek the Halls (2007)',
 'Futurama: Into the Wild Green Yonder (2009)',
 'Shrek Forever After (a.k.a. Shrek: The Final Chapter) (2010)',
 'Scared Shrekless (2010)',
 'Shrek the Musical (2013)',
 'Return to Source: The Philosophy of The Matrix (2004)',
 'Armitage: Dual Matrix (2002)',
 'The Matrix Revisited (2001)',
 'Futurama: The Lost Adventure (2008)',
 'The Living Matrix (2009)',
 'Matrix of Evil (2003)']

In [171]:
import random
ext_ratings=[]
for title in actual_titles:
    for i in range(len(inna_rating)):
        rf = (random.randint(0, 1))/10
        if inna_list[i] in title:
            ext_ratings.append(inna_rating[i]+rf)
ext_ratings

[4.3,
 4.9,
 4.8,
 4.8999999999999995,
 4.199999999999999,
 4.699999999999999,
 4.7,
 4.8,
 4.8,
 4.8999999999999995,
 5.0,
 5.0,
 5.0,
 4.8999999999999995,
 5.0,
 4.8,
 4.8,
 4.8,
 4.8,
 4.8,
 4.7,
 5.0,
 4.7,
 4.8]

In [172]:
inna_id = ratings['userId'][0]+1
inna_id

283229

In [173]:
movieids= movies['movieId'].unique().tolist()
mtitles = movies['title'].unique().tolist()
found_movieids=[]

for title in actual_titles:
    for i in range(len(mtitles)):
        if title==mtitle[i]:
            found_movieids.append(movieids[i])
found_movieids

[1391,
 1527,
 2571,
 4306,
 4973,
 5445,
 6365,
 6934,
 8360,
 53066,
 56174,
 60147,
 62925,
 64243,
 66282,
 78626,
 81690,
 109941,
 132450,
 157627,
 172151,
 175519,
 179359,
 180957]

In [174]:
print(len(found_movieids), len(actual_titles), len(ext_ratings))

24 24 24


In [175]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,283228,54286,4.5
1,283228,3578,5.0
2,283228,44761,4.5
3,283228,34405,4.5
4,283228,8712,4.5


In [176]:
inna_df = pd.DataFrame({"userId": inna_id, "movieId":found_movieids, "rating":ext_ratings})
inna_df

Unnamed: 0,userId,movieId,rating
0,283229,1391,4.3
1,283229,1527,4.9
2,283229,2571,4.8
3,283229,4306,4.9
4,283229,4973,4.2
5,283229,5445,4.7
6,283229,6365,4.7
7,283229,6934,4.8
8,283229,8360,4.8
9,283229,53066,4.9


In [177]:
ratings = ratings.append(inna_df, sort=False)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,283228,54286,4.5
1,283228,3578,5.0
2,283228,44761,4.5
3,283228,34405,4.5
4,283228,8712,4.5


In [178]:
ratings.tail(50)

Unnamed: 0,userId,movieId,rating
999974,273135,3,3.0
999975,273135,527,5.0
999976,273135,11,3.0
999977,273135,17,4.0
999978,273135,7,3.0
999979,273135,36,4.0
999980,273135,43,4.0
999981,273135,50,4.0
999982,273135,62,3.0
999983,273135,110,4.0


In [179]:
len(ratings)

1000024

In [180]:
ratings.to_csv("output/ratings_short_vi.csv")

In [181]:
ratings = pd.read_csv("output/ratings_short_vi.csv")
del ratings["Unnamed: 0"]
ratings.tail()

Unnamed: 0,userId,movieId,rating
1000019,283229,157627,4.8
1000020,283229,172151,4.7
1000021,283229,175519,5.0
1000022,283229,179359,4.7
1000023,283229,180957,4.8


In [182]:
len(ratings)

1000024

In [183]:
user_ids = ratings['userId'].unique().tolist()
michaels_id = max(user_ids)+1
michaels_id

283230

In [184]:
michaels_list = ['Day of the Dead', 'A Clockwork Orange', 'Re-Animator', 'Cannibal Holocaust', 'Hedwig and the Angry Inch', 'The Burning', 'Eraserhead', 'Turbo Kid', 'The Ghost and Mr. Chicken', 'House of Wax']
michaels_rating = [5.0, 5.0, 4.7, 4.5, 4.3, 4.5, 4.6, 4.4, 4.3, 4.8]


In [185]:
titles = movies['title'].unique().tolist()
actual_titles=[]

for title in titles:
    for i in range(len(michaels_list)):
        if michaels_list[i] in title:
            actual_titles.append(title)
actual_titles

['Bride of Re-Animator (1990)',
 'Re-Animator (1985)',
 'Eraserhead (1977)',
 'Hedwig and the Angry Inch (2000)',
 'House of Wax (1953)',
 'Day of the Dead (1985)',
 'Beyond Re-Animator (2003)',
 'Cannibal Holocaust (1980)',
 'Candyman 3: Day of the Dead (1999)',
 'House of Wax (2005)',
 'Day of the Dead 2: Contagium (2005)',
 'Day of the Dead (2008)',
 'The Burning Dead (2015)',
 'Turbo Kid (2015)',
 'The Burning Hills (1956)',
 'The Burning Court (1962)',
 'Cannibal Holocaust II (1988)',
 'Day of the Dead: Bloodline (2018)']

In [186]:
ext_ratings=[]
for title in actual_titles:
    for i in range(len(michaels_rating)):
        rf = (random.randint(0, 1))/10
        if michaels_list[i] in title:
            ext_ratings.append(michaels_rating[i]+rf)
ext_ratings

[4.8,
 4.8,
 4.6,
 4.3,
 4.8999999999999995,
 5.0,
 4.7,
 4.6,
 5.1,
 4.8,
 5.1,
 5.1,
 4.5,
 4.5,
 4.5,
 4.6,
 4.6,
 5.0]

In [187]:
movieids= movies['movieId'].unique().tolist()
mtitles = movies['title'].unique().tolist()
found_movieids=[]

for title in actual_titles:
    for i in range(len(mtitles)):
        if title==mtitle[i]:
            found_movieids.append(movieids[i])
found_movieids

[3013,
 3018,
 3676,
 4642,
 6629,
 6731,
 7202,
 8906,
 27135,
 33160,
 56432,
 66983,
 139725,
 141493,
 142084,
 150948,
 161181,
 183321]

In [188]:
print(len(found_movieids), len(actual_titles), len(ext_ratings))

18 18 18


In [189]:
michaels_df = pd.DataFrame({"userId": michaels_id, "movieId":found_movieids, "rating":ext_ratings})
michaels_df

Unnamed: 0,userId,movieId,rating
0,283230,3013,4.8
1,283230,3018,4.8
2,283230,3676,4.6
3,283230,4642,4.3
4,283230,6629,4.9
5,283230,6731,5.0
6,283230,7202,4.7
7,283230,8906,4.6
8,283230,27135,5.1
9,283230,33160,4.8


In [190]:
ratings = ratings.append(michaels_df, sort=False)
ratings.tail(20)

Unnamed: 0,userId,movieId,rating
1000022,283229,179359,4.7
1000023,283229,180957,4.8
0,283230,3013,4.8
1,283230,3018,4.8
2,283230,3676,4.6
3,283230,4642,4.3
4,283230,6629,4.9
5,283230,6731,5.0
6,283230,7202,4.7
7,283230,8906,4.6


In [191]:
len(ratings)

1000042

In [192]:
user_ids = ratings.userId.unique().astype('int')

us_start = min(user_ids)
us_end = max(user_ids)
us_start_fake = 1
us_end_fake = us_end - us_start

print(us_end_fake, us_end)

10096 283230


In [193]:
ratings.to_csv("output/ratings_short_vi.csv")

In [195]:
ratings = pd.read_csv("output/ratings_short_vi.csv")
del ratings["Unnamed: 0"]
ratings.tail()

Unnamed: 0,userId,movieId,rating
1000037,283230,141493,4.5
1000038,283230,142084,4.5
1000039,283230,150948,4.6
1000040,283230,161181,4.6
1000041,283230,183321,5.0


In [196]:
user_ids = ratings['userId'].unique().tolist()
ken_id = max(user_ids)+1
ken_id

283231

In [197]:
ken_list = ['Groundhog Day', 'The Martian', 'Princess Bride', 'Casablanca']
ken_rating = [5.0, 5.0, 4.8, 4.6]

In [198]:
titles = movies['title'].unique().tolist()
actual_titles=[]

for title in titles:
    for i in range(len(ken_list)):
        if ken_list[i] in title:
            actual_titles.append(title)
actual_titles

['Casablanca (1942)',
 'Princess Bride, The (1987)',
 'Groundhog Day (1993)',
 'Night in Casablanca, A (1946)',
 'Casablanca Express (1989)',
 'The Martian (2015)',
 "In Casablanca, the Angels Don't Fly (2004)"]

In [199]:
ext_ratings=[]
for title in actual_titles:
    for i in range(len(ken_rating)):
        rf = (random.randint(0, 1))/10
        if ken_list[i] in title:
            ext_ratings.append(ken_rating[i]+rf)
ext_ratings

[4.6, 4.8999999999999995, 5.0, 4.6, 4.699999999999999, 5.0, 4.6]

In [200]:
movieids= movies['movieId'].unique().tolist()
mtitles = movies['title'].unique().tolist()
found_movieids=[]

for title in actual_titles:
    for i in range(len(mtitles)):
        if title==mtitle[i]:
            found_movieids.append(movieids[i])
found_movieids

[912, 1197, 1265, 7933, 119830, 134091, 180543]

In [201]:
print(len(found_movieids), len(actual_titles), len(ext_ratings))

7 7 7


In [202]:
ken_df = pd.DataFrame({"userId": ken_id, "movieId":found_movieids, "rating":ext_ratings})
ken_df

Unnamed: 0,userId,movieId,rating
0,283231,912,4.6
1,283231,1197,4.9
2,283231,1265,5.0
3,283231,7933,4.6
4,283231,119830,4.7
5,283231,134091,5.0
6,283231,180543,4.6


In [203]:
ratings = ratings.append(ken_df, sort=False)
ratings.tail(20)

Unnamed: 0,userId,movieId,rating
1000029,283230,6731,5.0
1000030,283230,7202,4.7
1000031,283230,8906,4.6
1000032,283230,27135,5.1
1000033,283230,33160,4.8
1000034,283230,56432,5.1
1000035,283230,66983,5.1
1000036,283230,139725,4.5
1000037,283230,141493,4.5
1000038,283230,142084,4.5


In [204]:
len(ratings)

1000049

In [205]:
user_ids = ratings.userId.unique().astype('int')

us_start = min(user_ids)
us_end = max(user_ids)
us_start_fake = 1
us_end_fake = us_end - us_start

print(us_end_fake, us_end)

10097 283231


In [206]:
ratings.to_csv("output/ratings_short_vi.csv")