# Non-Personalized Recommendations

### What is a popular item?
"""Is it the item with most ratings total, or with the greatest number of high ratings?
You could also consider if the most popular items are more controversial, and look at what content items have the highest variance in ratings.
Decide on what chart you want to show, then calculate it globally and one for each genre. Save the results as CSV. 
"""

## My Approach to Non-Personalized Recommendations:
* Prefer movies that have had recent reviews - shows popular activity
    * People always want to know what has been watched lately
* Prefer movies with five ratings or more
* Score movie ratings by mean, down adjusted by low confidence level on the distribution (see below)
* All else equal, prefer movies with more reviews (but not always true; as we'll likely see when personal recommendations are done)

In [1]:
import re
import statistics
from itertools import chain
from collections import Counter  

from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 

In [2]:
# Data from https://github.com/sidooms/MovieTweetings
# user_id::movie_id::rating::rating_timestamp. 
rdf = pd.read_csv("MovieTweetings-master/latest/ratings.dat", 
                 sep="::", engine="python",
                 names=["user_id", "movie_id", "rating", "rating_timestamp"] )

rdf['rating_timestamp'] = pd.to_datetime(rdf['rating_timestamp'], unit='s')
rdf

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,2013-10-05 21:00:50
1,2,75314,1,2020-07-23 01:42:04
2,2,102926,9,2020-05-22 11:46:56
3,2,114369,10,2020-08-16 05:22:27
4,2,118715,8,2020-07-29 07:13:18
...,...,...,...,...
888447,69322,9784456,6,2020-07-27 00:40:13
888448,69322,9898858,3,2020-04-04 00:00:52
888449,69323,172495,10,2020-04-17 07:03:35
888450,69323,414387,10,2020-04-17 07:17:32


In [3]:
# Helper functions
strip_parens = re.compile(r"\s+\(.*\)")
text ="In My Room (2020)"
# strip_parens.sub("", text)
def drop_parens(text):
    return strip_parens.sub("", text)
def extract_year(text):
    return text[text.rfind("(") + 1 : text.rfind(")")]
# extract_year(text)
# extract_year('Remélem legközelebb sikerül meghalnod:) (2018)')

In [4]:
# movies.dat
# Contains the items (i.e., movies) that were rated in the tweets,
# together with their genre metadata in the following 
# format: movie_id::movie_title (movie_year)::genre|genre|genre. For example:

# 0110912::Pulp Fiction (1994)::Crime|Thriller

mdf = pd.read_csv("MovieTweetings-master/latest/movies.dat", 
                 sep="::", engine="python",
                 names=["movie_id", "movie_title", "genres"] )
mdf.genres.fillna(value='', inplace=True)
mdf['title'] = mdf.movie_title.apply(drop_parens)
mdf['movie_year'] = mdf.movie_title.apply(extract_year)
mdf.movie_year = mdf.movie_year.astype('int')
mdf['genre_list'] = mdf.genres.apply(lambda x: x.split("|"))
del mdf['movie_title']
mdf

Unnamed: 0,movie_id,genres,title,movie_year,genre_list
0,8,Documentary|Short,Edison Kinetoscopic Record of a Sneeze,1894,"[Documentary, Short]"
1,10,Documentary|Short,La sortie des usines Lumière,1895,"[Documentary, Short]"
2,12,Documentary|Short,The Arrival of a Train,1896,"[Documentary, Short]"
3,25,,The Oxford and Cambridge University Boat Race,1895,[]
4,91,Short|Horror,Le manoir du diable,1896,"[Short, Horror]"
...,...,...,...,...,...
36378,12749596,Horror,Host,2020,[Horror]
36379,12762684,Short|Drama,In My Room,2020,"[Short, Drama]"
36380,12875782,Action|Drama|Fantasy|Sci-Fi,Freaks: You're One of Us,2020,"[Action, Drama, Fantasy, Sci-Fi]"
36381,12888462,Documentary,My Octopus Teacher,2020,[Documentary]


In [5]:
genres = set(chain.from_iterable(mdf.genre_list.tolist()))
genres.remove('')
print(len(genres), genres)

28 {'Biography', 'Family', 'Drama', 'Adult', 'Horror', 'Music', 'War', 'Western', 'Action', 'Thriller', 'Short', 'Game-Show', 'Film-Noir', 'Reality-TV', 'Sport', 'Musical', 'History', 'Animation', 'Fantasy', 'Comedy', 'Adventure', 'Mystery', 'News', 'Documentary', 'Crime', 'Romance', 'Sci-Fi', 'Talk-Show'}


In [6]:
votes =[]
for idx, row in mdf.sample(100).iterrows():
    movie_id = row['movie_id']
    ratings = rdf.query(f'movie_id =={movie_id}')['rating'].tolist()
    votes.append( (row['title'], ratings ))

votes = [tmp for tmp in votes if len(tmp[1]) >3 ]
vote = votes[0]
print(len(vote[1]), sum(vote[1]), vote)

6 42 ('Cavalcade', [7, 8, 8, 7, 5, 7])


# Discounting the Mean Rating
The mean of ratings is a good start but we want a less confident value that is based on the edge of the vote distribution. 

In [7]:
nd = statistics.NormalDist.from_samples(vote[1])
nd.inv_cdf(0.02), round(nd.inv_cdf(0.02)), statistics.mean(vote[1]) 

(4.750230788390579, 5, 7)

### If we have enough ratings, we can create a Normal Distribution, and take the 2% value--so that we are 98% confident that the true mean value of the votes is at least that value.

In [8]:
votes_nums =[]
discounted_means = []
votes_means = []
votes_variances = [] 
latest_ratings = []
discount = 0.02
for movie_id in tqdm(mdf['movie_id'], total=len(mdf)):
    tmpdf  = rdf.query(f'movie_id == {movie_id}')
    ratings = tmpdf['rating'].tolist()
    newest_rating = sorted(tmpdf['rating_timestamp'].tolist(), reverse=True)[0]
    latest_ratings.append(int(newest_rating.strftime('%Y%M')))
    num_ratings = len(ratings)
    if num_ratings >= 2:
        sigma = statistics.stdev(ratings)
        mu = statistics.fmean(ratings)
        var = statistics.variance(ratings, xbar=mu)
        if sigma <= 0:
            discounted_means.append(0) 
            votes_nums.append(num_ratings)            
            votes_means.append(mu)
            votes_variances.append(var)
            continue
    else:
            discounted_means.append(0) 
            votes_nums.append(num_ratings)            
            votes_means.append(0)
            votes_variances.append(0)
            continue    
    discounted_means.append(round(statistics.NormalDist(mu=mu, sigma=sigma).inv_cdf(discount)))
    votes_nums.append(num_ratings)            
    votes_means.append(mu)
    votes_variances.append(var)

mdf['num_votes'] = votes_nums
mdf['discounted_mean'] = discounted_means
mdf['mean_vote'] = votes_means
mdf['vote_variance'] = votes_variances
mdf['latest_rating'] = latest_ratings

100%|██████████| 36383/36383 [02:19<00:00, 260.47it/s]


In [9]:
mdf

Unnamed: 0,movie_id,genres,title,movie_year,genre_list,num_votes,discounted_mean,mean_vote,vote_variance,latest_rating
0,8,Documentary|Short,Edison Kinetoscopic Record of a Sneeze,1894,"[Documentary, Short]",1,0,0.000000,0.000000,201420
1,10,Documentary|Short,La sortie des usines Lumière,1895,"[Documentary, Short]",1,0,0.000000,0.000000,201415
2,12,Documentary|Short,The Arrival of a Train,1896,"[Documentary, Short]",1,0,0.000000,0.000000,201516
3,25,,The Oxford and Cambridge University Boat Race,1895,[],1,0,0.000000,0.000000,201704
4,91,Short|Horror,Le manoir du diable,1896,"[Short, Horror]",3,4,6.000000,1.000000,201948
...,...,...,...,...,...,...,...,...,...,...
36378,12749596,Horror,Host,2020,[Horror],14,2,6.428571,4.263736,202052
36379,12762684,Short|Drama,In My Room,2020,"[Short, Drama]",1,0,0.000000,0.000000,202019
36380,12875782,Action|Drama|Fantasy|Sci-Fi,Freaks: You're One of Us,2020,"[Action, Drama, Fantasy, Sci-Fi]",6,4,5.666667,0.666667,202004
36381,12888462,Documentary,My Octopus Teacher,2020,[Documentary],5,5,7.800000,1.700000,202033


In [10]:
# Create CSVs
# Top.csv
# {genre_name}.csv
# as derived from: https://github.com/kimfalk/live-project/blob/master/recs/live_project_popularity_recommender.py
# What are the expected column headers?
# https://github.com/kimfalk/live-project/blob/master/recommender/views.py#L59

In [11]:
top = mdf.query("num_votes >= 5").sort_values(["latest_rating", "discounted_mean", "num_votes"], ascending=False).head(1000)
top.to_csv('Top.csv', index=False)
top

Unnamed: 0,movie_id,genres,title,movie_year,genre_list,num_votes,discounted_mean,mean_vote,vote_variance,latest_rating
10571,167261,Adventure|Drama|Fantasy,The Lord of the Rings: The Two Towers,2002,"[Adventure, Drama, Fantasy]",335,7,8.973134,1.307659,202059
2800,57058,Drama,Le feu follet,1963,[Drama],20,7,8.650000,0.976316,202059
11957,275277,Animation|Action|Crime|Drama|Sci-Fi|Thriller,Cowboy Bebop: Tengoku no tobira,2001,"[Animation, Action, Crime, Drama, Sci-Fi, Thri...",15,7,9.133333,1.409524,202059
21633,1920885,Animation|Adventure|Drama|Fantasy|Romance,Dayu haitang,2016,"[Animation, Adventure, Drama, Fantasy, Romance]",6,7,8.833333,0.566667,202059
8559,109830,Drama|Romance,Forrest Gump,1994,"[Drama, Romance]",894,6,9.032438,1.682037,202059
...,...,...,...,...,...,...,...,...,...,...
35314,8784956,Action|Crime|Drama|Mystery|Thriller,Ava,2020,"[Action, Crime, Drama, Mystery, Thriller]",20,1,4.450000,2.365789,202050
11076,209475,Comedy|Romance,The Wedding Planner,2001,"[Comedy, Romance]",17,1,6.117647,5.485294,202050
14371,436339,Animation|Action|Adventure|Comedy|Family|Fanta...,G-Force,2009,"[Animation, Action, Adventure, Comedy, Family,...",12,1,5.083333,3.901515,202050
33654,6811280,Short|Sci-Fi,Predicament in Sight,2017,"[Short, Sci-Fi]",11,0,6.000000,8.200000,202050


In [12]:
for genre in genres:
    top = mdf.query(f"genres.str.contains('{genre}') and num_votes >= 5").sort_values(["latest_rating", "discounted_mean", "num_votes"], ascending=False).head(100)
    if len(top) > 0:
        top.to_csv(f'{genre}.csv', index=False)