In [2]:
%load_ext autoreload
%autoreload 2
%pylab inline

from collections import defaultdict, OrderedDict, Counter
from copy import deepcopy
import time
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from scipy.stats import poisson
from datetime import date, datetime
import uuid
import os
import csv
import ujson
import requests
import io
from pprint import pprint
from dateutil import parser

from tputils import *

Populating the interactive namespace from numpy and matplotlib


In [3]:
ratings = csv_load("uucf_user_movie.csv", delimiter=";")
movie_names = ratings[0][1:]
user_names = [row[0] for row in ratings[1:]]
ratings = [row[1:] for row in ratings[1:]]
ratings = [list(map(lambda x: "NaN" if x == "" else x, row)) for row in ratings]
ratings = np.array(ratings, dtype=float)

nr_users, nr_movies = ratings.shape

print("user names", user_names)
print("movie_names", movie_names)
print(ratings)
print("nr_users", nr_users)
print("nr_movies", nr_movies)

assert len(user_names) == nr_users
assert len(movie_names) == nr_movies

user names ['1648', '5136', '918', '2824', '3867', '860', '3712', '2968', '3525', '4323', '3617', '4360', '2756', '89', '442', '3556', '5261', '2492', '5062', '2486', '4942', '2267', '4809', '3853', '2288']
movie_names ['11: Star Wars: Episode IV - A New Hope (1977)', '12: Finding Nemo (2003)', '13: Forrest Gump (1994)', '14: American Beauty (1999)', '22: Pirates of the Caribbean: The Curse of the Black Pearl (2003)', '24: Kill Bill: Vol. 1 (2003)', '38: Eternal Sunshine of the Spotless Mind (2004)', '63: Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', '77: Memento (2000)', '85: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', '98: Gladiator (2000)', '105: Back to the Future (1985)', '107: Snatch (2000)', '114: Pretty Woman (1990)', '120: The Lord of the Rings: The Fellowship of the Ring (2001)', '121: The Lord of the Rings: The Two Towers (2002)', '122: The Lord of the Rings: The Return of the King (2003)', '134: O Brother Where Art Thou? (2000)', '141: Don

# Some useful functions

In [4]:
def get_user_names(user_ids):
    return [user_names[user_id] for user_id in user_ids]

def get_movie_names(movie_ids):
    return [movie_names[movie_id] for movie_id in movie_ids]

user_name2id = {user_name: i for i, user_name in enumerate(user_names)}
movie_name2id = {movie_name: i for i, movie_name in enumerate(movie_names)}

# User similarity

In [5]:
from scipy.stats import pearsonr

user_sim = np.zeros((nr_users, nr_users))
for u in range(nr_users):
    for v in range(nr_users):
        mask = ~np.isnan(ratings[u, :]) & ~np.isnan(ratings[v, :])
        if sum(mask) > 0:
            pearson_r, p_value = pearsonr(ratings[u, mask], ratings[v, mask])
            user_sim[u, v] = pearson_r
        else:
            user_sim[u, v] = np.nan
        
        
assert abs(user_sim[user_name2id["1648"], user_name2id["5136"]] - 0.40298) < 1e-4, user_sim[user_name2id["1648"], user_name2id["5136"]]
assert abs(user_sim[user_name2id["918"], user_name2id["2824"]] - (-0.31706)) < 1e-4, user_sim[user_name2id["918"], user_name2id["2824"]]


# Most similar users

In [6]:
def get_most_similar_users(u, user_sim, n):
    
    users = list(range(u)) + list(range(u+1, nr_users))
    similarity = np.concatenate((user_sim[u, :u], user_sim[u, u+1:]))
    
    similarity_users = sorted(zip(similarity, users), reverse=True)[:n]
    similarity, users = zip(*similarity_users)
    return list(users), list(similarity)
    





user_ids, similarity = get_most_similar_users(user_name2id["3867"], user_sim, n=5)
pprint(list(zip(get_user_names(user_ids), similarity)))
      
user_ids, similarity = get_most_similar_users(user_name2id["89"], user_sim, n=5)
pprint(list(zip(get_user_names(user_ids), similarity)))
      
user_ids, similarity = get_most_similar_users(user_name2id["3712"], user_sim, n=5)
pprint(list(zip(get_user_names(user_ids), similarity)))

[('2492', 0.4766832805451795),
 ('3853', 0.46411014776485632),
 ('2486', 0.43899155441463594),
 ('3712', 0.40027450425381639),
 ('2288', 0.37985626502293046)]
[('4809', 0.66851595362183658),
 ('5136', 0.56244873874488843),
 ('860', 0.53906585139435725),
 ('5062', 0.52599044278760332),
 ('3525', 0.47549485227568483)]
[('2824', 0.46291004988627571),
 ('3867', 0.40027450425381639),
 ('5062', 0.24769327229404756),
 ('442', 0.2271298649307886),
 ('3853', 0.19365960183726968)]


In [29]:
users_avg_ratings = np.nanmean(ratings, axis=1)

# Predictions

In [51]:
n_most_similar_users = 5
top_n_movies = 100

def get_movies_predictions(u, ratings, user_sim, top_n_movies, normalized):
    neighbors, weights = get_most_similar_users(u, user_sim, n_most_similar_users)
    movies_predictions = [get_movie_prediction(u, m, ratings, neighbors, weights, normalized) for m in range(nr_movies)]
    
    movies_predictions = sorted(enumerate(movies_predictions), key=lambda x: x[1] if not  np.isnan(x[1]) else -100, reverse=True)
    movies_ids, movies_predictions = zip(*movies_predictions[:top_n_movies])
    
    return movies_ids, movies_predictions
        
def get_movie_prediction(u, m, ratings, neighbors, weights, normalized):
    neighbors_weights = [(n, w) for n, w in zip(neighbors, weights) if not np.isnan(ratings[n, m])]
    if len(neighbors_weights) == 0:
        return np.nan
    neighbors, weights = zip(*neighbors_weights)
    neighbors = list(neighbors)
    
    if normalized:
        movie_prediction = np.average(ratings[neighbors, m] - users_avg_ratings[neighbors], weights=weights)
        movie_prediction += users_avg_ratings[u]
    else:    
        movie_prediction = np.average(ratings[neighbors, m], weights=weights)
    
    return movie_prediction

user_id = user_name2id["89"]
movies, predictions = get_movies_predictions(user_id, ratings, user_sim, top_n_movies, normalized=True)
pprint(list(zip(get_movie_names(movies), predictions)))

[('238: The Godfather (1972)', 5.3220150330439706),
 ('278: The Shawshank Redemption (1994)', 5.2614236920709665),
 ('275: Fargo (1996)', 5.2411105801616555),
 ('807: Seven (a.k.a. Se7en) (1995)', 5.2019839141918345),
 ("424: Schindler's List (1993)", 5.1992230020594077),
 ('122: The Lord of the Rings: The Return of the King (2003)',
  5.1868459060526453),
 ('105: Back to the Future (1985)', 5.0730476796983268),
 ('550: Fight Club (1999)', 5.0699389280767129),
 ('13: Forrest Gump (1994)', 5.04907412942908),
 ('680: Pulp Fiction (1994)', 5.0468144938108459),
 ('120: The Lord of the Rings: The Fellowship of the Ring (2001)',
  5.0186833009116798),
 ('121: The Lord of the Rings: The Two Towers (2002)', 5.0186833009116798),
 ('77: Memento (2000)', 4.9776398501099095),
 ('329: Jurassic Park (1993)', 4.9479905062556027),
 ('568: Apollo 13 (1995)', 4.9359848968242321),
 ('8587: The Lion King (1994)', 4.933097664419841),
 ('38: Eternal Sunshine of the Spotless Mind (2004)', 4.9306256550140946)

In [43]:
a = np.array([1, 2, 3])
a[(2, 2, 1)]

IndexError: too many indices for array