In [1]:
# python 2.7 with anaconda

import numpy as np
from sklearn import datasets
from scipy.stats import mode
import scipy as sp
import pandas as pd

In [2]:
# Data set: MovieLens 20M (http://grouplens.org/datasets/movielens/20m/)
# Every CSV of the dump is read into its own DataFrame; the paths are
# relative to the notebook's working directory.
genome_scores, genome_tags, links, movies, ratings, tags = map(pd.read_csv, (
    'ml-20m/genome-scores.csv',
    'ml-20m/genome-tags.csv',
    'ml-20m/links.csv',
    'ml-20m/movies.csv',
    'ml-20m/ratings.csv',
    'ml-20m/tags.csv',
))

In [3]:
# print(movies.info())

In [4]:
# Show the first rows of every loaded table, in the order they were read.
display(*(table.head() for table in
          (genome_scores, genome_tags, links, movies, ratings, tags)))

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [5]:
def add_genres(movies):
    """Return a copy of ``movies`` whose ``genres`` column holds lists.

    Each pipe-separated genre string (e.g. ``"Comedy|Romance"``) becomes a
    list of genre names; the placeholder ``"(no genres listed)"`` becomes an
    empty list.  The input frame is left untouched.

    movies : DataFrame with a string ``genres`` column (any index)
    returns: new DataFrame with the same index and columns
    """
    new_movies = movies.copy()

    # Build the whole column from the values in one pass.  Iterating over the
    # values (instead of looking rows up by the labels 0..n-1) keeps this
    # working for frames whose index is not a default RangeIndex, e.g. a
    # filtered or re-indexed frame.
    new_movies['genres'] = [
        [] if genres == "(no genres listed)" else genres.split('|')
        for genres in movies['genres']
    ]

    return new_movies

# Convert the genres of every movie.  Passing only `movies.head()` here would
# leave `movies_with_genres` (and, after the next cell, `movies`) with just
# the first five titles.
movies_with_genres = add_genres(movies)
# display(movies_with_genres.head())

In [6]:
# From this cell on, `movies` is the frame with list-valued genres.
movies = movies_with_genres
# Lowest rating in the data set (ratings run from 0.5 to 5.0 stars).
display(ratings['rating'].min())

0.5

In [7]:
# Dense user x movie rating table built from the first million rating rows:
# one row per userId, one column per movieId.  (user, movie) pairs without a
# rating are filled with 0.
user_rating = (
    ratings.head(1000000)
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

# Plain NumPy view of the same table.  `.values` is used because
# `DataFrame.as_matrix()` is deprecated and no longer exists in pandas >= 1.0.
user_rating_mx = user_rating.values

In [8]:
from sklearn.model_selection import train_test_split

# generate test data
def gen_test_data(user_rating_mx, test_size=0.25):
    """Sample a held-out set of known ratings from a rating matrix.

    Every entry greater than 0 is treated as a known rating.  A fraction
    ``test_size`` of those entries is drawn with ``train_test_split`` using a
    fixed ``random_state`` of 2018, so the sample is reproducible.

    user_rating_mx : 2-D NumPy array of ratings
    test_size      : share of the known ratings to return (passed through to
                     ``train_test_split``)
    returns        : ``zip(*samples)``, i.e. a pair ``(positions, ratings)``
                     where ``positions`` holds ``(row, column)`` index tuples
                     and ``ratings`` the matrix values at those positions

    The matrix itself is not modified, so the returned ratings are still
    present in it.
    """
    # np.nonzero reports positions in row-major order - the same order a
    # nested row/column loop would visit them - so the split below sees the
    # samples in that order without a Python-level loop over every cell.
    rows, cols = np.nonzero(user_rating_mx > 0.0)
    known = user_rating_mx[rows, cols]
    data = [((i, j), rating)
            for i, j, rating in zip(rows.tolist(), cols.tolist(), known)]

    _, test_data = train_test_split(data, test_size=test_size, random_state=2018)
    return zip(*test_data)

In [None]:
# test_x -> (row, column) positions in `user_rating_mx`; each one points at
#           a rating that a user gave to a movie
# test_y -> the ratings stored at those positions, in the same order
test_x, test_y = gen_test_data(user_rating_mx)

In [14]:
# display(user_rating.head())
display(user_rating_mx)
display(test_x[:5], test_y[:5])
# Spot check: the matrix entry at the fifth sampled position should equal the
# fifth sampled rating.  The position is taken from `test_x` instead of being
# typed in by hand, so the check stays valid when the data subset changes.
display(user_rating_mx[test_x[4]])
# display(ratings.tail(10))

array([[ 0. ,  3.5,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  4. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       ..., 
       [ 4. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. , ...,  0. ,  0. ,  0. ],
       [ 4. ,  3. ,  3. , ...,  0. ,  0. ,  0. ]])

((5765, 2349), (3108, 1245), (4361, 10680), (6738, 1747), (455, 2723))

(4.0, 4.0, 5.0, 1.0, 1.0)

1.0

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in display() would only add a stray "None".
user_rating.info()
# First row of the rating table as a NumPy array (`.values` replaces the
# deprecated `as_matrix()`).
display(user_rating.values[0])

In [None]:
def pearson(x, y):
    """Pearson correlation of every row of ``x`` with every row of ``y``.

    A 1-D input is treated as a single row.  The result is a float array of
    shape ``(rows of x, rows of y)`` whose ``[i, j]`` entry is the coefficient
    returned by ``scipy.stats.pearsonr`` for row ``i`` of ``x`` and row ``j``
    of ``y``.
    """
    # Promote single vectors to one-row matrices so both inputs are 2-D.
    rows_x = x.reshape((1, -1)) if x.ndim == 1 else x
    rows_y = y.reshape((1, -1)) if y.ndim == 1 else y

    corr = np.zeros((rows_x.shape[0], rows_y.shape[0]))
    # ndindex walks the result row by row, one (i, j) cell at a time.
    for i, j in np.ndindex(*corr.shape):
        corr[i, j] = sp.stats.pearsonr(rows_x[i, :], rows_y[j, :])[0]

    return corr

In [None]:
# Smoke test for pearson(): correlate the first 2 users with the first 5.
first_users = user_rating.head(10)
pearson_demo = pearson(np.array(first_users[:2]), np.array(first_users[:5]))
print(pearson_demo)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Pairwise cosine distance matrices: first between the rows of the rating
# table (users), then between the rows of its transpose (movies).
user_cos_dist, item_cos_dist = (
    pairwise_distances(rating_rows, metric='cosine', n_jobs=-1)
    for rating_rows in (user_rating, user_rating.T)
)

In [None]:
# User-user matrix first, then the movie-movie matrix.
display(user_cos_dist, item_cos_dist)

In [None]:
import gc 
# Run a full garbage-collection pass now.  Presumably meant to reclaim memory
# after the large matrices built in the cells above - TODO confirm it is
# still needed.
gc.collect()

In [None]:
def evaluate_rmse(user_rating_mx, test_x, test_y):
    """Root-mean-square error of matrix entries against held-out ratings.

        user_rating_mx : 2-D array of ratings to score, indexed [row, column]
        test_x         : sequence of (row, column) index pairs into the matrix
        test_y         : expected rating for each pair in test_x (same order)

        Returns the RMSE as a float.
        Raises ValueError when test_x and test_y differ in length or are empty.

        NOTE(review): this cell previously held only an unfinished signature,
        so the contract above is an assumption.  To score a predictor, pass
        its matrix of predicted ratings as user_rating_mx - confirm.
    """
    if len(test_x) != len(test_y):
        raise ValueError('test_x and test_y must have the same length')
    if len(test_x) == 0:
        raise ValueError('at least one test sample is required')

    # Split the (row, column) pairs into two index vectors for fancy indexing.
    positions = np.asarray(test_x, dtype=int).reshape(-1, 2)
    stored = np.asarray(user_rating_mx)[positions[:, 0], positions[:, 1]]

    residuals = stored - np.asarray(test_y, dtype=float)
    return float(np.sqrt(np.mean(residuals ** 2)))

In [None]:
def user_based_predict(similarity, movie_ratings):
    """ predict single movie rating for a single user

        similarity    : vector of similarity of the given user to every user
        movie_ratings : every user's rating of the movie (vector); entries
                        equal to 0 are counted as "not rated"

        Returns dot(similarity, movie_ratings) divided by the number of
        non-zero ratings, or nan when nobody rated the movie.
    """
    cnt = np.count_nonzero(movie_ratings)
    if cnt == 0:
        # No ratings to average: return nan explicitly instead of computing
        # 0 / 0 and triggering a NumPy runtime warning.
        return np.nan
    # TODO:
    #  - normalize ratings
    # NOTE(review): dividing by the number of raters (rather than by the sum
    # of their similarities) does not give a similarity-weighted average of
    # the ratings - confirm this is intended.
    return similarity.dot(movie_ratings) / float(cnt)

In [None]:
# `user_cos_dist` holds cosine *distances* (1 - cosine similarity), while
# user_based_predict expects *similarities*, so convert before weighting.
display(user_based_predict(1.0 - user_cos_dist[0], user_rating_mx[:, 0]))

In [None]:
# Matrix dimensions, then the mean of each row (one value per user).  The
# mean runs over every column, so the 0 entries are included.
display(user_rating_mx.shape, user_rating_mx.mean(axis=1))