In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
import pickle

def pickle_store(obj, filename):
    with open(filename, 'wb') as file:
        pickle.dump(obj, file)
        
def pickle_load(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [5]:
movies_path = os.path.join("ml-25m", "movies.csv")
ratings_path = os.path.join("ml-25m", "ratings.csv")

user_mean_path = os.path.join("ml-25m", "user_mean.csv")
ratings_avg_path = os.path.join("ml-25m", "ratings_avg.csv")
ratings_avg2_path = os.path.join("ml-25m", "ratings_real_avg.csv")

final_path = os.path.join("ml-25m", "final.csv")
cosine_path = os.path.join("ml-25m", "cosine.csv")

pivot_table_path = os.path.join("ml-25m", "pivot_table.sparse")

Collecting movies and users IDs

In [None]:
iteration = 0
movies_ids = np.array([])
users_ids = np.array([])
headers_tm = time.time()
for chunk in pd.read_csv(ratings_path, chunksize=100000):
    movies_ids = np.append(movies_ids, chunk['movieId'].unique())
    users_ids = np.append(users_ids, chunk['userId'].unique())
    iteration += 1
movies_ids = np.unique(movies_ids) 
users_ids = np.unique(users_ids) 
print("Took {:.3f}".format(time.time()-headers_tm))

In [8]:
print("iterations: ", iteration)
print("dif movies are: ", movies_ids.shape)
print("dif users are: ", users_ids.shape)

iterations:  251
dif movies are:  (59047,)
dif users are:  (162541,)


### Creating PivotTable

In [None]:
starting_tm = time.time()
pivot_table = csr_matrix((len(users_ids), len(movies_ids)))
for chunk in pd.read_csv(ratings_path, chunksize=100000):
    loop_tm = time.time()
    rows = [i-1 for i in chunk['userId'].tolist()]
    cols = [np.where(movies_ids == j)[0][0] for j in chunk['movieId']]
    ratings = chunk['rating'].tolist()
    avg_ratings = sum(ratings)/len(ratings)
    fixed_ratings = [rate - avg_ratings for rate in ratings]
    vals = fixed_ratings
    pivot_table[rows, cols] = vals
    print("Took {:.3f}".format(time.time()-loop_tm))
print("Took {:.3f}".format(time.time()-starting_tm))

In [5]:
#pickle_store(pivot_table, pivot_table_path)
pivot_table = pickle_load(pivot_table_path)
print("PivoTable was stored")

PivoTable was stored


# USER-Based

---

## Getting Predictions

In [12]:
Rating_avg = pd.read_csv(ratings_path)
Rating_avg = Rating_avg.astype({"movieId": str})
Movie_user = Rating_avg.groupby(by = 'userId')['movieId'].apply(lambda x:','.join(x))

### Get target and similar users

In [56]:
target_user_id =  1
print(pivot_table.shape, "--", pivot_table.getrow(target_user_id-1).shape)

(162541, 59047) -- (1, 59047)


In [None]:
similarity_tm = time.time()
user_similarities = cosine_similarity(pivot_table, pivot_table.getrow(target_user_id-1))
print("Took {:.3f}".format(time.time()-similarity_tm))
print(user_similarities.shape)
user_similarities

In [15]:
similar_users = (-user_similarities).argsort(axis=0)
most_similar_users = (similar_users[1:11]).squeeze().tolist()  # the first is itself

### Get the movies which similar users has seen
    

In [16]:
movies_seen_by_similar_users = []
for user in most_similar_users:
    movies_seen_by_similar_users.extend(pivot_table.getrow(user).nonzero()[1])
movies_seen_by_similar_users = set(movies_seen_by_similar_users)
print(len(movies_seen_by_similar_users))

217


In [17]:
movies_seen_by_target_user = pivot_table.getrow(target_user_id-1).nonzero()[1]
print(len(movies_seen_by_target_user))

70


In [18]:
movies_under_consideration = list(set(movies_seen_by_similar_users) - set(movies_seen_by_target_user))
print(len(movies_under_consideration))

196


In [37]:
movie_avg_ratings = []
for movie in movies_under_consideration:
    movie_ratings = pivot_table[most_similar_users, movie].toarray().squeeze().tolist()
    # movie_avg_ratings.append(sum(movie_ratings)/len(movie_ratings))
    movie_avg_ratings.append(sum([r*user_similarities[sid][0] for r,sid in zip(movie_ratings, most_similar_users)])/sum([user_similarities[sid][0] for sid in most_similar_users]))

In [38]:
best_movies_indexes = np.array(movie_avg_ratings).argsort()[:4].tolist()
predictions = [int(movies_ids[idx]) for idx in best_movies_indexes]
print(predictions)

[180, 90, 164, 28]


# ITEM-Based
---

In [43]:
target_user_id =  1

(1, 59047)

Movies that target user has not seen

In [49]:
target_ratings = pivot_table.getrow(target_user_id -1)
movies_that_user_has_seen = pivot_table.getrow(target_user_id -1).nonzero()[1].tolist()

In [68]:
target_movie_id = 2  # example
print(pivot_table.transpose().shape)
print(pivot_table.getcol(target_movie_id).transpose().shape)

(59047, 162541)
(1, 162541)


In [69]:
similarity_movies = cosine_similarity(pivot_table.transpose(), pivot_table.getcol(target_movie_id).transpose())


Top 20 similar movies that have been rated by target user

In [83]:
indicative_movies = []
similar_movies = (-similarity_movies).argsort(axis=0)
most_similar_movies = (similar_movies[1:]).squeeze().tolist()  # put here the max selected movies
print(len(most_similar_movies))
accepted_movies = []
for movie in most_similar_movies:
    if movie in movies_that_user_has_seen:
        accepted_movies.append(movie)
print(len(accepted_movies))

accepted_movies = [ movie for movie in most_similar_movies if movie in movies_that_user_has_seen]
print(len(accepted_movies))

59046
70
70


Predicting the rate for that movie

In [88]:
print(accepted_movies[0])
print(target_user_id)

print("user rating of that movie is ", pivot_table[target_user_id-1, accepted_movies[0]])
print("similarity of that movie with the target", similarity_movies[accepted_movies[0]])

1923
1
user rating of that movie is  -1.085315
similarity of that movie with the target [0.09717981]


In [96]:
movie_rating = sum([pivot_table[target_user_id-1, movie]*similarity_movies[movie] for movie in accepted_movies]) \
               / sum([similarity_movies[movie] for movie in accepted_movies])


In [97]:
movie_rating


array([-179.80693675])

## Sum Up

In [98]:
def predict_rating_for_movie(target_movie_id, movies_seen_by_target_user):
    prediction_time = time.time()
    similarity_movies = cosine_similarity(pivot_table.transpose(), pivot_table.getcol(target_movie_id).transpose())
    similar_movies = (-similarity_movies).argsort(axis=0)
    most_similar_movies = (similar_movies[1:]).squeeze().tolist()  # put here the max selected movies
    accepted_movies = []
    for movie in most_similar_movies:
        if movie in movies_that_user_has_seen:
            accepted_movies.append(movie)
    print(len(accepted_movies))
    accepted_movies = [ movie for movie in most_similar_movies if movie in movies_that_user_has_seen]
    movie_rating = sum([pivot_table[target_user_id-1, movie]*similarity_movies[movie] for movie in accepted_movies]) \
               / sum([similarity_movies[movie] for movie in accepted_movies])
    print("Prediction took took {:.3f}".format(time.time()-prediction_time))
    return movie_rating
    

In [99]:
def item_based_prediction(target_user_id):
    process_tm = time.time()
    movies_seen_by_target_user = pivot_table.getrow(target_user_id -1).nonzero()[1].tolist()
    movies_rate_predictions = []
    for target_movie_id in range(len(movies_ids)):
        if movie in movies_that_user_has_seen:
            continue
        movies_rate_predictions.append(
            (target_movie_id, predict_rating_for_movie(target_movie_id, movies_seen_by_target_user))
        )
    movies_rate_predictions = sorted(movies_rate_predictions, key=lambda tup: tup[1])[:20]
    predicted_movies = [movies_ids[movie_index] for movie_index, _ in movies_rate_predictions]
    
    print("Whole process took {:.3f}".format(time.time()-process_tm))
    print("Results: ", movies_rate_predictions)
    print("Real Results: ", movies_rate_predictions)

    return movies_rate_predictions

In [100]:
     
 target_user_id = 1
 item_based_prediction(target_user_id)



TypeError: only integer scalar arrays can be converted to a scalar index