In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [2]:
import pickle

def pickle_store(obj, filename):
    with open(filename, 'wb') as file:
        pickle.dump(obj, file)
        
def pickle_load(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [3]:
movies_path = os.path.join("ml-25m", "movies.csv")
ratings_path = os.path.join("ml-25m", "ratings.csv")

user_mean_path = os.path.join("ml-25m", "user_mean.csv")
ratings_avg_path = os.path.join("ml-25m", "ratings_avg.csv")
ratings_avg2_path = os.path.join("ml-25m", "ratings_real_avg.csv")

final_path = os.path.join("ml-25m", "final.csv")
cosine_path = os.path.join("ml-25m", "cosine.csv")

pivot_table_path = os.path.join("ml-25m", "pivot_table.sparse")
fixed_pivot_table_path = os.path.join("ml-25m", "fixed_pivot_table.sparse")

Collecting movies and users IDs

In [4]:
iteration = 0
movies_ids = np.array([])
users_ids = np.array([])
headers_tm = time.time()
for chunk in pd.read_csv(ratings_path, chunksize=100000):
    movies_ids = np.append(movies_ids, chunk['movieId'].unique())
    users_ids = np.append(users_ids, chunk['userId'].unique())
    iteration += 1
movies_ids = np.unique(movies_ids) 
users_ids = np.unique(users_ids) 
print("Took {:.3f}".format(time.time()-headers_tm))

Took 7.079


In [8]:
print("iterations: ", iteration)
print("dif movies are: ", movies_ids.shape)
print("dif users are: ", users_ids.shape)

iterations:  251
dif movies are:  (59047,)
dif users are:  (162541,)


## Creating PivotTable
---

In [None]:
starting_tm = time.time()
pivot_table = csr_matrix((len(users_ids), len(movies_ids)))
for chunk in pd.read_csv(ratings_path, chunksize=100000):
    loop_tm = time.time()
    rows = [i-1 for i in chunk['userId'].tolist()]
    cols = [np.where(movies_ids == j)[0][0] for j in chunk['movieId']]
    ratings = chunk['rating'].tolist()
    avg_ratings = sum(ratings)/len(ratings)
    fixed_ratings = [rate - avg_ratings for rate in ratings]
    vals = fixed_ratings
    pivot_table[rows, cols] = vals
    print("Took {:.3f}".format(time.time()-loop_tm))
print("Took {:.3f}".format(time.time()-starting_tm))

In [5]:
#pickle_store(pivot_table, pivot_table_path)
pivot_table = pickle_load(pivot_table_path)
print("PivoTable was stored")

PivoTable was stored


# USER-Based

---

## Getting Predictions

In [12]:
Rating_avg = pd.read_csv(ratings_path)
Rating_avg = Rating_avg.astype({"movieId": str})
Movie_user = Rating_avg.groupby(by = 'userId')['movieId'].apply(lambda x:','.join(x))

### Get target and similar users

In [56]:
target_user_id =  1
print(pivot_table.shape, "--", pivot_table.getrow(target_user_id-1).shape)

(162541, 59047) -- (1, 59047)


In [None]:
similarity_tm = time.time()
user_similarities = cosine_similarity(pivot_table, pivot_table.getrow(target_user_id-1))
print("Took {:.3f}".format(time.time()-similarity_tm))
print(user_similarities.shape)
user_similarities

In [15]:
similar_users = (-user_similarities).argsort(axis=0)
most_similar_users = (similar_users[1:11]).squeeze().tolist()  # the first is itself

### Get the movies which similar users has seen
    

In [16]:
movies_seen_by_similar_users = []
for user in most_similar_users:
    movies_seen_by_similar_users.extend(pivot_table.getrow(user).nonzero()[1])
movies_seen_by_similar_users = set(movies_seen_by_similar_users)
print(len(movies_seen_by_similar_users))

217


In [17]:
movies_seen_by_target_user = pivot_table.getrow(target_user_id-1).nonzero()[1]
print(len(movies_seen_by_target_user))

70


In [18]:
movies_under_consideration = list(set(movies_seen_by_similar_users) - set(movies_seen_by_target_user))
print(len(movies_under_consideration))

196


In [37]:
movie_avg_ratings = []
for movie in movies_under_consideration:
    movie_ratings = pivot_table[most_similar_users, movie].toarray().squeeze().tolist()
    # movie_avg_ratings.append(sum(movie_ratings)/len(movie_ratings))
    movie_avg_ratings.append(sum([r*user_similarities[sid][0] for r,sid in zip(movie_ratings, most_similar_users)])/sum([user_similarities[sid][0] for sid in most_similar_users]))

In [38]:
best_movies_indexes = np.array(movie_avg_ratings).argsort()[:4].tolist()
predictions = [int(movies_ids[idx]) for idx in best_movies_indexes]
print(predictions)

[180, 90, 164, 28]


# ITEM-Based
---

In [43]:
target_user_id2 =  1

(1, 59047)

Movies that target user has not seen

In [49]:
target_ratings2 = pivot_table.getrow(target_user_id -1)
movies_that_user_has_seen2 = pivot_table.getrow(target_user_id2 -1).nonzero()[1].tolist()

In [68]:
target_movie_id2 = 2  # example: predict rating for that movie
print(pivot_table.transpose().shape)
print(pivot_table.getcol(target_movie_id2).transpose().shape)

(59047, 162541)
(1, 162541)


In [69]:
similarity_movies2 = cosine_similarity(pivot_table.transpose(), pivot_table.getcol(target_movie_id2).transpose())


Top 20 similar movies that have been rated by target user

In [83]:
similar_movies2 = (-similarity_movies2).argsort(axis=0)
most_similar_movies2 = (similar_movies2[1:]).squeeze().tolist()  # put here the max selected movies
print(len(most_similar_movies2))
accepted_movies2 = []
for movie in most_similar_movies2:
    if movie in movies_that_user_has_seen2:
        accepted_movies2.append(movie)
print(len(accepted_movies2))

accepted_movies2 = [ movie for movie in most_similar_movies2 if movie in movies_that_user_has_seen2]
print(len(accepted_movies2))

59046
70
70


Predicting the rate for that movie

In [88]:
print(accepted_movies2[0])
print(target_user_id2)

print("user rating of that movie is ", pivot_table[target_user_id-1, accepted_movies2[0]])
print("similarity of that movie with the target", similarity_movies2[accepted_movies2[0]])

1923
1
user rating of that movie is  -1.085315
similarity of that movie with the target [0.09717981]


In [96]:
movie_rating2 = sum([pivot_table[target_user_id-1, movie]*similarity_movies2[movie] for movie in accepted_movies2]) \
               / sum([similarity_movies2[movie] for movie in accepted_movies2])


In [97]:
movie_rating2


array([-179.80693675])

## Sum Up

---

In [17]:
def predict_rating_for_movie(target_movie_id, movies_seen_by_target_user):
    prediction_time = time.time()
    similarity_movies = cosine_similarity(pivot_table.transpose(), pivot_table.getcol(target_movie_id).transpose())
    similar_movies = (-similarity_movies).argsort(axis=0)
    most_similar_movies = (similar_movies[1:]).squeeze().tolist()  # put here the max selected movies
    accepted_movies = [ movie for movie in most_similar_movies if movie in movies_seen_by_target_user]
    movie_rating = sum([pivot_table[target_user_id-1, movie]*similarity_movies[movie] for movie in accepted_movies]) \
               / sum([similarity_movies[movie] for movie in accepted_movies])
    # print("Prediction took took {:.3f}".format(time.time()-prediction_time))
    # print(len(accepted_movies))
    return movie_rating[0]
    

In [19]:
# def item_based_prediction(target_user_id):
#     print("Item prediction started!!")
#     process_tm = time.time()
#     movies_seen_by_target_user = pivot_table.getrow(target_user_id -1).nonzero()[1].tolist()
#     movies_rate_predictions = []
#     for target_movie_id, i in enumerate(range(len(movies_ids[:100]))):
#         if target_movie_id in movies_seen_by_target_user:
#             continue
#         movies_rate_predictions.append(
#             (target_movie_id, predict_rating_for_movie(target_movie_id, movies_seen_by_target_user))
#         )
#         # break
#         if i/2%10 == 0:
#             print("processing at: ",i,"% ")
#     movies_rate_predictions = sorted(movies_rate_predictions,reverse=True, key=lambda tup: tup[1])[:20]  ## einai apo mikro pros megalo
#     predicted_movies = [movies_ids[movie_index] for movie_index, _ in movies_rate_predictions]
# 
#     print("Whole process took {:.3f}".format(time.time()-process_tm))
#     print("Results: ", movies_rate_predictions)
#     print("Real Results: ", predicted_movies)
#     return movies_rate_predictions

def item_based_prediction(target_user_id):
    print("Item prediction started!!")
    process_tm = time.time()
    movies_seen_by_target_user = pivot_table.getrow(target_user_id -1).nonzero()[1].tolist()
    movies_seen_by_target_user.sort(key=lambda x: pivot_table[target_user_id -1, x], reverse=True)
    movies_rate_predictions = []
    already_checked_movies = []
    k=0
    stop = False
    while not stop:
        print("starting ", len(movies_seen_by_target_user))
        for movie_id in movies_seen_by_target_user:
            similarity_movies = cosine_similarity(pivot_table.transpose(), pivot_table.getcol(movie_id).transpose())
            most_similar_movie_id = (-similarity_movies).argsort(axis=0)[k][0]
            if most_similar_movie_id in already_checked_movies or most_similar_movie_id in movies_seen_by_target_user:
                continue
            movies_rate_predictions.append(
                (most_similar_movie_id, predict_rating_for_movie(most_similar_movie_id, movies_seen_by_target_user))
            )
            if len(movies_rate_predictions) == 30:
                stop = True
                break
        k += 1    
    movies_rate_predictions = sorted(movies_rate_predictions,reverse=True, key=lambda tup: tup[1])[:20]  ## einai apo mikro pros megalo
    predicted_movies = [movies_ids[movie_index] for movie_index, _ in movies_rate_predictions]
    
    print("Whole process took {:.3f}".format(time.time()-process_tm))
    print("Results: ", movies_rate_predictions)
    print("Real Results: ", predicted_movies)
    # return movies_rate_predictions

In [20]:
pivot_table = pickle_load(fixed_pivot_table_path)
target_user_id = 2
item_based_prediction(target_user_id)



Item prediction started!!
starting  184
starting  184
Whole process took 218.111
Results:  [(12216, 0.7517878753699102), (2867, 0.7259864364683587), (5883, 0.4982818148323797), (6258, 0.46938209597074304), (1207, 0.4576075376739843), (1194, 0.42864570209536246), (5877, 0.41911245188673873), (5877, 0.41911245188673873), (933, 0.3919466621246719), (2636, 0.3682344116397693), (7022, 0.3288935035843121), (8102, 0.2545896210390017), (10834, 0.25213114130225994), (510, 0.20787551243349503), (1263, 0.16363469677740655), (159, 0.1576216286188244), (1579, 0.09084247120488896), (5336, 0.06966177204339695), (764, -0.005950814621861656), (4518, -0.04983421687639545)]
Real Results:  [58559.0, 2959.0, 5995.0, 6377.0, 1240.0, 1225.0, 5989.0, 5989.0, 954.0, 2728.0, 7147.0, 8817.0, 45722.0, 515.0, 1297.0, 161.0, 1641.0, 5444.0, 780.0, 4623.0]


Get the top 50 favourite movies of the user

In [6]:
movies_seen_by_target_user = pivot_table.getrow(target_user_id -1).nonzero()[1].tolist()

In [7]:
movies_seen_by_target_user.sort(key=lambda x: pivot_table[target_user_id -1, x], reverse=True)


In [14]:
most_likely_to_like = []
k2=0
stop = False
while not stop:
    print("starting")
    for movie_id2 in movies_seen_by_target_user:
        similarity_movies2 = cosine_similarity(pivot_table.transpose(), pivot_table.getcol(movie_id2).transpose())
        most_similar_movie2= (-similarity_movies2).argsort(axis=0)[k2][0]
        most_likely_to_like.append(most_similar_movie2)
        print(len(most_likely_to_like))
        if len(most_likely_to_like) == 10:
            stop = True
            break
    print("k is incresingggg")
    k2 += 1

starting
1
2
3
4
5
6
7
8
9
10
k is incresingggg


In [11]:
print(most_likely_to_like)
print(len(most_likely_to_like))

[108, 257, 314, 328, 452, 522, 1108, 1166, 1167, 1179]
10
