# Question 1: Hands-on demonstration

## Installing pymongo (if not installed already)
Start Anaconda command prompt. Then,
- Execute the following: `conda install -c anaconda pymongo`
- After the installation is complete, execute the following: `pip install "pymongo[srv]"` (use double quotes; the Windows command prompt does not strip single quotes)

After the installations are done, close the command prompt and shut down Jupyter. Then restart it.

In [None]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import time

In [None]:
# Connection string for the MongoDB cluster (mongodb+srv scheme).
# NOTE(review): the user name and password are embedded in this string in plain text;
# consider loading it from an environment variable or a config file kept out of version control.
conn_string = "mongodb+srv://movielens:movielens123@cluster0.dadyq.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"

# Client shared by every cell below; each option selects its own database from it.
client = MongoClient(conn_string)

# Option 1: Normalized schema

In [None]:
# Point `db` at the database that holds the normalized (Option 1) schema
db = client["ML_Option_1"]

In [None]:
# Benchmark: run the query 6 times. The first run only warms up the database and
# is discarded; the remaining 5 runs are timed and averaged.
# Each measurement covers the whole round trip: running the query, draining the
# cursor into a list and building the DataFrame.

option1_exe_times = []

for i in range(6):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()

    data = pd.DataFrame(list(db.RATINGS.find({}, {'movie_id': 1,
                                                  'user_id': 1,
                                                  'rating': 1})))

    end_time = time.perf_counter()

    if i != 0:  # iteration 0 is the warm-up run
        elapsed = end_time - start_time
        option1_exe_times.append(elapsed)
        print(f'Iteration {i} execution time: {elapsed}')

option1_mean_time = np.mean(option1_exe_times)

print(f'Mean execution time: {option1_mean_time}')

In [None]:
# Inspect the DataFrame built by the last timed run of the Option 1 query
data

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per movie_id,
# each cell holding that user's mean rating for the movie (0 where there is no rating).
# The string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed as aggfunc.
utility_matrix = pd.pivot_table(data, values='rating', index=['user_id'],
                                columns=['movie_id'], aggfunc='mean', fill_value=0)

utility_matrix

# Option 2: Ratings are subdocuments of movies

In [None]:
# Point `db` at the database where ratings are embedded in movie documents (Option 2)
db = client["ML_Option_2"]

In [None]:
# Benchmark: run the query 6 times. The first run only warms up the database and
# is discarded; the remaining 5 runs are timed and averaged.
# Each measurement covers the whole round trip: running the aggregation, draining
# the cursor into a list and building the DataFrame.

# The pipeline is the same on every run, so it is built once outside the loop.
# It emits one document per embedded rating, with rating and user_id lifted to the top level.
option2_pipeline = [
    {
        '$unwind': {
            'path': '$ratings'
        }
    }, {
        '$addFields': {
            'rating': '$ratings.rating',
            'user_id': '$ratings.user_id'
        }
    }, {
        '$project': {
            'user_id': 1,
            'movie_id': 1,
            'rating': 1
        }
    }
]

option2_exe_times = []

for i in range(6):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()

    data = pd.DataFrame(list(db.MOVIES.aggregate(option2_pipeline)))

    end_time = time.perf_counter()

    if i != 0:  # iteration 0 is the warm-up run
        elapsed = end_time - start_time
        option2_exe_times.append(elapsed)
        print(f'Iteration {i} execution time: {elapsed}')

option2_mean_time = np.mean(option2_exe_times)

print(f'Mean execution time: {option2_mean_time}')

In [None]:
# Inspect the DataFrame built by the last timed run of the Option 2 aggregation
data

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per movie_id,
# each cell holding that user's mean rating for the movie (0 where there is no rating).
# The string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed as aggfunc.
utility_matrix = pd.pivot_table(data, values='rating', index=['user_id'],
                                columns=['movie_id'], aggfunc='mean', fill_value=0)

utility_matrix

# Option 3: Ratings are subdocuments of users

In [None]:
# Point `db` at the database where ratings are embedded in user documents (Option 3)
db = client["ML_Option_3"]

In [None]:
# Benchmark: run the query 6 times. The first run only warms up the database and
# is discarded; the remaining 5 runs are timed and averaged.
# Each measurement covers the whole round trip: running the aggregation, draining
# the cursor into a list and building the DataFrame.

# The pipeline is the same on every run, so it is built once outside the loop.
# It emits one document per embedded rating, with rating and movie_id lifted to the top level.
option3_pipeline = [
    {
        '$unwind': {
            'path': '$ratings'
        }
    }, {
        '$addFields': {
            'rating': '$ratings.rating',
            'movie_id': '$ratings.movie_id'
        }
    }, {
        '$project': {
            'user_id': 1,
            'movie_id': 1,
            'rating': 1
        }
    }
]

option3_exe_times = []

for i in range(6):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system clock changes.
    start_time = time.perf_counter()

    data = pd.DataFrame(list(db.USERS.aggregate(option3_pipeline)))

    end_time = time.perf_counter()

    if i != 0:  # iteration 0 is the warm-up run
        elapsed = end_time - start_time
        option3_exe_times.append(elapsed)
        print(f'Iteration {i} execution time: {elapsed}')

option3_mean_time = np.mean(option3_exe_times)

print(f'Mean execution time: {option3_mean_time}')

In [None]:
# Inspect the DataFrame built by the last timed run of the Option 3 aggregation
data

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per movie_id,
# each cell holding that user's mean rating for the movie (0 where there is no rating).
# The string 'mean' is used instead of np.mean: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed as aggfunc.
utility_matrix = pd.pivot_table(data, values='rating', index=['user_id'],
                                columns=['movie_id'], aggfunc='mean', fill_value=0)

utility_matrix

# Comparison of execution times


In [None]:
# Side-by-side summary: mean execution time of each schema option, rounded to 2 decimals
mean_times = (option1_mean_time, option2_mean_time, option3_mean_time)

for option_no, mean_time in enumerate(mean_times, start=1):
    print(f'Option {option_no} mean execution time in seconds: {round(mean_time, 2)}')

# Optional: Collaborative Filtering

In [None]:
# Pairwise cosine similarity between the rows of the utility matrix.
# The pivot tables above use user_id as the row index, so this is a user-to-user similarity matrix.
# NOTE: `utility_matrix` is whichever pivot was built last (Option 3 when the notebook runs top to bottom).
from sklearn import metrics

cosine = metrics.pairwise.cosine_similarity(utility_matrix)

In [None]:
# Inspect the raw similarity matrix returned by cosine_similarity (no user labels yet)
cosine

In [None]:
# Wrap the similarity matrix in a DataFrame whose rows and columns are both
# labelled with the utility matrix's index values (the user ids)
user_ids = utility_matrix.index.values
cosine_df = pd.DataFrame(cosine, index=user_ids, columns=user_ids)

cosine_df

In [None]:
# Retrieve the first user's similarity scores: the first row of the matrix,
# kept as a one-row DataFrame by the double brackets
cosine_df.iloc[[0]]

In [None]:
# Highest similarity between the first user and any other user.
# Position 0 of the first row is the first user's similarity with themselves
# (the diagonal), so it is skipped.

first_user_scores = cosine_df.iloc[0]
first_user_scores.iloc[1:].max()

In [None]:
# Identify the id (index label) of the user most similar to the first user.
# idxmax() runs only on the off-diagonal part of the first row, so the first user
# can never be selected as their own nearest neighbour, even when another user's
# score ties with the self-similarity on the diagonal.
index_of_most_similar = cosine_df.iloc[0].iloc[1:].idxmax()

index_of_most_similar

In [None]:
# Show the most similar user's own row of similarity scores (one-row DataFrame)
cosine_df.loc[[index_of_most_similar]]

In [None]:
# Retrieve the most similar user's row of the utility matrix as {movie_id: rating}.
# The user is looked up through index_of_most_similar rather than a hard-coded id,
# so the cell stays correct when the data or the similarity result changes.
# Movies this user has not rated appear with the pivot's fill value 0.

user_item = utility_matrix.loc[index_of_most_similar].to_dict()

user_item

In [None]:
# Order the similar user's movies from the highest rating to the lowest.
# dict() keeps the order of the sorted (movie_id, rating) pairs.

ranked_items = sorted(user_item.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict = dict(ranked_items)

sorted_dict

In [None]:
# Movies the first user has not rated: the columns whose value in the first row
# of the utility matrix is 0 (the pivot's fill value)

first_user_ratings = utility_matrix.iloc[0]
not_watched = first_user_ratings[first_user_ratings == 0].index.values

not_watched

In [None]:
# Recommend movies that the most similar user rated 5 or 4 and that the first user
# has not rated. sorted_dict runs from the highest to the lowest rating, and both
# lists keep that order.

# Build the lookup set once: `key in <numpy array>` rescans the whole array on every test.
not_watched_ids = set(not_watched)

five_star = [movie_id for movie_id, rating in sorted_dict.items()
             if rating == 5 and movie_id in not_watched_ids]
four_star = [movie_id for movie_id, rating in sorted_dict.items()
             if rating == 4 and movie_id in not_watched_ids]
    

In [None]:
# Movies not rated by the first user that the similar user rated 5
five_star

In [None]:
# Movies not rated by the first user that the similar user rated 4
four_star