# Question 2: Hands-on demonstration

## Installing pymongo (if not installed already)
Start Anaconda command prompt. Then,
- Execute the following: `conda install -c anaconda pymongo`
- After the installation is complete, execute the following: `pip install "pymongo[srv]"` (use double quotes; the Windows command prompt does not strip single quotes)

After the installations are done, close the command prompt and shut down Jupyter. Then restart it.

In [None]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import time

In [None]:
import os

# SECURITY NOTE: the default connection string below embeds a username and
# password directly in the source. To avoid committing real credentials, set
# the MOVIELENS_MONGODB_URI environment variable; when it is set it takes
# precedence, otherwise the original connection string is used unchanged.
conn_string = os.environ.get(
    "MOVIELENS_MONGODB_URI",
    "mongodb+srv://movielens:movielens123@cluster0.dadyq.mongodb.net/myFirstDatabase?retryWrites=true&w=majority",
)

# Client shared by every cell below; each schema option selects its own
# database from it.
client = MongoClient(conn_string)

# Option 1: Normalized schema

In [None]:
# Select the database that holds the normalized schema (Option 1).
db = client["ML_Option_1"]

## Males

In [None]:
# Aggregation pipeline producing one (user_id, movie_id, rating) document per
# rating given by a male user. It is defined once so that the warm-up run and
# every timed run are guaranteed to execute exactly the same query.
pipeline = [
    # Keep only male users.
    {'$match': {'gender': 'M'}},
    # Join each user with the RATINGS documents that share its user_id.
    {
        '$lookup': {
            'from': 'RATINGS',
            'localField': 'user_id',
            'foreignField': 'user_id',
            'as': 'ratings',
        }
    },
    # Emit one document per joined rating.
    {'$unwind': {'path': '$ratings'}},
    # Lift the rating fields to the top level of the document.
    {
        '$addFields': {
            'rating': '$ratings.rating',
            'movie_id': '$ratings.movie_id',
        }
    },
    # Keep user_id, movie_id and rating (plus the default _id).
    {'$project': {'user_id': 1, 'movie_id': 1, 'rating': 1}},
]

# Send the query once (untimed; presumably a warm-up so that the timed runs
# below are not skewed by a cold first request).
data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))

# Send it 5 times to find the mean execution time
option1_males_exe_times = []

for i in range(5):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock is
    # adjusted. The timed region covers the query, fetching every result and
    # building the DataFrame.
    start_time = time.perf_counter()
    data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    option1_males_exe_times.append(elapsed)
    print(f'Iteration {i} execution time: {elapsed}')

option1_males_mean_time = np.mean(option1_males_exe_times)

print(f'Mean execution time: {option1_males_mean_time}')

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per
# movie_id, each cell holding that user's mean rating for the movie, and 0
# where the user has no rating for it. The aggregation is given by name
# ('mean') rather than as np.mean: recent pandas versions emit a FutureWarning
# for the NumPy callable and recommend the string to keep the same behavior.
utility_matrix_males = pd.pivot_table(
    data,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
    aggfunc='mean',
    fill_value=0,
)

utility_matrix_males

## Females

In [None]:
# Aggregation pipeline producing one (user_id, movie_id, rating) document per
# rating given by a female user. It is defined once so that the warm-up run
# and every timed run are guaranteed to execute exactly the same query.
pipeline = [
    # Keep only female users.
    {'$match': {'gender': 'F'}},
    # Join each user with the RATINGS documents that share its user_id.
    {
        '$lookup': {
            'from': 'RATINGS',
            'localField': 'user_id',
            'foreignField': 'user_id',
            'as': 'ratings',
        }
    },
    # Emit one document per joined rating.
    {'$unwind': {'path': '$ratings'}},
    # Lift the rating fields to the top level of the document.
    {
        '$addFields': {
            'rating': '$ratings.rating',
            'movie_id': '$ratings.movie_id',
        }
    },
    # Keep user_id, movie_id and rating (plus the default _id).
    {'$project': {'user_id': 1, 'movie_id': 1, 'rating': 1}},
]

# Send the query once (untimed; presumably a warm-up so that the timed runs
# below are not skewed by a cold first request):
data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))

# Send it 5 times:
option1_females_exe_times = []

for i in range(5):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock is
    # adjusted. The timed region covers the query, fetching every result and
    # building the DataFrame.
    start_time = time.perf_counter()
    data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    option1_females_exe_times.append(elapsed)
    print(f'Iteration {i} execution time: {elapsed}')

option1_females_mean_time = np.mean(option1_females_exe_times)

print(f'Mean execution time: {option1_females_mean_time}')

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per
# movie_id, each cell holding that user's mean rating for the movie, and 0
# where the user has no rating for it. The aggregation is given by name
# ('mean') rather than as np.mean: recent pandas versions emit a FutureWarning
# for the NumPy callable and recommend the string to keep the same behavior.
utility_matrix_females = pd.pivot_table(
    data,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
    aggfunc='mean',
    fill_value=0,
)

utility_matrix_females

# Option 2: Ratings are subdocuments of movies

In [None]:
# Select the database in which ratings are embedded in movie documents
# (Option 2).
db = client["ML_Option_2"]

## Males

In [None]:
# Aggregation pipeline producing one (user_id, movie_id, rating) document per
# rating given by a male user. It is defined once so that the warm-up run and
# every timed run are guaranteed to execute exactly the same query.
pipeline = [
    # Emit one document per rating embedded in a movie.
    {'$unwind': {'path': '$ratings'}},
    # Join each rating with the USERS documents whose user_id matches it.
    {
        '$lookup': {
            'from': 'USERS',
            'localField': 'ratings.user_id',
            'foreignField': 'user_id',
            'as': 'user',
        }
    },
    # Turn the joined array into a single embedded user document.
    {'$unwind': {'path': '$user'}},
    # Lift the rating value and the user's id to the top level.
    {
        '$addFields': {
            'rating': '$ratings.rating',
            'user_id': '$user.user_id',
        }
    },
    # The gender is only known after the join, so filter here.
    {'$match': {'user.gender': 'M'}},
    # Keep user_id, movie_id and rating (plus the default _id).
    {'$project': {'user_id': 1, 'movie_id': 1, 'rating': 1}},
]

# Send the query once (untimed; presumably a warm-up so that the timed runs
# below are not skewed by a cold first request):
data = pd.DataFrame(list(db.MOVIES.aggregate(pipeline)))

# Send the query multiple times
option2_males_exe_times = []

for i in range(5):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock is
    # adjusted. The timed region covers the query, fetching every result and
    # building the DataFrame.
    start_time = time.perf_counter()
    data = pd.DataFrame(list(db.MOVIES.aggregate(pipeline)))
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    option2_males_exe_times.append(elapsed)
    print(f'Iteration {i} execution time: {elapsed}')

option2_males_mean_time = np.mean(option2_males_exe_times)

print(f'Mean execution time: {option2_males_mean_time}')

In [None]:
# Display the current contents of `data` (the query results as a DataFrame).
data

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per
# movie_id, each cell holding that user's mean rating for the movie, and 0
# where the user has no rating for it. The aggregation is given by name
# ('mean') rather than as np.mean: recent pandas versions emit a FutureWarning
# for the NumPy callable and recommend the string to keep the same behavior.
utility_matrix_males = pd.pivot_table(
    data,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
    aggfunc='mean',
    fill_value=0,
)

utility_matrix_males

## Females

In [None]:
# Aggregation pipeline producing one (user_id, movie_id, rating) document per
# rating given by a female user. It is defined once so that the warm-up run
# and every timed run are guaranteed to execute exactly the same query.
pipeline = [
    # Emit one document per rating embedded in a movie.
    {'$unwind': {'path': '$ratings'}},
    # Join each rating with the USERS documents whose user_id matches it.
    {
        '$lookup': {
            'from': 'USERS',
            'localField': 'ratings.user_id',
            'foreignField': 'user_id',
            'as': 'user',
        }
    },
    # Turn the joined array into a single embedded user document.
    {'$unwind': {'path': '$user'}},
    # Lift the rating value and the user's id to the top level.
    {
        '$addFields': {
            'rating': '$ratings.rating',
            'user_id': '$user.user_id',
        }
    },
    # The gender is only known after the join, so filter here.
    {'$match': {'user.gender': 'F'}},
    # Keep user_id, movie_id and rating (plus the default _id).
    {'$project': {'user_id': 1, 'movie_id': 1, 'rating': 1}},
]

# Send the query once (untimed; presumably a warm-up so that the timed runs
# below are not skewed by a cold first request):
data = pd.DataFrame(list(db.MOVIES.aggregate(pipeline)))

# Send the query multiple times:
option2_females_exe_times = []

for i in range(5):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock is
    # adjusted. The timed region covers the query, fetching every result and
    # building the DataFrame.
    start_time = time.perf_counter()
    data = pd.DataFrame(list(db.MOVIES.aggregate(pipeline)))
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    option2_females_exe_times.append(elapsed)
    print(f'Iteration {i} execution time: {elapsed}')

option2_females_mean_time = np.mean(option2_females_exe_times)

print(f'Mean execution time: {option2_females_mean_time}')

In [None]:
# Display the current contents of `data` (the query results as a DataFrame).
data

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per
# movie_id, each cell holding that user's mean rating for the movie, and 0
# where the user has no rating for it. The aggregation is given by name
# ('mean') rather than as np.mean: recent pandas versions emit a FutureWarning
# for the NumPy callable and recommend the string to keep the same behavior.
utility_matrix_females = pd.pivot_table(
    data,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
    aggfunc='mean',
    fill_value=0,
)

utility_matrix_females

# Option 3: Ratings are subdocuments of users

In [None]:
# Select the database in which ratings are embedded in user documents
# (Option 3).
db = client["ML_Option_3"]

## Males

In [None]:
# Aggregation pipeline producing one (user_id, movie_id, rating) document per
# rating given by a male user. It is defined once so that the warm-up run and
# every timed run are guaranteed to execute exactly the same query.
pipeline = [
    # Keep only male users.
    {'$match': {'gender': 'M'}},
    # Emit one document per rating embedded in the user.
    {'$unwind': {'path': '$ratings'}},
    # Lift the rating fields to the top level of the document.
    {
        '$addFields': {
            'rating': '$ratings.rating',
            'movie_id': '$ratings.movie_id',
        }
    },
    # Keep user_id, movie_id and rating (plus the default _id).
    {'$project': {'user_id': 1, 'movie_id': 1, 'rating': 1}},
]

# Send the query once (untimed; presumably a warm-up so that the timed runs
# below are not skewed by a cold first request):
data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))

# Send the query multiple times
option3_males_exe_times = []

for i in range(5):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock is
    # adjusted. The timed region covers the query, fetching every result and
    # building the DataFrame.
    start_time = time.perf_counter()
    data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    option3_males_exe_times.append(elapsed)
    print(f'Iteration {i} execution time: {elapsed}')

option3_males_mean_time = np.mean(option3_males_exe_times)

print(f'Mean execution time: {option3_males_mean_time}')

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per
# movie_id, each cell holding that user's mean rating for the movie, and 0
# where the user has no rating for it. The aggregation is given by name
# ('mean') rather than as np.mean: recent pandas versions emit a FutureWarning
# for the NumPy callable and recommend the string to keep the same behavior.
utility_matrix_males = pd.pivot_table(
    data,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
    aggfunc='mean',
    fill_value=0,
)

utility_matrix_males

## Females

In [None]:
# Aggregation pipeline producing one (user_id, movie_id, rating) document per
# rating given by a female user. It is defined once so that the warm-up run
# and every timed run are guaranteed to execute exactly the same query.
pipeline = [
    # Keep only female users.
    {'$match': {'gender': 'F'}},
    # Emit one document per rating embedded in the user.
    {'$unwind': {'path': '$ratings'}},
    # Lift the rating fields to the top level of the document.
    {
        '$addFields': {
            'rating': '$ratings.rating',
            'movie_id': '$ratings.movie_id',
        }
    },
    # Keep user_id, movie_id and rating (plus the default _id).
    {'$project': {'user_id': 1, 'movie_id': 1, 'rating': 1}},
]

# Send the query once (untimed; presumably a warm-up so that the timed runs
# below are not skewed by a cold first request):
data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))

# Send the query multiple times
option3_females_exe_times = []

for i in range(5):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock is
    # adjusted. The timed region covers the query, fetching every result and
    # building the DataFrame.
    start_time = time.perf_counter()
    data = pd.DataFrame(list(db.USERS.aggregate(pipeline)))
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    option3_females_exe_times.append(elapsed)
    print(f'Iteration {i} execution time: {elapsed}')

option3_females_mean_time = np.mean(option3_females_exe_times)

print(f'Mean execution time: {option3_females_mean_time}')

In [None]:
# Build the user x movie utility matrix: one row per user_id, one column per
# movie_id, each cell holding that user's mean rating for the movie, and 0
# where the user has no rating for it. The aggregation is given by name
# ('mean') rather than as np.mean: recent pandas versions emit a FutureWarning
# for the NumPy callable and recommend the string to keep the same behavior.
utility_matrix_females = pd.pivot_table(
    data,
    values='rating',
    index=['user_id'],
    columns=['movie_id'],
    aggfunc='mean',
    fill_value=0,
)

utility_matrix_females

# Comparison of execution times - males


In [None]:
# Report the mean query time of each schema option for male users, rounded to
# two decimals.
print(f'Males - Option 1 mean execution time in seconds: {round(option1_males_mean_time, 2)}')
print(f'Males - Option 2 mean execution time in seconds: {round(option2_males_mean_time, 2)}')
print(f'Males - Option 3 mean execution time in seconds: {round(option3_males_mean_time, 2)}')

# Comparison of execution times - females


In [None]:
# Report the mean query time of each schema option for female users, rounded
# to two decimals.
print(f'Females - Option 1 mean execution time in seconds: {round(option1_females_mean_time, 2)}')
print(f'Females - Option 2 mean execution time in seconds: {round(option2_females_mean_time, 2)}')
print(f'Females - Option 3 mean execution time in seconds: {round(option3_females_mean_time, 2)}')

# Optional: Collaborative Filtering

In [None]:
from sklearn import metrics

# Pairwise cosine similarity between the rows of the utility matrix, i.e.
# between users (the matrix is indexed by user_id). The result is a square
# array with one row and one column per user.
cosine = metrics.pairwise.cosine_similarity(utility_matrix_females)

cosine

In [None]:
# Wrap the similarity array in a DataFrame labelled by user_id on both axes,
# so that similarities can be looked up by user rather than by position.
user_labels = utility_matrix_females.index.values
cosine_df = pd.DataFrame(cosine, index=user_labels, columns=user_labels)

cosine_df

In [None]:
# Retrieve the first user's similarity scores: row 0 of cosine_df, returned as
# a one-row DataFrame (one column per user) because the position is passed as
# a list.
cosine_df.iloc[[0]]

In [None]:
# Highest similarity between the first user and any other user. Position 0 of
# the row is the first user's similarity with itself (the diagonal), so the
# search starts at position 1.
cosine_df.iloc[0, 1:].max()

In [None]:
# Identify the user_id of the user most similar to the first user.
# idxmax() searches only the entries after position 0, so the first user's
# self-similarity on the diagonal can never be selected, even when another
# user ties with it at exactly the same score. On ties among the other users
# the first one in index order is returned.
index_of_most_similar = cosine_df.iloc[0, 1:].idxmax()

index_of_most_similar

In [None]:
# Show the most similar user's similarity scores as a one-row DataFrame.
cosine_df.loc[[index_of_most_similar]]

In [None]:
# Retrieve the most similar user's ratings as a {movie_id: rating} dict
# (movies the user has not rated appear with the pivot table's fill value, 0).
# The user is looked up through index_of_most_similar rather than a hard-coded
# id, so the cell keeps working if the data or the similarity result changes.
user_item = utility_matrix_females.loc[index_of_most_similar].to_dict()

user_item

In [None]:
# Order the {movie_id: rating} pairs from the highest rating to the lowest.
# sorted() is stable, so movies with equal ratings keep their original order.
ranked_pairs = sorted(user_item.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict = dict(ranked_pairs)

sorted_dict

In [None]:
# Show the first user's ratings (row 0 of the utility matrix) as a one-row
# DataFrame.
utility_matrix_females.iloc[[0]]

In [None]:
# Movies not rated by the first user (row 0 of the utility matrix). The pivot
# table fills missing ratings with 0, so a 0 marks a movie without a rating.
first_user_ratings = utility_matrix_females.iloc[0]
unrated_mask = first_user_ratings == 0
not_watched = first_user_ratings[unrated_mask].index.values

not_watched

In [None]:
# Iterate through the similar user's ratings (highest first) to make
# recommendations: movies the first user has not watched, grouped by whether
# the similar user rated them 5 or 4.

# Build the lookup set once: testing membership against the NumPy array
# directly would scan the whole array for every movie.
not_watched_ids = set(not_watched)

five_star = []
four_star = []

for movie_id, rating in sorted_dict.items():
    if movie_id not in not_watched_ids:
        continue
    # A rating cannot be both 5 and 4, so the checks are mutually exclusive.
    if rating == 5:
        five_star.append(movie_id)
    elif rating == 4:
        four_star.append(movie_id)
    

In [None]:
# Unwatched movies that the similar user rated 5.
five_star

In [None]:
# Unwatched movies that the similar user rated 4.
four_star