In [1]:
import pandas as pd
import json

# Load the production settings (data directories and DB connection details) from JSON
with open('config_prod.json', 'r') as config_file:
    config = json.loads(config_file.read())
print(config)

{'dir_data_raw': '../data/raw', 'dir_data_input': '../data/input', 'db_url': 'cluster0.egjki.mongodb.net', 'db_name': 'gmam', 'db_user': 'getmeamovie_rw'}


## Create model_movie_index

In [None]:
# Load the movie catalogue and keep only the movies with at least 100 ratings
df_movies = (
    pd.read_csv(config['dir_data_input'] + '/movies.csv')
    .loc[lambda movies: movies['num_ratings'] >= 100]
)

# Important: model_movie_index lists the ids of the films included in the similarity
# matrix, in the same order as the matrix rows/columns
model_movie_index = df_movies['movieId'].to_list()

## Generate simulated similarity matrix

In [None]:
import numpy as np
import random

# Build a symmetric random similarity matrix (test data only)
random.seed(1234)
num_movies = len(df_movies)
sim_matrix = np.eye(num_movies, dtype=float)  # Diagonal (self-similarity) is 1
for row in range(1, num_movies):
    for col in range(row):
        # Draw one similarity in [0, 1) and mirror it across the diagonal
        sim_matrix[row, col] = sim_matrix[col, row] = random.random()
sim_matrix[:4, :4]

In [14]:
import numpy as np
# Load the similarity matrix from 'content_based_matrix.txt'.
# This rebinds `sim_matrix`, replacing the simulated matrix if that cell was run.
sim_matrix = np.loadtxt('content_based_matrix.txt')

In [9]:
def encode_matrix(matrix, id_list, num_digits=2):
    '''Encode a square similarity matrix as a list of fixed-width digit strings.

       Every row of the matrix becomes one string in which each value is written as
       exactly `num_digits` decimal digits (the value scaled by 10**num_digits), so the
       j-th similarity of a row sits at characters [j*num_digits, (j+1)*num_digits).
       Values that would need an extra digit (i.e. 1.0) are stored as the largest
       representable code (e.g. 99 for num_digits=2).
       Input:
           - matrix: square 2-dimensional array of float values between 0 and 1
           - id_list: list of id values, one per matrix row, in row order
           - num_digits: number of digits for each value (default 2)
       Returns:
           - matrix_out: list of dicts {'movieId': id, 'similarities': encoded row}
       Raises:
           - ValueError: if num_digits < 1, the matrix is not square, its size does not
             match id_list, or any value is NaN or outside [0, 1]
    '''
    if num_digits < 1:
        raise ValueError('[encode_matrix] num_digits must be a positive integer')
    # Checking the rank first turns a 1-D input into a ValueError instead of an IndexError
    if not (len(matrix.shape) == 2
            and matrix.shape[0] == matrix.shape[1]
            and matrix.shape[0] == len(id_list)):
        raise ValueError('[encode_matrix] The matrix dimensions must match with the size of the id_list')

    scale = 10**num_digits
    max_code = scale - 1  # Largest integer that fits in num_digits digits
    width_spec = '0{}d'.format(num_digits)
    size = len(id_list)

    matrix_out = []
    for i in range(size):
        codes = []
        for j in range(size):
            x = float(matrix[i, j])
            # Written as a chained comparison so that NaN is rejected too
            # (NaN fails every comparison and would slip through `x<0 or x>1`)
            if not (0 <= x <= 1):
                raise ValueError('[encode_matrix] The matrix values must be between 0 and 1 - error in value [{},{}] '.format(i,j))
            # Round instead of truncating: binary floats make e.g. 0.57*100 evaluate to
            # 56.99999999999999, which truncation would store as 56.
            # Clamp so that 1.0 (and values rounding up to it) still fit in num_digits.
            x_int = min(int(round(x * scale)), max_code)
            codes.append(format(x_int, width_spec))
        # Join once per row instead of growing the string value by value
        matrix_out.append({'movieId': id_list[i], 'similarities': ''.join(codes)})
    return matrix_out


In [15]:
# Load the movie index from 'content_based_index.txt' (parsed as CSV).
# NOTE: this rebinds `model_movie_index` from the list of ids built earlier to a
# DataFrame; the encoding cell below reads its 'movieId' column.
# Presumably the rows are in the same order as the rows of sim_matrix - TODO confirm
model_movie_index = pd.read_csv('content_based_index.txt')

In [16]:
# Encode every matrix row as one string with 4 digits per similarity value,
# paired with the movieId found at the same position in the index
f_sim_matrix = encode_matrix(sim_matrix, model_movie_index['movieId'].to_list(), num_digits=4)

In [17]:
# Preview the first five encoded rows (each row string holds one full matrix row)
f_sim_matrix[0:5]

[{'movieId': 1,
  'similarities': '99990064002800000000000000000000000000000000000000000000000000000064011900000000004300000034000000000000000001340000002700000080000000000262000000000000000000350000000000400000003000000061025800000000007700840076000001320035000000310000003000000000000000000000000000000000000000000000004400230000000000000047006101140000000000000000003700700000004400000026000000000000000000000023000000000000004600000061000000000000000000000000000000000000000000000000000000000000000000000000000000000047008100000000000000000000000000000000000000720000000000000031003100000000000000000000000000530000000000000000000000000000010900000057000000000039000000000000022300000000000000000000000000520000000000000000000000000000004900000000000000000000000000000000003500000042000000000000000000000000008100000000000000230000009100000000010600900000009200360000000000000023000000000000000000000000000000000000000000000000000000000063000000000000000000470000008100950000000000000000000000000

## Upload similarity matrix to DB
**Note:** This step requires a local file "db_credentials.txt" containing the password of the MongoDB user. The password is available in the file pinned in the general Slack channel.

In [18]:
# Connect to MongoDB and replace the stored content-based similarity matrix

import pymongo
import ssl  # No longer used by this cell; import kept in case other cells rely on it
from urllib.parse import quote_plus

db_url = config['db_url']
db_name = config['db_name']
db_user = config['db_user']

# Read the password from db_credentials.txt instead of hardcoding it in the notebook.
# NOTE(review): a password was previously committed in this cell and should be rotated.
with open("db_credentials.txt", 'r') as f:
    db_password = f.read().strip()
if not db_password:
    raise ValueError('db_credentials.txt is empty: expected the MongoDB password')

# Validate the payload before touching the database: delete_many below would otherwise
# empty the collection and insert_many would then fail on an empty list
if not f_sim_matrix:
    raise ValueError('f_sim_matrix is empty: refusing to wipe the similarity collection')

# Close previous connection
if 'conn' in globals():
    conn.close()
    print("Closing connection")

try:
    # Username and password must be percent-escaped to be valid inside a MongoDB URI.
    # TLS certificate verification is left enabled: the previous
    # ssl_cert_reqs=ssl.CERT_NONE disabled it and exposed the credentials to
    # man-in-the-middle attacks. If verification fails locally, fix the CA bundle
    # (e.g. via the tlsCAFile option) rather than disabling verification.
    conn = pymongo.MongoClient(
        "mongodb+srv://{}:{}@{}".format(quote_plus(db_user), quote_plus(db_password), db_url))
    # MongoClient connects lazily, so force a round trip to surface failures here
    conn.admin.command('ping')
    print ("Connected successfully to MongoDB")
except pymongo.errors.PyMongoError as e:
    print ("Could not connect to MongoDB: %s" % e)
    # Stop the cell: continuing would use an undefined or already-closed connection
    raise

try:
    # Open database and collection
    db = conn[db_name]
    col_similarity = db['similarity_content_based']

    col_similarity.delete_many({})  # Delete previous data in the collection

    col_similarity.insert_many(f_sim_matrix)  # Insert formatted similarity matrix
finally:
    # Close connection to MongoDB even if the delete/insert fails
    conn.close()

Closing connection
Connected successfully to MongoDB
