In [1]:
import pandas as pd
import numpy as np
import json
import pymongo
import ssl
from joblib import dump, load
from sklearn.ensemble import GradientBoostingRegressor
import math

In [37]:
def get_similarity_row(movieId, db_connection, db_name, simMat_name, movie_list, num_digits=2):
    '''Obtain similarities of selected movie (will give error if movieId does not exist)'''
    db = db_connection[db_name]
    col_similarity = db[simMat_name]
    simil_string = col_similarity.find({'movieId':movieId})[0]['similarities']
    simil_row = []
    for i in range(len(movie_list)):
        x_char = simil_string[i*num_digits : (i+1)*num_digits]
        simil_row.append(int(x_char)/10**num_digits)  # Convert to number between 0 and 1
    return simil_row


def get_similarity(input_movies, sim_mat_name, sim_mat_digits):
    # Read configuration file
    with open('config_prod.json', 'r') as fp:
        config = json.load(fp)
    
    # Connect to MongoDB
    db_url = config['db_url']
    db_name = config['db_name']
    db_user = config['db_user']
    
    try:
        # Close previous connection
        if 'conn' in globals():
            conn.close()
            print("Closing connection")

        # Read from db_credentials.txt password required to connect to MongoDB.
        with open("db_credentials.txt", 'r') as f:
            [db_password] = f.read().splitlines()

        # Connect
        conn=pymongo.MongoClient("mongodb+srv://{}:{}@{}".format(db_user, db_password, db_url), ssl_cert_reqs=ssl.CERT_NONE)
        print ("Connected successfully to MongoDB")

    except pymongo.errors.ConnectionFailure as e:
        print ("Could not connect to MongoDB: %s" % e) 
    
    # Open database and collection
    db = conn[db_name]
    col_similarity = db[sim_mat_name]

    # Read movie ids from DB into a dataframe
    movie_list = pd.DataFrame(list(col_similarity.find( {}, {'movieId':1, '_id':0} )))

    # Read similarity matrix
    similarity = np.array([get_similarity_row(i, conn, db_name, sim_mat_name, movie_list, sim_mat_digits) for i in input_movies])

    return(movie_list, similarity)


def make_prediction(input_movies, input_ratings, output_movies, sim_mat_name='similarity_CF', sim_mat_digits=2):
    # Get similarity matrix for seen movies
    movie_list, similarity = get_similarity(input_movies, sim_mat_name, sim_mat_digits)
    
    # Subset to similarity matrix with movies to be rated
    sim_to_rate_bool = movie_list.isin(output_movies).values.reshape(-1)
    similarity = similarity[:,sim_to_rate_bool]
    
    # Compute predictions
    output_ratings1 = np.dot(input_ratings, similarity)/np.sum(similarity, axis=0)
    
    # Sort predictions by movie ID
    order = np.argsort(movie_list[sim_to_rate_bool].values.reshape(-1))
    output_ratings1 = output_ratings1[order]
    
    # If a movie has 0 similarity to all rated movies, give them a 0
    output_ratings1[np.isnan(output_ratings1)] = 0 
    
    return(output_ratings1)            


def combine_predictions(predCF, predCB, user_input, user_output):
    
    # Load the model 
    regr = load('GBR100.joblib') 
    
    # Other necessary parameters
    genres = ['no_genres', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
              'film-noir', 'horror', 'imax', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']

    directors = ['Alfred Hitchcock', 'Andrew Davis', 'Andrew Stanton', 'Barry Levinson', 'Barry Sonnenfeld', 'Brian De Palma', 'Bryan Singer', 'Chris Columbus',
           'Christopher Nolan', 'David Fincher', 'Francis Ford Coppola', 'Frank Darabont', 'George Lucas', 'James Cameron', 'Jan de Bont', 'Joel Coen',
           'Joel Schumacher', 'John McTiernan', 'Jonathan Demme', 'Lilly Wachowski', 'Martin Scorsese', 'Mel Gibson', 'Michael Bay', 'Peter Jackson',
           'Quentin Tarantino', 'Richard Donner', 'Ridley Scott', 'Rob Reiner', 'Robert Zemeckis', 'Roland Emmerich', 'Ron Howard', 'Sam Mendes',
           'Stanley Kubrick', 'Steven Spielberg', 'Terry Gilliam', 'Tim Burton', 'Tom Shadyac', 'Tony Scott', 'Wolfgang Petersen']

    directors2 = ['Steven Spielberg', 'Robert Zemeckis', 'Christopher Nolan', 'James Cameron', 'Quentin Tarantino', 'Peter Jackson', 'David Fincher', 'Ridley Scott',
           'Martin Scorsese', 'Tim Burton', 'Ron Howard', 'Frank Darabont', 'Francis Ford Coppola', 'George Lucas', 'Stanley Kubrick',
           'John McTiernan', 'Roland Emmerich', 'Chris Columbus', 'Jonathan Demme', 'Rob Reiner', 'Bryan Singer', 'Terry Gilliam', 'Joel Coen',
           'Lilly Wachowski', 'Barry Sonnenfeld', 'Alfred Hitchcock', 'Joel Schumacher', 'Tom Shadyac', 'Wolfgang Petersen', 'Jan de Bont',
           'Tony Scott', 'Richard Donner', 'Michael Bay', 'Mel Gibson', 'Brian De Palma', 'Andrew Stanton', 'Sam Mendes', 'Andrew Davis', 'Barry Levinson']

    # Codify input movies stats
    df = user_input
    input_stats = []
    for rate in [1, -1]:

        partial_stats = [len(df), np.std(df.year), np.std(df.num_ratings), np.std(df.avg_rating), np.std(df.runtimeMinutes), np.mean(df.year), 
                 np.mean(df.num_ratings), np.mean(df.avg_rating), np.sum(df.isAdult), np.mean(df.runtimeMinutes)]

        input_genres = np.array([item for sub_list in df.genres.values for item in sub_list])
        n_genres = [np.sum(input_genres == genre) for genre in genres]
        n_directors = [np.sum(df.director == director) for director in directors2]

        input_stats = input_stats + partial_stats + n_genres + n_directors


    # Codify output movies stats
    df = pd.concat([pd.DataFrame({'CF_prediction': predCF, 'CB_prediction': predCB}), user_output], axis=1)
    df = scale(df)
    output_genres = df.genres.values
    df.drop('genres',axis=1,inplace=True)
    df[genres] = 0
    df[directors] = 0
    df.loc[~df.director.isin(directors),'director'] = ''
    
    for i in range(len(df)):
        dirr = df.director[i]
        if dirr != '':
            df[dirr][i] = df[dirr][i] + 1
        
        for genre in output_genres[i]:   
            df[genre][i] = 1
            
    df.drop(['movieId','director'], axis=1, inplace=True)   
    df = df.to_numpy()
    input_stats = np.tile(np.array(input_stats), (df.shape[0], 1))

    df = np.concatenate((df, input_stats), axis=1)
    
    # Make predictions
    predictions = regr.predict(df)

    return predictions

def scale(df):
    minv = min(df.CF_prediction)
    maxv = max(df.CF_prediction)
    df.loc[np.isnan(df.CF_prediction),'CF_prediction'] = 0
    df.CF_prediction = 5*(df.CF_prediction - minv)/(maxv - minv)
    
    minv = min(df.CB_prediction)
    maxv = max(df.CB_prediction)
    df.CB_prediction = 5*(df.CB_prediction - minv)/(maxv - minv)   
    df.loc[np.isnan(df.CB_prediction),'CB_prediction'] = 0
    
    return df

def predictor(user_input, user_output):
    # Extract rated movies Id and rating from input
    input_movies = [mov['movieId'] for mov in user_input] 
    input_ratings = [mov['valoration'] for mov in user_input]
    user_input = pd.DataFrame(user_input)

    # Sort output movies by movie Id
    output_movies = [mov['movieId'] for mov in user_output] 
    output_movies = np.sort(output_movies)
    
    # Exclude from output movies in input (already seen)
    user_output = pd.DataFrame(user_output)
    user_output = user_output.loc[~ np.isin(output_movies, input_movies),:]
    user_output.reset_index(inplace=True, drop=True)
    output_movies = output_movies[~ np.isin(output_movies, input_movies)]
    
    # Predict using Collaborative filtering
    predCF = make_prediction(input_movies, input_ratings, output_movies, sim_mat_name='similarity_CF', sim_mat_digits=2)
    
    # Predict using Content Based recomender     
    predCB = make_prediction(input_movies, input_ratings, output_movies, sim_mat_name='similarity_content_based', sim_mat_digits=4)
    
    # Combine both predictions
    output_ratings = combine_predictions(predCF, predCB, user_input, user_output)

    # Sort predictions based on ratings
    output_movies = output_movies[np.argsort(-output_ratings)]
    
    return output_movies

In [38]:
# Example
with open('example_input.json', 'r') as fp:
    user_input = json.load(fp)

with open('example_output.json', 'r') as fp:
    user_output = json.load(fp)
    
sorted_movies = predictor(user_input, user_output)

Connected successfully to MongoDB
Connected successfully to MongoDB


  output_ratings1 = np.dot(input_ratings, similarity)/np.sum(similarity, axis=0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[dirr][i] = df[dirr][i] + 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[genre][i] = 1


In [39]:
sorted_movies


array([135532,  85367,   1882,   8446,  58162,   1291,    281,    934,
         7991,   1488,   4640, 110553,  27523,  33679, 108188,   3405,
        72701,   1732,   3175,   7000,   1616, 108156,  57980,   8927,
       171867, 133802,   5815,   1380,  40278,  32452,   3489,   1080,
        47937,   1446,  75985,   4441,   3726,    837,  26776, 114678,
         2968,    990,   6711,   6595,   1184,   6155, 104913,   1170,
         3565,    349,  48319,   6550,  87194,   4672,   6863,    145,
         7072, 106438, 159441,    911,   6951,   5990,   8604,   5013,
         2402,   7826,    940,  72919, 113250,   7445,  54290,  51255,
        54331,   4652,   3256, 158872,  82242,  79132,   7438,    617,
       149011, 115414,  99112,  81564, 112370,    356,    718,   2485,
       101761,   6806,  72378,   4184,  99910,     58, 116897,   8977,
        46865,   1353, 100326,  70533,  43558,   6959,  88812, 168498,
        47044, 141799,   6378,  94985,  43932,  36289,   8948, 175655,
      

In [23]:
len(user_output)

614

In [31]:
# Load the model 
regr = load('GBR100.joblib') 

# Other necessary parameters
genres = ['no_genres', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
          'film-noir', 'horror', 'imax', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']

directors = ['Alfred Hitchcock', 'Andrew Davis', 'Andrew Stanton', 'Barry Levinson', 'Barry Sonnenfeld', 'Brian De Palma', 'Bryan Singer', 'Chris Columbus',
       'Christopher Nolan', 'David Fincher', 'Francis Ford Coppola', 'Frank Darabont', 'George Lucas', 'James Cameron', 'Jan de Bont', 'Joel Coen',
       'Joel Schumacher', 'John McTiernan', 'Jonathan Demme', 'Lilly Wachowski', 'Martin Scorsese', 'Mel Gibson', 'Michael Bay', 'Peter Jackson',
       'Quentin Tarantino', 'Richard Donner', 'Ridley Scott', 'Rob Reiner', 'Robert Zemeckis', 'Roland Emmerich', 'Ron Howard', 'Sam Mendes',
       'Stanley Kubrick', 'Steven Spielberg', 'Terry Gilliam', 'Tim Burton', 'Tom Shadyac', 'Tony Scott', 'Wolfgang Petersen']

directors2 = ['Steven Spielberg', 'Robert Zemeckis', 'Christopher Nolan', 'James Cameron', 'Quentin Tarantino', 'Peter Jackson', 'David Fincher', 'Ridley Scott',
       'Martin Scorsese', 'Tim Burton', 'Ron Howard', 'Frank Darabont', 'Francis Ford Coppola', 'George Lucas', 'Stanley Kubrick',
       'John McTiernan', 'Roland Emmerich', 'Chris Columbus', 'Jonathan Demme', 'Rob Reiner', 'Bryan Singer', 'Terry Gilliam', 'Joel Coen',
       'Lilly Wachowski', 'Barry Sonnenfeld', 'Alfred Hitchcock', 'Joel Schumacher', 'Tom Shadyac', 'Wolfgang Petersen', 'Jan de Bont',
       'Tony Scott', 'Richard Donner', 'Michael Bay', 'Mel Gibson', 'Brian De Palma', 'Andrew Stanton', 'Sam Mendes', 'Andrew Davis', 'Barry Levinson']

# Codify input movies stats
df = pd.DataFrame(user_input)
input_stats = []
for rate in [1, -1]:

    partial_stats = [len(df), np.std(df.year), np.std(df.num_ratings), np.std(df.avg_rating), np.std(df.runtimeMinutes), np.mean(df.year), 
             np.mean(df.num_ratings), np.mean(df.avg_rating), np.sum(df.isAdult), np.mean(df.runtimeMinutes)]

    input_genres = np.array([item for sub_list in df.genres.values for item in sub_list])
    n_genres = [np.sum(input_genres == genre) for genre in genres]
    n_directors = [np.sum(df.director == director) for director in directors2]

    input_stats = input_stats + partial_stats + n_genres + n_directors


# Codify output movies stats
df = pd.concat([pd.DataFrame({'CF_prediction': predCF, 'CB_prediction': predCB}), pd.DataFrame(user_output)], axis=1)
df = scale(df)
output_genres = df.genres.values
df.drop('genres',axis=1,inplace=True)
df[genres] = 0
df[directors] = 0
df.loc[~df.director.isin(directors),'director'] = ''

In [32]:
df = pd.concat([pd.DataFrame({'CF_prediction': predCF, 'CB_prediction': predCB}), user_output], axis=1)
df.iloc[78:90]
#df = scale(df)
#df.iloc[78:90]
#user_output.iloc[78:90]

Unnamed: 0,CF_prediction,CB_prediction,index,movieId,year,num_ratings,avg_rating,isAdult,runtimeMinutes,director,genres
78,0.340206,-1.0,78,2587,1999,916,2.98,0,108,Ted Demme,"[comedy, crime, drama]"
79,0.291339,0.104,80,2959,1999,58773,4.23,0,139,David Fincher,"[action, crime, drama, thriller]"
80,0.263804,-0.066667,82,3565,2000,1865,3.28,0,120,Matt Williams,"[comedy, drama]"
81,0.341615,1.0,83,3623,2000,18337,2.98,0,123,John Woo,"[action, adventure, thriller]"
82,0.434211,1.0,84,2986,1990,4982,2.48,0,117,Irvin Kershner,"[action, crime, sci-fi, thriller]"
83,0.361963,-1.0,85,3256,1992,9075,3.62,0,117,Phillip Noyce,"[action, crime, drama, thriller]"
84,0.409091,-1.0,86,3306,1928,461,3.91,0,72,Charles Chaplin,[comedy]
85,0.361963,-1.0,87,3489,1991,9534,3.21,0,142,Steven Spielberg,"[adventure, comedy, fantasy]"
86,0.324503,0.0,88,3578,2000,44656,3.95,0,155,Ridley Scott,"[action, adventure, drama]"
87,0.373626,-1.0,89,3726,1976,1067,3.54,0,91,John Carpenter,"[action, thriller]"


In [33]:
df.shape

(614, 11)

In [26]:
df.shape

(617, 10)