In [1]:
#import sklearn.metrics.pairwise as pw
import re
#import spacy
import pandas as pd
import numpy as np
#import sklearn
import os
import json
import random

In [2]:
city = 'ambridge'

In [3]:
with open(f"data/{city}/business.json", "r") as f:
    data = []
    for line in f:
                data.append(json.loads(line))
with open(f"data/{city}/user.json", "r") as f:
    udata = []
    for line in f:
                udata.append(json.loads(line))
with open(f"data/{city}/review.json", "r") as f:
    rdata = []
    for line in f:
                rdata.append(json.loads(line))

In [4]:
def create_similarity_matrix_categories(matrix):
    """Create a  """
    npu = matrix.values
    m1 = npu @ npu.T
    diag = np.diag(m1)
    m2 = m1 / diag
    m3 = np.minimum(m2, m2.T)
    return pd.DataFrame(m3, index = matrix.index, columns = matrix.index)

In [5]:
def mse(predicted_ratings):
    """Computes the mean square error between actual ratings and predicted ratings
    
    Arguments:
    predicted_ratings -- a dataFrame containing the columns rating and predicted rating
    """
    diff = predicted_ratings['stars'] - predicted_ratings['predicted stars']
    return (diff**2).mean()

In [6]:
def predict_ratings(similarity, utility, to_predict):
    """Predicts the predicted rating for the input test data.
    
    Arguments:
    similarity -- a dataFrame that describes the similarity between items
    utility    -- a dataFrame that contains a rating for each user (columns) and each movie (rows). 
                  If a user did not rate an item the value np.nan is assumed. 
    to_predict -- A dataFrame containing at least the columns movieId and userId for which to do the predictions
    """
    # copy input (don't overwrite)
    ratings_test_c = to_predict.copy()
    # apply prediction to each row
    ratings_test_c['predicted stars'] = to_predict.apply(lambda row: predict_ids(similarity, utility, row['user_id'], row['business_id']), axis=1)
    return ratings_test_c

### Helper functions for predict_ratings_item_based ###

def predict_ids(similarity, utility, userId, itemId):
    # select right series from matrices and compute
    if userId in utility.columns and itemId in similarity.index:
        return predict_vectors(utility.loc[:,userId], similarity[itemId])
    return 0

def predict_vectors(user_ratings, similarities):
    # select only movies actually rated by user
    relevant_ratings = user_ratings.dropna()
    
    # select corresponding similairties
    similarities_s = similarities[relevant_ratings.index]
    
    # select neighborhood
    similarities_s = similarities_s[similarities_s > 0.0]
    relevant_ratings = relevant_ratings[similarities_s.index]
    
    # if there's nothing left return a prediction of 0
    norm = similarities_s.sum()
    if(norm == 0):
        return 0
    
    # compute a weighted average (i.e. neighborhood is all) 
    return np.dot(relevant_ratings, similarities_s)/norm

In [7]:
%%time
# maak utility matrix
utility_matrix = pd.DataFrame(index = [i['business_id'] for i in data], columns = [i['user_id'] for i in udata])

# utility_matrix invullen
rvs = [[review['business_id'], review['user_id'], review['stars']] for review in rdata]
for review in rvs:
    utility_matrix.loc[review[0], review[1]] = review[2]
utility_matrix.index.name = 'business_id'
utility_matrix.columns.name = 'user_id'

# maak similarity_matrix
tlist = [[data[i]['business_id'], data[i]['categories'].split(', ')[x]] for i in range(len(data))  if data[i]['categories'] != None for x in range(len(data[i]['categories'].split(', ')))]
catpd = pd.DataFrame(data=tlist, columns=['business_id', 'categories'])
aye = catpd.pivot_table(index = 'business_id', columns = 'categories', aggfunc = 'size', fill_value=0)
sim_matrix = create_similarity_matrix_categories(aye)

# DataFrame maken voor predict_ratings()
strtst = pd.DataFrame(rvs, columns = ['business_id', 'user_id', 'stars'])

# predict ratings
tsting = predict_ratings(sim_matrix, utility_matrix,  strtst[['user_id', 'business_id', 'stars']])

Wall time: 434 ms


In [8]:
mse(tsting)

0.09087292219400735

In [9]:
display(sim_matrix.head()), display(utility_matrix.head()), display(tsting.head())

business_id,-InU2nAbC9AuS-Um2Cowgw,0EyRe-VkW8gYxxZ7NCQXeQ,2_7lYF6P2cYKnSiPhhVyPQ,3gL18eXylqutlzqb6TmB0w,4mpSNvmyG89Uqy2ahP4JMQ,729grSa1Wsn-hfv7D5uOxg,7xRbOtZUuw7DxOWrHeaanw,EOmRHPvzR88a5R2j2uD0cQ,EZ9zSk4ld27LwgfANoF4VA,Eu_zPTrNVAXkpdSxf7CJ2w,...,lawYwEXAE-Sq2nf6co7aBg,muFJIZKZwbAfy_pEFKF_pw,pcbVl6ZHDOJHsnetY0rJEQ,rdHO0LkiNe6s3716hPuQXQ,tTDxa8OzmxUkpVifQ-cEWw,t_EiW3FlMnFTHyoeVU79xg,xM8dVGLkYaL94EuAIkjMEA,y3IVqEFHmrkgVKj2x1Ci4w,yyGzYDh0Qa2o8vUGMEjDRA,zBeUDwWx73QTZ34A1l0adQ
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-InU2nAbC9AuS-Um2Cowgw,1.0,0.5,0.25,0.0,0.25,0.25,0.25,0.0,0.0,0.25,...,0.0,0.25,0.0,0.25,0.25,0.333333,0.0,0.0,0.0,0.25
0EyRe-VkW8gYxxZ7NCQXeQ,0.5,1.0,0.25,0.0,0.25,0.666667,0.333333,0.0,0.0,0.333333,...,0.0,0.666667,0.0,0.25,0.333333,0.166667,0.0,0.0,0.0,0.666667
2_7lYF6P2cYKnSiPhhVyPQ,0.25,0.25,1.0,0.0,0.25,0.25,0.25,0.0,0.0,0.5,...,0.5,0.25,0.0,0.75,0.25,0.666667,0.0,0.0,0.0,0.25
3gL18eXylqutlzqb6TmB0w,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4mpSNvmyG89Uqy2ahP4JMQ,0.25,0.25,0.25,0.0,1.0,0.25,0.25,0.0,0.0,0.25,...,0.0,0.25,0.0,0.25,0.25,0.166667,0.0,0.0,0.0,0.25


user_id,3FjdHwnoz-vfw6hBBtckqg,rCWrxuRC8_pfagpchtHp6A,9YIEYRTM1-esrPtfTVU4LA,_pLmakEC1U8UPpxCUk2ZAg,ZnIaEvJwrxlBUjimIpx2Qw,135DbbQnr3BEkQbBzZ9T1A,QGgWWhEi5R4SLAKN-xwtNQ,H5d_nFqzwrREE-YduK2ABg,9p_vj2TD8G4kYsHT48z05g,8AwcaBJjiMpQ__FPxktwwQ,...,NvJkV0wW-99rtSMflkkBtw,Jnnzne2sDt1WLB74WfJNvg,IE8-TRdT-Osv8w9fiHDZhQ,SW_GcV2C7--_VpKNeL3mhA,OrbH-f7wtNJk0O6wXkvqoA,lL9Sppyk-mzNGJ5Uloz5-g,fM8rythOmJGL-IRsiYdZqg,IjiXC9ekXkP1I9kdLWogEw,01vXmAJKQ41PVi2wgYDiPA,391zaCfgqkE_4WM3Up-z9Q
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dJ0R-XT78LUQeNHQkD-G9g,,,,,,,,,,,...,,,,,,,,,,
3gL18eXylqutlzqb6TmB0w,,,,,,,,,,,...,,,,,,,,,,
Q_0eGl-aElqHKukHvmLdwA,,,,,,4.0,,,,,...,,,,,,,,,,
Eu_zPTrNVAXkpdSxf7CJ2w,,,,,,,,,,,...,,,,,,,,,,
Yjf0i2J9q52dYIT8UVGT3g,,,,,,,,,5.0,,...,,,,,,,,,,


Unnamed: 0,user_id,business_id,stars,predicted stars
0,hXydWH25S92HjI5hmWRSyA,dJ0R-XT78LUQeNHQkD-G9g,5.0,5.0
1,vo6vLeHoPl_h-Vt-YHs9_A,Q_0eGl-aElqHKukHvmLdwA,1.0,1.0
2,i1qyYL4fpAel8Ljt4WaZ3g,Q_0eGl-aElqHKukHvmLdwA,1.0,1.0
3,dUOg3fS3RTYDZUpu0CqqiA,729grSa1Wsn-hfv7D5uOxg,5.0,5.0
4,k2Bsnh6CV0HFX8RgGR1p9A,Q_0eGl-aElqHKukHvmLdwA,5.0,5.0


(None, None, None)

In [10]:
'2lcK3d4K7FU6O8wXdWzOmA' in [data[i]['business_id'] for i in range(len(data)-1)]

False

In [11]:
tsting[tsting['stars'] != tsting['predicted stars']]


Unnamed: 0,user_id,business_id,stars,predicted stars
15,v2hPERrqGGxsfmT293q4mA,Q_0eGl-aElqHKukHvmLdwA,5.0,4.840000
18,v2hPERrqGGxsfmT293q4mA,729grSa1Wsn-hfv7D5uOxg,5.0,4.860724
20,wFFLWorAWh3hpxVLQP8YWw,729grSa1Wsn-hfv7D5uOxg,5.0,4.714286
21,21D8GYYY-NptvXhBb9x08Q,729grSa1Wsn-hfv7D5uOxg,5.0,4.879397
22,21D8GYYY-NptvXhBb9x08Q,Q_0eGl-aElqHKukHvmLdwA,4.0,4.344262
25,LIHlbBXL5LsOyNpQVAU3lg,729grSa1Wsn-hfv7D5uOxg,5.0,4.571429
26,1gJw6DxTrf0Zmp9gDgYlrQ,Eu_zPTrNVAXkpdSxf7CJ2w,5.0,4.647059
28,LIHlbBXL5LsOyNpQVAU3lg,Eu_zPTrNVAXkpdSxf7CJ2w,5.0,4.800000
34,W40B1zLVNT8dnNZQt90mpw,729grSa1Wsn-hfv7D5uOxg,4.0,3.973684
38,Y8AacNK1oloBnkTQ3CLlEA,Eu_zPTrNVAXkpdSxf7CJ2w,4.0,4.320000


In [12]:
# returns de hoogste value van 2e item van lijst met lijsten
def max_star_bid(inputlist):
    cpy = inputlist
    mx = max([sublist[-1] for sublist in cpy])
    for x in cpy:
        if x[-1] != mx:
            del cpy[cpy.index(x)]
    return cpy

In [80]:
# maak similarity matrix
def maak_sim_matrix():
    tlist = [[data[i]['business_id'], data[i]['categories'].split(', ')[x]] for i in range(len(data))  if data[i]['categories'] != None for x in range(len(data[i]['categories'].split(', ')))]
    catpd = pd.DataFrame(data=tlist, columns=['business_id', 'categories'])
    aye = catpd.pivot_table(index = 'business_id', columns = 'categories', aggfunc = 'size', fill_value=0)
    sim_matrix = create_similarity_matrix_categories(aye)
    return sim_matrix

In [117]:
def recommend(user_id=None, business_id=None, city=None, n=None):
    # similarity matrix maken
    sim_matrix = maak_sim_matrix()
    
    # maak een lijst van alle businesses waar de user is geweest
    already_been = [x['business_id'] for x in rdata if x['user_id'] == user_id]
    
    # als er een user_id wordt meegegeven maak persoonlijke aanbevelingen
    if user_id:
        # lijst met alle ratings die de user heeft gegeven
        ratinglist = [[x['business_id'], x['stars']] for x in rdata if x['user_id'] == user_id]
        
        # als de gebruiker geen reviews heeft geplaatst
        if not ratinglist:
            rand10 = random.sample([x['business_id'] for x in data], n)
            finlist = [{"business_id" : y['business_id'], "stars": y['stars'], "name": y['name'], "city" : y['city'], "address" : y['address']} for y in data for x in rand10 if y['business_id'] == x]
            return finlist
        
        # als de gebruiker wel reviews heeft geplaatst
        else:    
            # de hoogste rating
            max_list = max_star_bid(ratinglist)

            # random business met de hoogste rating
            random_business = random.choice(max_list)

            # series met alle similarities met random business
            top10 = sim_matrix[random_business[0]]

            # haal alle business_ids eruit waar de user is geweest 
            top10 = top10.drop(labels = already_been)

            # pak de top10 daarvan
            reclist = top10.nlargest(n+1).index.tolist()[1:]

            # maak de final list met dicts
            finlist = [{"business_id" : y['business_id'], "stars": y['stars'], "name": y['name'], "city" : y['city'], "address" : y['address']} for y in data for x in reclist if y['business_id'] == x]
            return finlist
    
    # als er geen user_id wordt meegegeven maar wel business_id
    elif business_id:       
        # series met alle similarities met business_id
        top10 = sim_matrix[business_id]
        
        # haal alle business_ids eruit waar de user is geweest
        top10 = top10.drop(labels = already_been)
        
        # pak de top10 daarvan
        reclist = top10.nlargest(n+1).index.tolist()[1:]
        
        # maak de final list met dicts
        finlist = [{"business_id" : y['business_id'], "stars": y['stars'], "name": y['name'], "city" : y['city'], "address" : y['address']} for y in data for x in reclist if y['business_id'] == x]
        return finlist
    
    # als er geen user_id en geen business_id wordt meegegeven pak 10 random business in stad
    else:
        rand10 = random.sample([x['business_id'] for x in data], n)
        finlist = [{"business_id" : y['business_id'], "stars": y['stars'], "name": y['name'], "city" : y['city'], "address" : y['address']} for y in data for x in rand10 if y['business_id'] == x]
        return finlist

In [120]:
recommend(user_id =None, business_id= '7xRbOtZUuw7DxOWrHeaanw', city = None, n=10)

[{'business_id': 'Q_0eGl-aElqHKukHvmLdwA',
  'stars': 4.0,
  'name': "Nelia's Smokehouse",
  'city': 'Ambridge',
  'address': '603 Duss Ave'},
 {'business_id': 'Eu_zPTrNVAXkpdSxf7CJ2w',
  'stars': 4.5,
  'name': 'K & N Restaurant',
  'city': 'Ambridge',
  'address': '755 Merchant St'},
 {'business_id': '729grSa1Wsn-hfv7D5uOxg',
  'stars': 4.5,
  'name': 'Pizza House',
  'city': 'Ambridge',
  'address': '1007 Merchant St'},
 {'business_id': '2_7lYF6P2cYKnSiPhhVyPQ',
  'stars': 3.0,
  'name': 'Orbit Inn',
  'city': 'Ambridge',
  'address': '1327 Merchant St'},
 {'business_id': '0EyRe-VkW8gYxxZ7NCQXeQ',
  'stars': 4.5,
  'name': "Fox's Pizza Den",
  'city': 'Ambridge',
  'address': '1204 Merchant St'},
 {'business_id': 'tTDxa8OzmxUkpVifQ-cEWw',
  'stars': 3.0,
  'name': "Ni's Wok",
  'city': 'Ambridge',
  'address': '703 Merchant St'},
 {'business_id': 'zBeUDwWx73QTZ34A1l0adQ',
  'stars': 2.0,
  'name': 'Vocelli Pizza',
  'city': 'Ambridge',
  'address': '447 Merchant St'},
 {'business_id

In [119]:
for x in data:
    print(x['business_id'])

dJ0R-XT78LUQeNHQkD-G9g
3gL18eXylqutlzqb6TmB0w
Q_0eGl-aElqHKukHvmLdwA
Eu_zPTrNVAXkpdSxf7CJ2w
Yjf0i2J9q52dYIT8UVGT3g
y3IVqEFHmrkgVKj2x1Ci4w
729grSa1Wsn-hfv7D5uOxg
rdHO0LkiNe6s3716hPuQXQ
iJhb_2JL1uIbIRYUl41uVg
NHhzHVKVizOvA61AyN-dSw
ftx72aqADrNYmOlMytdtCg
RdEb3U0CDIXElVJe3w5n6g
7xRbOtZUuw7DxOWrHeaanw
2_7lYF6P2cYKnSiPhhVyPQ
EZ9zSk4ld27LwgfANoF4VA
0EyRe-VkW8gYxxZ7NCQXeQ
4mpSNvmyG89Uqy2ahP4JMQ
tTDxa8OzmxUkpVifQ-cEWw
pcbVl6ZHDOJHsnetY0rJEQ
N92Pbr2ygKDLkjmr-4BAPw
gOu6hTevtDeTBNMOjo6fsw
zBeUDwWx73QTZ34A1l0adQ
EOmRHPvzR88a5R2j2uD0cQ
TsusMt8MhyWaQOVL-MLnjA
LHd0Y_0tIWWPvRqDb2b74Q
lawYwEXAE-Sq2nf6co7aBg
yyGzYDh0Qa2o8vUGMEjDRA
KQrgpqBmIsBbdAn6CnfgCA
c_3u2TfMWtt8OlRFYtVo_Q
t_EiW3FlMnFTHyoeVU79xg
-InU2nAbC9AuS-Um2Cowgw
KLptkOv3OiNSTU4unZv9Sg
GI1WxFbY9tJ9-ChRrwnrzg
muFJIZKZwbAfy_pEFKF_pw
XJfvPt-8f-6d5Foaz_HYLQ
xM8dVGLkYaL94EuAIkjMEA
