In [174]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
bizPATH = 'data/yelp_dataset/yelp_academic_dataset_business.json'
revPATH = 'data/yelp_dataset/yelp_academic_dataset_review.json'

In [169]:
class Business:
    
    def __init__(self, business_id, name, address, coordinate, stars, review_count, attributes, categories):
        self.business_id = business_id
        self.name = name
        self.address = address
        self.coordinate = coordinate
        self.stars = stars
        self.review_count = review_count
        self.attributes = attributes
        self.categories = categories
        
    def __repr__(self):
        return (
            f"Address: {self.address}\nCoordinate: {self.coordinate}\nStars: {self.stars}\nCategories: {self.categories}"
        )
    
    
def BusinessDecoder(obj):
    
    address = f"{obj['address']} {obj['city']} {obj['state']} {obj['postal_code']}"
    coordinate = (obj['latitude'], obj['longitude'])
    
    return Business(obj['business_id'], obj['name'], address, coordinate, obj['stars'], obj['review_count'], 
                    obj['attributes'], obj['categories'])

In [166]:
b1 = BusinessDecoder(obj)

In [167]:
b1.address

'10913 Bailey Rd Cornelius NC 28031'

In [168]:
print(b1)

Address: 10913 Bailey Rd Cornelius NC 28031
Coordinate: (35.4627242, -80.8526119)
Stars: 3.5
Categories: Active Life, Gun/Rifle Ranges, Guns & Ammo, Shopping


In [170]:
# first extracting all the biz id within 'bar' category and save it into a dict
# because dict is implemented in hashtable with mostly O(1) look up time


def bizCatFilter(path, cat = 'bar'):

    bizID = {}
    nameMapper = {}

    with open(path, 'r') as infile:
        for line in infile:
            
            data = json.loads(line)
            
            if data['categories']:
                if cat.lower() in data['categories'].lower():
                    if data['review_count'] >= 10: 
                        
                        nameMapper[data['name']] = data['business_id']
                        bizID[data['business_id']] = BusinessDecoder(data)
                        
    return bizID, nameMapper

In [123]:
type(json.loads(json.dumps(data)))

dict

In [171]:
bizID, nameMapper = bizCatFilter(bizPATH, cat = 'bar') 

In [280]:
path = revPATH
# stop = 100000

reviews = list()
with open(path, 'r') as infile:
    for i, line in enumerate(infile):
#         if i%(stop)==0:
#             print(i)
#         if i == stop:
#             break
            
        data = json.loads(line)
        
        if(data['business_id'] in bizID):
            
#             review_id = data['review_id']
            user_id = data['user_id']
            business_id = data['business_id']
            stars = data['stars']
            date = data['date']
#             text = data['text']
#             useful = data['useful']
#             funny = data['funny']
#             cool = data['cool']
            
            reviews.append([user_id, business_id, stars, date])
            #[review_id, user_id, business_id, stars, date, text, useful, funny, cool])
            

df = pd.DataFrame(reviews, columns = ['user_id', 'business_id', 'stars', 'date'])
#['review_id', 'user_id', 'business_id', 'stars', 'date', 'text', 'useful', 'funny', 'cool'])

In [281]:
df.date = pd.to_datetime(df.date)
df = df[df['date'] > '2016']
df = df.drop(columns='date')

In [282]:
df = df.drop_duplicates(subset=['business_id', 'user_id'], keep='last')

In [283]:
treshold = 10
df = df.groupby('user_id').filter(lambda x : len(x) >= treshold)
df = df.groupby('business_id').filter(lambda x : len(x) >= treshold)

In [284]:
revs = df.pivot(
    index = 'business_id',
    columns = 'user_id',
    values = 'stars'
).fillna(0)

In [285]:
#scipy sparse matrix used to avoid improve efficiency
matMapper = {v:k for k, v in enumerate(revs.index)}
mat_revs = csr_matrix(revs.values)

In [286]:
revs.shape

(6197, 12680)

In [109]:
(revs != 0).sum()

user_id
--2HUmLkcNHZp0xw6AMBPg    17
--BumyUHiO_7YsHurb9Hkw    12
--Nnm_506G_p8MxAOQna5w    13
--Qh8yKWAvIP4V4K8ZPfHA    13
--YhjyV-ce1nFLYxP49C5A    16
                          ..
zyebSPCZLUZHapi-dSHU5Q    20
zyh_AzbO1JNnhywem3hUPg    10
zz25_Dsf99KnwTpdl4-PDQ    10
zzmhLxcZ4XZQyz95c_KbOA    15
zzo--VpSQh8PpsGVeMC1dQ    14
Length: 10421, dtype: int64

In [114]:
revs

user_id,--2HUmLkcNHZp0xw6AMBPg,--BumyUHiO_7YsHurb9Hkw,--Nnm_506G_p8MxAOQna5w,--Qh8yKWAvIP4V4K8ZPfHA,--YhjyV-ce1nFLYxP49C5A,--ZNfWKj1VyVElRx6-g1fg,--cd_gA-9Q8gM9P2cTxEsQ,-0-hVEpwWEcJLJoGq3rE3g,-0Xu57zrI3Rxi8wGZsnWKQ,-0b84SUGVN0YkG5j2MCmBw,...,zx3y74_pvIRuQSVIrgzCew,zxRHyxQm-32j5Z7Pi7bHCA,zxmIKyEX89aY0QLC1JJksA,zy4A7504SezncCAcotMv4g,zyMB50UtmEuQf-1VUKxZxQ,zyebSPCZLUZHapi-dSHU5Q,zyh_AzbO1JNnhywem3hUPg,zz25_Dsf99KnwTpdl4-PDQ,zzmhLxcZ4XZQyz95c_KbOA,zzo--VpSQh8PpsGVeMC1dQ
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--FBCX-N37CMYDfs790Bnw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--cjBEbXMI2obtaRHNSFrA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-01XupAWZEXbdNbxNg5mEg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0NrB58jqKqJfuUCDupcsw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0WegMt6Cy966qlDKhu6jA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxsktAMsVBKj2PvvCV11UQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyPGYeXF4XKCqNN1pjFWhg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyw5DjrRks7a8OhmBsgCQQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzUj3ej4vm_DtvRxNvWDEw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# CF K-NN model

In [110]:
from sklearn.neighbors import NearestNeighbors

In [287]:
# making model class
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

# fit the sparse matrix
knn.fit(mat_revs)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [276]:
def make_recommendation(model, data,  nameMapper, matMapper, bizID, fav_bar, n_recommendations):
    
    #fit
    model.fit(data)
    
    print(f"You favorite bar is: {fav_bar}.")
    print(bizID[nameMapper[fav_bar]])
    print('>'*10)
    
    bar_ID = nameMapper[fav_bar]
    
    print('Recommendation system starts to make inference')
    print('......\n')
    distances, indices = model.kneighbors(data[matMapper[bar_ID]], n_neighbors = n_recommendations+1)
    
    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())),
                           key=lambda x:x[1])[:0:-1]
    
    #get reverse mappper
    reverse_mapper = {v: k for k, v in matMapper.items()}
    
    #print recommendations
    print(f'Recommendations for customers at {fav_bar}:')
    for i, (idx, dist) in enumerate(raw_recommends):
        match = bizID[reverse_mapper[idx]]
        print(f'{i+1} : {match.name, dist}')
        print(match)
        print('='*20)

In [288]:
# np.random.seed(20)
idx = revs.index[np.random.randint(revs.shape[0])]
name = bizID[idx].name
print(name)

Tarbell's The Tavern


In [289]:
bizID[idx]

Address: 3209 E Camelback Rd Phoenix AZ 85018
Coordinate: (33.5091229429, -112.0126287829)
Stars: 4.0
Categories: Cocktail Bars, American (Traditional), Bars, Nightlife, Wine Bars, Lounges, American (New), Mexican, Restaurants

In [290]:
make_recommendation(
    model = knn, 
    data = mat_revs, 
    nameMapper = nameMapper, 
    matMapper = matMapper,
    bizID = bizID, 
    fav_bar = name, 
    n_recommendations = 10)

You favorite bar is: Tarbell's The Tavern.
Address: 3209 E Camelback Rd Phoenix AZ 85018
Coordinate: (33.5091229429, -112.0126287829)
Stars: 4.0
Categories: Cocktail Bars, American (Traditional), Bars, Nightlife, Wine Bars, Lounges, American (New), Mexican, Restaurants
>>>>>>>>>>
Recommendation system starts to make inference
......

Recommendations for customers at Tarbell's The Tavern:
1 : ('Hillstone Restaurant', 0.8839822506321126)
Address: 2650 E Camelback Rd Phoenix AZ 85016
Coordinate: (33.5101434, -112.0243027)
Stars: 4.0
Categories: Nightlife, Restaurants, Sushi Bars, Jazz & Blues, Arts & Entertainment, Steakhouses, American (New)
2 : ('Mora Italian', 0.8838927214306364)
Address: 5651 N 7th St Phoenix AZ 85014
Coordinate: (33.5198524, -112.064637)
Stars: 4.0
Categories: Bars, Nightlife, Pizza, Salad, Restaurants, Italian
3 : ('CEDAR ROOM Fine Cigars & Lounge', 0.8821757045026125)
Address: 20715 N Pima Rd, Ste F100 Scottsdale AZ 85255
Coordinate: (33.6738829, -111.8881123)
Star

In [83]:
df.reset_index(drop=True, inplace = True)

In [21]:
df.user_id.nunique()

13588

In [22]:
df.business_id.nunique()

13033

In [45]:
df = df.set_index('business_id').drop_duplicates(subset)

In [64]:
(df.user_id.value_counts() < 10).sum()

0

In [68]:
df['business_id'] = df.index

In [76]:
(df.drop_duplicates(subset=['business_id', 'user_id'], keep='last').business_id.value_counts() > 10).sum()

5904

In [42]:
(df[df.duplicated(subset=['business_id', 'user_id'])].groupby(['user_id', 'business_id'])['stars'].count() > 1).sum()

1666