In [1]:
import time
import json
import re
import os 
import tensorflow as tf
import pandas as pd
import numpy as np
from itertools import compress
import matplotlib.pyplot as plt

In [2]:
def json_to_csv(directory, fileNames, createSample=False):
    """
    json_to_csv: loops through specified JSON files and converts them to csv files.
                 option to also create a sample csv, which uses np.random.seed 9001 to create a sample dataset with 10% of the observations
    
                 pandas has a read_json function, but returns a 'Trailing data error' when working with these specific files
                 
                 
    Inputs: -directory of JSON files
            -list of JSON filenames
            -createSample flag
            
    """
    
    start = time.time()

    jsonData = []

    for fileName in fileNames:
        with open(directory + fileName,  encoding="utf8") as file:
            print('{0} opened'.format(fileName))
            for line in file:
                #I use an rstrip here because some of the files have trailing blank spaces
                jsonData.append(json.loads(line.rstrip()))
        
        df = pd.DataFrame.from_dict(jsonData)
        
        csvFileName = fileName[:len(fileName)-5] + '.csv'
        
        df.to_csv(directory + csvFileName)
        print('{0} created'.format(csvFileName))
        
        
        if createSample:
            np.random.seed(9001)
            msk = np.random.rand(len(df)) <= 0.1
            sample = df[msk]
            
            csvSampleFileName = fileName[:len(fileName)-5] + '_sample.csv'
            
            sample.to_csv(directory + csvSampleFileName)
            print('{0} created'.format(csvSampleFileName))
        
    print('This function took {} minutes to run'.format((time.time()-start)/60))
    

In [3]:
# fileNameList = ['user.json',
#                 'business.json',
#                 'review.json']

# json_to_csv('data/', fileNameList, createSample=True)

In [4]:
df_business = pd.read_json('data/business.json', lines=True)
df_business.dropna(inplace=True, subset = ['categories'], axis=0)
df_business.loc[df_business['categories'].str.contains('Restaurants')]
df_business['categories'].value_counts()

Restaurants, Pizza                                                                                                                                                                              1042
Nail Salons, Beauty & Spas                                                                                                                                                                      1031
Pizza, Restaurants                                                                                                                                                                               993
Beauty & Spas, Nail Salons                                                                                                                                                                       947
Food, Coffee & Tea                                                                                                                                                                               888
Mexican, Restau

In [5]:
df_business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


In [6]:
df_business.count()

address         192127
attributes      163773
business_id     192127
categories      192127
city            192127
hours           147778
is_open         192127
latitude        192127
longitude       192127
name            192127
postal_code     192127
review_count    192127
stars           192127
state           192127
dtype: int64

In [32]:
# reading filter_review 
df_review = pd.read_csv('data/filtered_reviews.csv', index_col=0)
df_review.dropna

<bound method DataFrame.dropna of                     business_id  stars                 user_id
0        ujmEBvifdJM6h6RLv4wQIg    1.0  hG7b0MtEbXx5QzbzE6C_VA
2        WTqjgwHlXbSFevF32_DJVw    5.0  n6-Gk65cPZL6Uz8qRm3NYw
3        ikCg8xy5JIg_NGPx-MSIDA    5.0  dacAIZ6fTM6mqwW5uxkskg
6        3fw2X5bZYeW9xCz_zGhOHg    3.0  jlu4CztcSxrKx56ba1a5AQ
7        zvO-PJCpNk4fgAVUnExYAA    1.0  d6xvYpyzcfbF_AZ8vMB7QA
8        b2jN2mm9Wf3RcrZCgfo1cg    2.0  sG_h0dIzTKWa3Q6fmb4u-g
9        oxwGyA17NL6c5t1Etg5WgQ    3.0  nMeCE5-xsdleyxYuNZ_7rA
10       8mIrX_LrOnAqWsB5JrOojQ    4.0  FIk4lQQu1eTe2EpzQ4xhBA
12       FxLfqxdYPA6Z85PFKaqLrg    4.0  GYNnVehQeXjty0xH7-6Fhw
14       AakkkTuGZA2KBodKi2_u8A    1.0  TpyOT5E16YASd7EWjLQlrw
15       YvrylyuWgbP90RgMqZQVnQ    5.0  NJlxGtouq06hhC7sS2ECYw
16       NyLYY8q1-H3hfsTwuwLPCg    4.0  86J5DwcFk4f4In1Vxe2TvA
17       cHdJXLlKNWixBXpDwEGb_A    3.0  JSrP-dUmLlwZiI7Dp3PQ2A
18       6lj2BJ4tJeu7db5asGHQ4w    5.0  6Fz_nus_OG4gar721OKgZA
19       y-Iw6dZflNix

In [33]:
# inumerating business_in and user_id with bid and uid 
def build_fmap_invmap(ser):
    uni_ele = ser.unique()
    fmap = {v:i for i, v in enumerate(uni_ele)}
    invmap = {i:v for i, v in enumerate(uni_ele)}
    return fmap, invmap

In [34]:
# setting debuging enviroment on (dbg =1) to turn it off (dbg = 0)
dbg = 1
if dbg:
    df_review = df_review.head(100000)

In [35]:
bus_fmap, bus_invmap = build_fmap_invmap(df_review['business_id'])
u_fmap, u_invmap = build_fmap_invmap(df_review['user_id'])

In [36]:
df_review['bid'] = df_review['business_id'].map(bus_fmap)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [37]:
df_review['uid'] = df_review['user_id'].map(u_fmap)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
df_review.head()

Unnamed: 0,business_id,stars,user_id,bid,uid
0,ujmEBvifdJM6h6RLv4wQIg,1.0,hG7b0MtEbXx5QzbzE6C_VA,0,0
2,WTqjgwHlXbSFevF32_DJVw,5.0,n6-Gk65cPZL6Uz8qRm3NYw,1,1
3,ikCg8xy5JIg_NGPx-MSIDA,5.0,dacAIZ6fTM6mqwW5uxkskg,2,2
6,3fw2X5bZYeW9xCz_zGhOHg,3.0,jlu4CztcSxrKx56ba1a5AQ,3,3
7,zvO-PJCpNk4fgAVUnExYAA,1.0,d6xvYpyzcfbF_AZ8vMB7QA,4,4


In [39]:
n_users, n_bus = df_review['uid'].nunique(), df_review['bid'].nunique()

In [40]:
n_dim = 5

In [41]:
# Initializing tensor flow at a randon number | n_users * n_dim (layers) initializing at some random number between
# -1 to 1 both for business and users. For internal layer. 
# PS > Create a function with code below: 

user_vector_raw = tf.Variable(tf.random_uniform([n_users, n_dim], minval = -1., maxval = 1.))
bus_vector_raw = tf.Variable(tf.random_uniform([n_bus, n_dim], minval = -1., maxval = 1.))

# running the tanh function to find 
user_vector = tf.tanh(user_vector_raw)
bus_vector = tf.tanh(bus_vector_raw)

# Stipulating the imput layer. 
users = tf.placeholder(tf.int32, shape=(None))
businesses = tf.placeholder(tf.int32, shape=(None))
ratings = tf.placeholder(tf.float32, shape=(None))

UserSampled = tf.nn.embedding_lookup(user_vector, users)
BusinessSampled = tf.nn.embedding_lookup(bus_vector, businesses)
UserSampled.set_shape([None, n_dim])
BusinessSampled.set_shape([None, n_dim])

# input tensors for products, users, ratings

In [42]:
# Defining the output
# transfer into a fucntion
estimatedaffinitiesraw = tf.reduce_sum(UserSampled * BusinessSampled, 1)
estimatedaffinities = tf.sigmoid(estimatedaffinitiesraw)*5

In [43]:
# estimatedaffinities - ratings ask Lee to clarify ratings, where that ratings comes from? ask to explain the loss function 
# transfer into a function 
loss = tf.reduce_sum(tf.square(estimatedaffinities - ratings))
opt = tf.train.RMSPropOptimizer(learning_rate=.1).minimize(loss)

In [44]:
# Setting the session and intialize it 

sess = tf.Session()

In [45]:
# picking up 64 randon rows in order to run under memory capacity
rows = np.random.choice(df_review.shape[0], 64)

In [46]:
sess.run(tf.global_variables_initializer())

In [47]:
# Creating a loop to train under 64 random rows 
for i in range(10000):
    rows = np.random.choice(df_review.shape[0], 64)
    dfrows = df_review.iloc[rows]
    fd = {users:dfrows['uid'].values,
         businesses:dfrows['bid'].values,
         ratings:dfrows['stars'].values}
    _, l2loss = sess.run([opt, loss], fd)
    if i % 1000 == 0:
        print(l2loss)

297.09717
145.57057
112.433044
106.03503
56.06781
97.63747
53.982788
30.757208
35.38064
22.160257


In [48]:
user_values, bus_values = sess.run([user_vector, bus_vector])

In [49]:
bus_vec_df = pd.DataFrame(data = bus_values, index = 
                          [bus_invmap[i] for i in range(n_bus)])



In [50]:
bus_vec_df

Unnamed: 0,0,1,2,3,4
ujmEBvifdJM6h6RLv4wQIg,-0.611548,-0.630207,0.999998,-0.999997,0.999999
WTqjgwHlXbSFevF32_DJVw,0.999100,0.061038,-0.588499,-0.227964,-0.999544
ikCg8xy5JIg_NGPx-MSIDA,0.624818,-0.992256,0.972830,0.974758,0.956552
3fw2X5bZYeW9xCz_zGhOHg,0.319261,0.924857,0.681026,-0.925611,0.162160
zvO-PJCpNk4fgAVUnExYAA,0.981950,-0.313555,-0.174376,-0.775823,-0.936538
b2jN2mm9Wf3RcrZCgfo1cg,0.999949,0.999313,0.998870,0.998292,0.807768
oxwGyA17NL6c5t1Etg5WgQ,1.000000,-0.779925,0.383969,0.999836,0.867550
8mIrX_LrOnAqWsB5JrOojQ,-0.999998,-1.000000,0.048966,1.000000,-1.000000
FxLfqxdYPA6Z85PFKaqLrg,0.155298,-0.868616,-0.921915,-0.172627,0.146536
AakkkTuGZA2KBodKi2_u8A,0.521653,-0.802679,0.764271,-0.999562,-0.295074


In [51]:
# Joining df_business + bus_vec_df
df_allBusiness = df_business.join(bus_vec_df, on='business_id', how='right')

In [56]:
df_allBusiness = df_allBusiness.dropna()

In [57]:
# Pulling user ID 4 and comparing to inverse map on uid (not sure why, maybe to check accuracy?)
uid = 4
u_invmap[uid]


'd6xvYpyzcfbF_AZ8vMB7QA'

In [58]:
bname = 'cHdJXLlKNWixBXpDwEGb_A'
bid = bus_fmap[bname]

In [53]:
df_allBusiness

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,0,1,2,3,4
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1.0,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128.0,2.5,ON,-0.243087,0.329378,-0.692765,0.136390,-0.999883
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1.0,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170.0,4.0,NC,-0.999965,0.999997,0.999998,-0.093614,0.999991
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1.0,35.190012,-80.887223,Queen City Plumbing,28217,4.0,4.0,NC,-0.596824,0.302547,0.520837,0.367589,0.634690
7,"4545 E Tropicana Rd Ste 8, Tropicana","{'RestaurantsPriceRange2': '3', 'GoodForKids':...",gbQN7vr_caG_A1ugSmGhWg,"Hair Salons, Hair Stylists, Barbers, Men's Hai...",Las Vegas,"{'Monday': '10:0-19:0', 'Tuesday': '10:0-19:0'...",1.0,36.099872,-115.074574,Supercuts,89121,3.0,3.5,NV,-0.998160,-0.994602,0.993799,-0.082151,0.974395
9,21689 Lorain Rd,"{'ByAppointmentOnly': 'False', 'BusinessAccept...",4GBVPIYRvzGh4K4TkRQ_rw,"Beauty & Spas, Nail Salons, Day Spas, Massage",Fairview Park,"{'Tuesday': '9:0-21:0', 'Wednesday': '9:0-21:0...",1.0,41.440825,-81.854097,Options Salon & Spa,44126,8.0,4.5,OH,0.037647,0.802792,0.433683,-0.833566,0.582374
11,2450 E Indian School Rd,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...",1Dfx3zM-rW4n-31KeC8sJg,"Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,"{'Monday': '7:0-0:0', 'Tuesday': '7:0-0:0', 'W...",1.0,33.495194,-112.028588,Taco Bell,85016,18.0,3.0,AZ,0.670933,-0.071793,0.076168,-0.341265,0.326735
12,"119 Landings Dr, Ste 101","{'BusinessParking': '{'garage': False, 'street...",5t3KVdMnFgAYmSl1wYLhmA,"Bars, Nightlife, Pubs, Barbers, Beauty & Spas,...",Mooresville,"{'Monday': '10:0-1:0', 'Tuesday': '10:0-1:0', ...",1.0,35.527410,-80.868003,The Kilted Buffalo Langtree,28117,9.0,3.5,NC,0.986491,0.954114,0.995568,-0.993592,-0.985890
13,5981 Andrews Rd,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...",fweCYi8FmbJXHCqLnwuk8w,"Italian, Restaurants, Pizza, Chicken Wings",Mentor-on-the-Lake,"{'Monday': '10:0-0:0', 'Tuesday': '10:0-0:0', ...",1.0,41.708520,-81.359556,Marco's Pizza,44060,16.0,4.0,OH,-0.973523,-0.986569,-0.958119,-0.933494,-0.613278
14,4145 Erie St,"{'RestaurantsTakeOut': 'True', 'BusinessParkin...",-K4gAv8_vjx8-2BxkVeRkA,"Bakeries, Food",Willoughby,"{'Tuesday': '11:0-17:0', 'Wednesday': '11:0-17...",1.0,41.639860,-81.406396,Baby Cakes,44094,7.0,3.0,OH,-0.314476,-0.659896,0.195082,0.119009,-0.620532
18,,{'BusinessAcceptsCreditCards': 'True'},nh_kQ16QAoXWwqZ05MPfBQ,"Event Planning & Services, Photographers, Prof...",Las Vegas,"{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1.0,36.116549,-115.088115,Myron Hensel Photography,89121,21.0,5.0,NV,-0.999815,-0.999885,0.999867,-0.999669,0.999955


In [54]:
df_allBusiness.count()

address         9895
attributes      9895
business_id     9895
categories      9895
city            9895
hours           9895
is_open         9895
latitude        9895
longitude       9895
name            9895
postal_code     9895
review_count    9895
stars           9895
state           9895
0               9895
1               9895
2               9895
3               9895
4               9895
dtype: int64

In [60]:
df_allBusiness.loc[df_allBusiness['categories'].str.contains('Restaurant') & 
           df_allBusiness['categories'].str.contains('Japanese')]

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,0,1,2,3,4
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1.0,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170.0,4.0,NC,-0.999965,0.999997,0.999998,-0.093614,0.999991
73,436 Market St,"{'OutdoorSeating': 'False', 'HasTV': 'True', '...",v-scZMU6jhnmV955RSzGJw,"Japanese, Sushi Bars, Restaurants",Pittsburgh,"{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...",1.0,40.441062,-80.002126,No. 1 Sushi Sushi,15222,106.0,4.5,PA,0.999688,0.990525,0.995713,0.990283,-0.999986
80,529-17th Avenue SW,"{'WiFi': ''free'', 'GoodForDancing': 'False', ...",PkDghu4aan2_wxrhXjTEgg,"Nightlife, Italian, Restaurants, Japanese, Lou...",Calgary,"{'Tuesday': '14:0-23:0', 'Wednesday': '14:0-23...",0.0,51.037778,-114.073351,MiraKuru,T2S 0A9,16.0,3.5,AB,0.851759,-0.739656,-0.455840,-0.356938,-0.442973
185,2945 Lake Shore Boulevard,"{'RestaurantsDelivery': 'True', 'HasTV': 'True...",SJBzyJDCR_f6dx5tpYAABA,"Sushi Bars, Japanese, Restaurants",Toronto,"{'Monday': '16:0-22:0', 'Tuesday': '11:30-22:0...",1.0,43.600523,-79.505516,Kibo Sushi House,M8V 1J5,15.0,4.0,ON,0.995017,-0.050886,0.996406,-0.364973,0.992534
343,10624 S Eastern Ave,"{'RestaurantsTakeOut': 'True', 'NoiseLevel': '...",jX9DocoiY4Bo9EUkaTSqvg,"Restaurants, Hawaiian, Chinese, Japanese, Poke...",Henderson,"{'Monday': '0:0-0:0', 'Tuesday': '11:0-19:0', ...",1.0,35.998220,-115.102246,China AAA,89052,149.0,4.5,NV,0.999357,0.984174,-0.999998,0.997881,-0.999999
346,1909 E Ray Rd,"{'RestaurantsDelivery': 'False', 'RestaurantsT...",ecJri9ozyke4dOCWulZiRQ,"Asian Fusion, Japanese, Restaurants, Ramen, Ta...",Chandler,"{'Monday': '17:0-21:30', 'Tuesday': '17:0-21:3...",1.0,33.320006,-111.809675,Nishikawa Ramen,85225,427.0,4.0,AZ,-0.999871,-0.999999,1.000000,0.999999,0.939268
444,3339 Boulevard des Sources,"{'RestaurantsDelivery': 'False', 'RestaurantsA...",4B8VnRAstRRshxiUzm9yPw,"Restaurants, Sushi Bars, Japanese",Dollard-des-Ormeaux,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-23:0'...",1.0,45.484315,-73.795652,Maiko Sushi - DDO,H9B 1Z8,51.0,4.0,QC,-0.999904,-0.999847,-0.999840,0.999445,-0.998962
507,"6989 N Hayden Rd, Ste A12","{'RestaurantsAttire': ''casual'', 'OutdoorSeat...",QS3QxI7u5PRdtbGgI0-UsA,"Asian Fusion, Restaurants, Seafood, Sushi Bars...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '17:30-21:0',...",1.0,33.538029,-111.905676,Sakana Sushi & Grill,85250,347.0,4.0,AZ,0.999998,0.999984,-0.999996,-0.845985,1.000000
524,338-8338 18 Street SE,"{'GoodForMeal': '{'dessert': False, 'latenight...",bPBZEDuHbE-I7bxUWIYMhQ,"Japanese, Restaurants, Sushi Bars",Calgary,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1.0,50.977178,-114.013257,Oishii Sushi,T2C 4E4,78.0,3.5,AB,0.996360,-0.792342,0.899589,0.300949,-0.113288
577,1418 Rue Cartier,"{'RestaurantsPriceRange2': '3', 'OutdoorSeatin...",9ELnhtgMF8_h8Zky4A7BSA,"Restaurants, Japanese",Montréal,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",0.0,45.523226,-73.551979,Saiken Sushi,H2K 4C8,4.0,3.0,QC,-0.448920,0.399883,-0.611769,0.893831,-0.314365


# Testing | Passing Train 

In [61]:
bid

12

In [62]:
chipotle = bus_values[bid]

In [63]:
chipotle

array([ 0.9997464 , -0.31959277, -1.        , -0.99998826, -0.9075526 ],
      dtype=float32)

In [64]:
japaneselover = user_values[uid]

In [65]:
bus_values

array([[-0.6115483 , -0.63020724,  0.9999982 , -0.9999971 ,  0.99999905],
       [ 0.99910015,  0.06103822, -0.5884991 , -0.22796388, -0.99954355],
       [ 0.6248183 , -0.9922559 ,  0.97283036,  0.9747577 ,  0.9565522 ],
       ...,
       [-0.73892975, -0.63080984,  0.7855097 , -0.25191945,  0.1082809 ],
       [ 0.99561906,  0.99345046, -0.995815  ,  0.9964896 ,  0.99650115],
       [ 0.3404476 ,  0.86906964, -0.11090262,  0.20922092,  0.76553243]],
      dtype=float32)

In [66]:
np.square(bus_values - japaneselover[None,:]).sum(1).argsort()

array([12564,  7053,  9572, ...,   920,  3937,  3289])

In [67]:
np.square(bus_values - chipotle[None,:]).sum(1).argsort()

array([   12, 12315,  6907, ...,  2784,  3616,  1605])

In [70]:
def closest_businesses_to(business = None, user = None, df = None):
    if business is not None:
        target = bus_values[bus_fmap[business]]
    if user is not None:
        target = user_values[u_fmap[user]]
    if df is None:
        df = bus_values
    best_restaurants = np.square(df - target[None,:]).sum(1).argsort()
    return best_restaurants

In [71]:
midtown_japanese_restaurants = bus_values[:30,:]

In [72]:
closest_businesses_to(business = 'cHdJXLlKNWixBXpDwEGb_A')


array([   12, 12315,  6907, ...,  2784,  3616,  1605])

In [73]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ')

array([10102,  5017, 10504, ...,  7433,  1237,  1805])

In [75]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ', df = midtown_japanese_restaurants)

array([24,  8, 14,  0, 10, 26, 18, 25, 23, 27,  9, 13, 21, 19, 15, 16, 12,
        4,  3, 28, 11, 29,  6,  7,  1,  2, 20, 17,  5, 22])

# Dropping NaN 

In [144]:
df_userSample = pd.read_csv('data/user_sample.csv')
df_userSample = df_userSample.dropna()

In [145]:
df_userSample.count()

Unnamed: 0            7229
average_stars         7229
compliment_cool       7229
compliment_cute       7229
compliment_funny      7229
compliment_hot        7229
compliment_list       7229
compliment_more       7229
compliment_note       7229
compliment_photos     7229
compliment_plain      7229
compliment_profile    7229
compliment_writer     7229
cool                  7229
elite                 7229
fans                  7229
friends               7229
funny                 7229
name                  7229
review_count          7229
useful                7229
user_id               7229
yelping_since         7229
dtype: int64

In [146]:
def find_ftres_with_nan(df):
    all_nan = df.columns[df.isnull().all()].tolist()
    some_nan = df.columns[df.isnull().any()].tolist()
    print("All NaN Features: ", len(all_nan), all_nan, "Some NaN Features: ", len(some_nan), some_nan)
    return all_nan, some_nan

In [147]:
business = pd.read_csv('data/business.csv',encoding = "ISO-8859-1",index_col=0)
all_nan, some_nan = find_ftres_with_nan(business)

All NaN Features:  0 [] Some NaN Features:  33 ['address', 'attributes', 'average_stars', 'business_id', 'categories', 'city', 'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot', 'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos', 'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool', 'elite', 'fans', 'friends', 'funny', 'hours', 'is_open', 'latitude', 'longitude', 'name', 'postal_code', 'stars', 'state', 'useful', 'user_id', 'yelping_since']


### Number of businesses that have both "food" and "restaurant" in their category:

In [148]:

# create a mask for restaurants

mask_restaurants = business['categories'].str.contains('Restaurants')

# create a mask for food
mask_food = business['categories'].str.contains('Food')

# apply both masks
restaurants_and_food = business[mask_restaurants & mask_food]

# number of businesses that have food and restaurant in their category
restaurants_and_food['categories'].count()

21311

### Even after taking buisnesses that have both food and restaurant in their categories, there are still irrelevant business categories in the data.

In [149]:
# an example row
restaurants_and_food.head(1)['categories'].values

array(['Specialty Food, Restaurants, Dim Sum, Imported Food, Food, Chinese, Ethnic Food, Seafood'],
      dtype=object)

### Thus, we manually identified additional categories that needed to be excluded specifically.

In [150]:
categoryDF = restaurants_and_food['categories'].apply(lambda x: x[1:-1].split(',')).apply(pd.Series)
uniqueCategories = pd.DataFrame(categoryDF.stack().str.strip().unique())

In [151]:
categoriesToRemove = ['Grocery','Drugstores','Convenience Stores','Beauty & Spas','Photography Stores & Services',
                      'Cosmetics & Beauty Supply','Discount Store','Fashion','Department Stores','Gas Stations',
                      'Automotive','Music & Video','Event Planning & Services','Mobile Phones','Health & Medical',
                      'Weight Loss Centers','Home & Garden','Kitchen & Bath','Jewelry',"Children's Clothing",
                      'Accessories','Home Decor','Bus Tours','Auto Glass Services','Auto Detailing',
                      'Oil Change Stations', 'Auto Repair','Body Shops','Car Window Tinting','Car Wash',
                      'Gluten-Free','Fitness & Instruction','Nurseries & Gardening','Wedding Planning',
                      'Embroidery & Crochet','Dance Schools','Performing Arts',
                      'Wholesale Stores','Tobacco Shops','Nutritionists','Hobby Shops','Pet Services',
                      'Electronics','Plumbing','Gyms','Yoga','Walking Tours','Toy Stores','Pet Stores',
                      'Pet Groomers','Vape Shops','Head Shops',
                      'Souvenir Shops','Pharmacy','Appliances & Repair','Wholesalers','Party Equipment Rentals',
                      'Tattoo','Funeral Services & Cemeteries','Sporting Goods','Dog Walkers',
                      'Pet Boarding/Pet Sitting','Scavenger Hunts','Contractors','Trainers', 
                      'Customized Merchandise', 'Dry Cleaning & Laundry', 'Art Galleries'
                      'Tax Law', 'Bankruptcy Law', 'Tax Services', 'Estate Planning Law', 
                      'Business Consulting', 'Lawyers', 'Pet Adoption', 'Escape Games', 
                      'Animal Shelters', 'Commercial Real Estate', 'Real Estate Agents', 
                      'Real Estate Services', 'Home Inspectors']

In [152]:

restaurants_df = restaurants_and_food[~restaurants_and_food['categories'].str.contains('|'.join(categoriesToRemove))]

In [153]:
restaurants_df.to_csv('data/restaurants.csv')
restaurants_df = pd.read_csv('data/restaurants.csv', encoding='ISO-8859-1', index_col=0)
restaurants_df = restaurants_df.dropna(axis=1)

In [154]:
restaurants_df.head()

Unnamed: 0,business_id,categories,city,is_open,latitude,longitude,name,review_count,stars,state
1637139,QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,1.0,43.605499,-79.652289,Emerald Chinese Restaurant,128,2.5,ON
1637149,1Dfx3zM-rW4n-31KeC8sJg,"Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,1.0,33.495194,-112.028588,Taco Bell,18,3.0,AZ
1637161,1RHY4K3BD22FK7Cfftn8Mg,"Sandwiches, Salad, Restaurants, Burgers, Comfo...",Pittsburgh,1.0,40.496177,-80.246011,Marathon Diner,35,4.0,PA
1637163,tstimHoMcYbkSC4eBA1wEg,"Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,1.0,36.195615,-115.040529,Maria's Mexican Restaurant & Bakery,184,4.5,NV
1637167,NDuUMJfrWk52RA-H-OtrpA,"Juice Bars & Smoothies, Food, Restaurants, Fas...",Toronto,1.0,43.642889,-79.425429,Bolt Fresh Bar,57,3.0,ON


# Building 

In [155]:
#task 1
def get_restaurants(keyword):
    return restaurants_df

In [156]:
get_restaurants('Japanese')

Unnamed: 0,business_id,categories,city,is_open,latitude,longitude,name,review_count,stars,state
1637139,QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,1.0,43.605499,-79.652289,Emerald Chinese Restaurant,128,2.5,ON
1637149,1Dfx3zM-rW4n-31KeC8sJg,"Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,1.0,33.495194,-112.028588,Taco Bell,18,3.0,AZ
1637161,1RHY4K3BD22FK7Cfftn8Mg,"Sandwiches, Salad, Restaurants, Burgers, Comfo...",Pittsburgh,1.0,40.496177,-80.246011,Marathon Diner,35,4.0,PA
1637163,tstimHoMcYbkSC4eBA1wEg,"Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,1.0,36.195615,-115.040529,Maria's Mexican Restaurant & Bakery,184,4.5,NV
1637167,NDuUMJfrWk52RA-H-OtrpA,"Juice Bars & Smoothies, Food, Restaurants, Fas...",Toronto,1.0,43.642889,-79.425429,Bolt Fresh Bar,57,3.0,ON
1637170,SP_YXIEwkFPPl_9anCYmpQ,"Restaurants, Nightlife, Breakfast & Brunch, Ve...",Toronto,0.0,43.660494,-79.432099,The Steady Cafe & Bar,29,3.5,ON
1637196,_J_x_RaYTqAqAuCwgRhnRQ,"Coffee & Tea, Hookah Bars, Nightlife, Persian/...",Charlotte,0.0,35.172028,-80.746801,Kabob House,15,3.0,NC
1637213,kANF0dbeoW34s2vwh6Umfw,"Fast Food, Food, Restaurants, Ice Cream & Froz...",Las Vegas,0.0,36.125031,-115.225620,Dairy Queen,33,2.0,NV
1637219,eXgDqKD7-sojyzh6q7PJ4g,"Sandwiches, Desserts, Custom Cakes, Bakeries, ...",Etobicoke,1.0,43.637329,-79.531166,The Sweet Gallery,20,4.0,ON
1637253,qJeSjOMgWB3er3UXG33ZVw,"American (Traditional), Breakfast & Brunch, Re...",Calgary,1.0,51.008460,-114.068754,Carl's Jr,6,3.5,AB


In [157]:
#task 2
def get_reviews_for(rest_id):
    return df_reviewSample.loc[df_reviewSample['business_id']==rest_id]

In [158]:
get_reviews_for('19fdSca3MUoaGFNX2BrjTQ')


Unnamed: 0.1,Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
68133,679434,19fdSca3MUoaGFNX2BrjTQ,0,2018-06-09 14:13:04,0,bDhAERq8suVl40xdk00zKQ,3.0,I was super excited to go here but was slightl...,0,6aklT92xV0_duyUI9l84Pw
80631,804232,19fdSca3MUoaGFNX2BrjTQ,0,2018-05-10 18:09:25,0,TVzDDSiC6qQ2A2mjdwwV8w,5.0,This is exactly what this area needs! You want...,0,96s7b2PBjmkzEeQTzmKp7w
81639,814150,19fdSca3MUoaGFNX2BrjTQ,1,2018-01-13 15:48:11,0,MWyp6vwBVbReDbJAYQUCpw,5.0,I happened upon this place randomly and just H...,1,uBHRgjD2xy77JCVm7CwmOA
100279,1000397,19fdSca3MUoaGFNX2BrjTQ,0,2018-08-30 01:30:04,0,FgHiEau2OaB2sDkw4p15Dw,5.0,"Amazing! Quality ingredients, all sauces are h...",1,wlTd-6M1pkO7zs5CgopQOQ
100896,1006603,19fdSca3MUoaGFNX2BrjTQ,0,2018-07-12 19:27:38,0,3HhyFikgJUW-SXWnp-dUtQ,5.0,Incredible. I stopped in today and I'm blown a...,0,oOB27OMd4k5oPTayEPClag
120537,1201834,19fdSca3MUoaGFNX2BrjTQ,0,2018-01-30 19:27:07,0,tmT-drziC2y64jfXH3HTow,5.0,I just stopped in today as I was on my way bac...,0,YYKuS5-8NGiAPNMwX9nO7g
124197,1237683,19fdSca3MUoaGFNX2BrjTQ,0,2018-04-13 02:29:18,0,rWufRpJ9HWivqaa_5EW6uA,5.0,Stopped in hereafter a workout session in the ...,1,7GIcGERUfVvOx_TNYomGcA
131198,1308059,19fdSca3MUoaGFNX2BrjTQ,1,2018-01-06 21:55:16,0,NOe4hvLsJqHHkwE8brfrPQ,5.0,Not one negative thing to say! Service was qui...,3,pz69IIgBrek4B__VV1-HIA


In [163]:
#task 3
def get_recommendations_for(user_id = None, business_id = None):
    if user_id is not None:
        bids = closest_businesses_to(user = user_id)
    else:
        bids = closest_businesses_to(business = business_id)
    bnames = [bus_invmap[b] for b in bids]
    return restaurants_df.set_index('business_id').loc[bnames]

In [165]:
get_recommendations_for(business_id = 'a7mTbEi2N8Zd-r-8jlReww')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0_level_0,categories,city,is_open,latitude,longitude,name,review_count,stars,state
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
a7mTbEi2N8Zd-r-8jlReww,"Japanese, Korean, Restaurants, Barbeque, Fast ...",Toronto,0.0,43.775210,-79.257088,Koryo Korean Barbeque,7.0,3.5,ON
U5dh8ojrtLAqe9zSKmGkjA,,,,,,,,,
2LOpxmxw3N4crrNgfsHSjQ,,,,,,,,,
srrPo0aEa8AI8JkuRCBPHA,,,,,,,,,
E2PUvccjQc8fmmPQ-bFg_w,,,,,,,,,
6G2KEkAgkAgSpIb4U8DgNw,,,,,,,,,
Mr7gYkqLDqolt-NAMIwv_g,,,,,,,,,
XZeq9huhB9gdvSKtQ8rWmw,,,,,,,,,
X7_fyzTdFgUvG5GrZKOCxw,,,,,,,,,
3d8q85yG4Pqq_aqNamW_Iw,,,,,,,,,


In [166]:
get_recommendations_for(user_id= '96s7b2PBjmkzEeQTzmKp7w')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0_level_0,categories,city,is_open,latitude,longitude,name,review_count,stars,state
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
t53w4JlkxTau-G-TzhSWog,,,,,,,,,
PQuPLNkdc6_iSBqlBoxWMQ,,,,,,,,,
Ufa7MJ3yDDIMFkYctnhT8Q,"Mexican, Restaurants, Food",Las Vegas,0.0,36.239527,-115.226293,Qdoba Mexican Grill,7.0,3.0,NV
EXlr1HjgQ7pa8zhjCL_E8w,,,,,,,,,
i7lFu1-iadoXW5Hn-JWaeg,,,,,,,,,
1j8Ic7I5E9ztBe1lMZ_06w,"Restaurants, Sandwiches, Fast Food",Toronto,1.0,43.663407,-79.418557,Subway,5.0,2.5,ON
6qa_KEqzJ9rqm70qqRly0w,,,,,,,,,
6d2hdz6IuFRDYJkYo9vR0g,,,,,,,,,
LnXWM2it80DhwejXr6CDjw,,,,,,,,,
fin8bCrfZ7aCB1pGesvDmw,,,,,,,,,


### Building Geo Table 

In [172]:
restaurantsGeo_df = restaurants_df.drop(['is_open', 'review_count'], 1)

In [173]:
restaurantsGeo_df.head()

Unnamed: 0,business_id,categories,city,latitude,longitude,name,stars,state
1637139,QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,43.605499,-79.652289,Emerald Chinese Restaurant,2.5,ON
1637149,1Dfx3zM-rW4n-31KeC8sJg,"Restaurants, Breakfast & Brunch, Mexican, Taco...",Phoenix,33.495194,-112.028588,Taco Bell,3.0,AZ
1637161,1RHY4K3BD22FK7Cfftn8Mg,"Sandwiches, Salad, Restaurants, Burgers, Comfo...",Pittsburgh,40.496177,-80.246011,Marathon Diner,4.0,PA
1637163,tstimHoMcYbkSC4eBA1wEg,"Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,36.195615,-115.040529,Maria's Mexican Restaurant & Bakery,4.5,NV
1637167,NDuUMJfrWk52RA-H-OtrpA,"Juice Bars & Smoothies, Food, Restaurants, Fas...",Toronto,43.642889,-79.425429,Bolt Fresh Bar,3.0,ON
