In [2]:
import time
import json
import re
import os 
import tensorflow as tf
import pandas as pd
import numpy as np
from itertools import compress
import matplotlib.pyplot as plt


In [3]:
def json_to_csv(directory, fileNames, createSample=False):
    """
    json_to_csv: loops through specified JSON files and converts them to csv files.
                 option to also create a sample csv, which uses np.random.seed 9001 to create a sample dataset with 10% of the observations
    
                 pandas has a read_json function, but returns a 'Trailing data error' when working with these specific files
                 
                 
    Inputs: -directory of JSON files
            -list of JSON filenames
            -createSample flag
            
    """
    
    start = time.time()

    jsonData = []

    for fileName in fileNames:
        with open(directory + fileName,  encoding="utf8") as file:
            print('{0} opened'.format(fileName))
            for line in file:
                #I use an rstrip here because some of the files have trailing blank spaces
                jsonData.append(json.loads(line.rstrip()))
        
        df = pd.DataFrame.from_dict(jsonData)
        
        csvFileName = fileName[:len(fileName)-5] + '.csv'
        
        df.to_csv(directory + csvFileName)
        print('{0} created'.format(csvFileName))
        
        
        if createSample:
            np.random.seed(9001)
            msk = np.random.rand(len(df)) <= 0.1
            sample = df[msk]
            
            csvSampleFileName = fileName[:len(fileName)-5] + '_sample.csv'
            
            sample.to_csv(directory + csvSampleFileName)
            print('{0} created'.format(csvSampleFileName))
        
    print('This function took {} minutes to run'.format((time.time()-start)/60))
    

In [4]:
df_business = pd.read_json('data/business.json', lines=True)

# df_business.dropna(inplace=True, subset = ['categories'], axis=0)

# df_business.loc[df_business['categories'].str.contains('Restaurants')]

# df_business['categories'].value_counts()

In [5]:
df_business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


In [6]:
#df_user = pd.read_csv('data/user.csv', nrows = 100)

# df_review =  pd.read_csv('data/review.csv', usecols = ['business_id', 'user_id', 'stars'])
# user_cnts = df_review['user_id'].value_counts()
# top_users = user_cnts.loc[user_cnts>2].index
# df_review = df_review.loc[df_review['user_id'].isin(top_users)]
# df_review.to_csv('data/filtered_reviews.csv')

In [7]:
# reading filterdreview 

df_review = pd.read_csv('data/filtered_reviews.csv', index_col=0)

  mask |= (ar1 == a)


In [8]:
# inumerating business_in and user_id with bid and uid 
def build_fmap_invmap(ser):
    uni_ele = ser.unique()
    fmap = {v:i for i, v in enumerate(uni_ele)}
    invmap = {i:v for i, v in enumerate(uni_ele)}
    return fmap, invmap

In [9]:
# setting debuging enviroment on (dbg =1) to turn it off (dbg = 0)
dbg = 1
if dbg:
    df_review = df_review.head(100000)

In [10]:
bus_fmap, bus_invmap = build_fmap_invmap(df_review['business_id'])
u_fmap, u_invmap = build_fmap_invmap(df_review['user_id'])

In [11]:
df_review['bid'] = df_review['business_id'].map(bus_fmap)

In [12]:
df_review['uid'] = df_review['user_id'].map(u_fmap)

In [13]:
df_review.head()

Unnamed: 0,business_id,stars,user_id,bid,uid
0,ujmEBvifdJM6h6RLv4wQIg,1.0,hG7b0MtEbXx5QzbzE6C_VA,0,0
2,WTqjgwHlXbSFevF32_DJVw,5.0,n6-Gk65cPZL6Uz8qRm3NYw,1,1
3,ikCg8xy5JIg_NGPx-MSIDA,5.0,dacAIZ6fTM6mqwW5uxkskg,2,2
6,3fw2X5bZYeW9xCz_zGhOHg,3.0,jlu4CztcSxrKx56ba1a5AQ,3,3
7,zvO-PJCpNk4fgAVUnExYAA,1.0,d6xvYpyzcfbF_AZ8vMB7QA,4,4


In [14]:
n_users, n_bus = df_review['uid'].nunique(), df_review['bid'].nunique()

In [15]:
n_dim = 5

In [16]:
# Initializing tensor flow at a randon number | n_users * n_dim (layers) initializing at some random number between
# -1 to 1 both for business and users. For internal layer. 
# PS > Create a function with code below: 

user_vector_raw = tf.Variable(tf.random_uniform([n_users, n_dim], minval = -1., maxval = 1.))
bus_vector_raw = tf.Variable(tf.random_uniform([n_bus, n_dim], minval = -1., maxval = 1.))

# running the tanh function to find 
user_vector = tf.tanh(user_vector_raw)
bus_vector = tf.tanh(bus_vector_raw)

# Stipulating the imput layer. 
users = tf.placeholder(tf.int32, shape=(None))
businesses = tf.placeholder(tf.int32, shape=(None))
ratings = tf.placeholder(tf.float32, shape=(None))

UserSampled = tf.nn.embedding_lookup(user_vector, users)
BusinessSampled = tf.nn.embedding_lookup(bus_vector, businesses)
UserSampled.set_shape([None, n_dim])
BusinessSampled.set_shape([None, n_dim])

# input tensors for products, users, ratings

Instructions for updating:
Colocations handled automatically by placer.


In [17]:
# Defining the output
# transfer into a fucntion
estimatedaffinitiesraw = tf.reduce_sum(UserSampled * BusinessSampled, 1)
estimatedaffinities = tf.sigmoid(estimatedaffinitiesraw)*5

In [18]:
# estimatedaffinities - ratings ask Lee to clarify ratings, where that ratings comes from? ask to explain the loss function 
# transfer into a function 
loss = tf.reduce_sum(tf.square(estimatedaffinities - ratings))
opt = tf.train.RMSPropOptimizer(learning_rate=.1).minimize(loss)

Instructions for updating:
Use tf.cast instead.


In [19]:
# Setting the session and intialize it 

sess = tf.Session()


In [20]:
# picking up 64 randon rows in order to run under memory capacity
rows = np.random.choice(df_review.shape[0], 64)

In [21]:
sess.run(tf.global_variables_initializer())

In [22]:
# Creating a loop to train under 64 random rows 
for i in range(10000):
    rows = np.random.choice(df_review.shape[0], 64)
    dfrows = df_review.iloc[rows]
    fd = {users:dfrows['uid'].values,
         businesses:dfrows['bid'].values,
         ratings:dfrows['stars'].values}
    _, l2loss = sess.run([opt, loss], fd)
    if i % 1000 == 0:
        print(l2loss)

228.34375
196.62096
129.46735
127.56539
62.1024
92.022675
63.691776
67.92952
44.007023
30.19474


In [23]:
user_values, bus_values = sess.run([user_vector, bus_vector])

In [24]:
bus_vec_df = pd.DataFrame(data = bus_values, index = 
                          [bus_invmap[i] for i in range(n_bus)])



In [25]:
bus_vec_df

Unnamed: 0,0,1,2,3,4
ujmEBvifdJM6h6RLv4wQIg,0.999999,0.939678,0.999999,0.999996,0.421627
WTqjgwHlXbSFevF32_DJVw,-0.846114,-0.983510,0.999340,-0.999994,-0.998755
ikCg8xy5JIg_NGPx-MSIDA,0.991453,0.998031,0.820526,-0.996142,-0.725511
3fw2X5bZYeW9xCz_zGhOHg,-0.060154,-0.462281,-0.474131,-0.891558,0.803159
zvO-PJCpNk4fgAVUnExYAA,0.623059,0.518949,-0.653268,0.720762,0.564523
b2jN2mm9Wf3RcrZCgfo1cg,0.974992,-0.713141,-0.999624,-0.996452,-0.999995
oxwGyA17NL6c5t1Etg5WgQ,0.999998,0.800680,0.283163,0.624772,-0.207765
8mIrX_LrOnAqWsB5JrOojQ,-1.000000,-1.000000,1.000000,-0.163437,-1.000000
FxLfqxdYPA6Z85PFKaqLrg,-0.051394,0.675002,0.243848,0.584288,-0.356887
AakkkTuGZA2KBodKi2_u8A,0.961140,0.331234,0.961127,-0.429804,0.735228


In [26]:
# Joining df_business + bus_vec_df
df_all = df_business.join(bus_vec_df, on='business_id', how='right')


In [27]:
# Dropping NaN columns 
df_all = df_all.dropna()

In [28]:
# Pulling user ID 4 and comparing to inverse map on uid (not sure why, maybe to check accuracy?)
uid = 4
u_invmap[uid]  

'd6xvYpyzcfbF_AZ8vMB7QA'

In [29]:
bname = 'cHdJXLlKNWixBXpDwEGb_A'
bid = bus_fmap[bname]

In [30]:
df_all.loc[df_all['categories'].str.contains('Restaurant') & 
           df_all['categories'].str.contains('Japanese')]

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,0,1,2,3,4
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,-0.999814,0.999504,0.997142,0.999139,-0.999999
73,436 Market St,"{'OutdoorSeating': 'False', 'HasTV': 'True', '...",v-scZMU6jhnmV955RSzGJw,"Japanese, Sushi Bars, Restaurants",Pittsburgh,"{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...",1,40.441062,-80.002126,No. 1 Sushi Sushi,15222,106,4.5,PA,-0.999631,-0.999927,0.999002,-0.999869,0.999999
80,529-17th Avenue SW,"{'WiFi': ''free'', 'GoodForDancing': 'False', ...",PkDghu4aan2_wxrhXjTEgg,"Nightlife, Italian, Restaurants, Japanese, Lou...",Calgary,"{'Tuesday': '14:0-23:0', 'Wednesday': '14:0-23...",0,51.037778,-114.073351,MiraKuru,T2S 0A9,16,3.5,AB,0.283822,0.257769,0.384145,0.186970,0.725683
185,2945 Lake Shore Boulevard,"{'RestaurantsDelivery': 'True', 'HasTV': 'True...",SJBzyJDCR_f6dx5tpYAABA,"Sushi Bars, Japanese, Restaurants",Toronto,"{'Monday': '16:0-22:0', 'Tuesday': '11:30-22:0...",1,43.600523,-79.505516,Kibo Sushi House,M8V 1J5,15,4.0,ON,-0.980898,0.999363,-0.998411,-0.997496,0.983104
343,10624 S Eastern Ave,"{'RestaurantsTakeOut': 'True', 'NoiseLevel': '...",jX9DocoiY4Bo9EUkaTSqvg,"Restaurants, Hawaiian, Chinese, Japanese, Poke...",Henderson,"{'Monday': '0:0-0:0', 'Tuesday': '11:0-19:0', ...",1,35.998220,-115.102246,China AAA,89052,149,4.5,NV,-0.999990,-0.999936,0.995477,-0.999978,-0.999998
346,1909 E Ray Rd,"{'RestaurantsDelivery': 'False', 'RestaurantsT...",ecJri9ozyke4dOCWulZiRQ,"Asian Fusion, Japanese, Restaurants, Ramen, Ta...",Chandler,"{'Monday': '17:0-21:30', 'Tuesday': '17:0-21:3...",1,33.320006,-111.809675,Nishikawa Ramen,85225,427,4.0,AZ,0.999996,0.670560,0.999998,0.999999,0.999987
444,3339 Boulevard des Sources,"{'RestaurantsDelivery': 'False', 'RestaurantsA...",4B8VnRAstRRshxiUzm9yPw,"Restaurants, Sushi Bars, Japanese",Dollard-des-Ormeaux,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-23:0'...",1,45.484315,-73.795652,Maiko Sushi - DDO,H9B 1Z8,51,4.0,QC,0.999537,-0.984982,0.999888,0.999951,0.999967
507,"6989 N Hayden Rd, Ste A12","{'RestaurantsAttire': ''casual'', 'OutdoorSeat...",QS3QxI7u5PRdtbGgI0-UsA,"Asian Fusion, Restaurants, Seafood, Sushi Bars...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '17:30-21:0',...",1,33.538029,-111.905676,Sakana Sushi & Grill,85250,347,4.0,AZ,-0.552663,-0.999998,-0.999890,0.999872,0.999999
524,338-8338 18 Street SE,"{'GoodForMeal': '{'dessert': False, 'latenight...",bPBZEDuHbE-I7bxUWIYMhQ,"Japanese, Restaurants, Sushi Bars",Calgary,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,50.977178,-114.013257,Oishii Sushi,T2C 4E4,78,3.5,AB,-0.416461,0.231266,0.992997,-0.774893,0.994082
577,1418 Rue Cartier,"{'RestaurantsPriceRange2': '3', 'OutdoorSeatin...",9ELnhtgMF8_h8Zky4A7BSA,"Restaurants, Japanese",Montréal,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",0,45.523226,-73.551979,Saiken Sushi,H2K 4C8,4,3.0,QC,0.358013,-0.892293,-0.872533,-0.730438,0.213198


### Testing (Passing train) 


In [31]:
bid

12

In [32]:
chipotle = bus_values[bid]

In [33]:
chipotle

array([ 0.99997985, -1.        ,  0.78260916,  0.30094504,  0.99999946],
      dtype=float32)

In [34]:
japaneselover = user_values[uid]

In [35]:
bus_values

array([[ 0.9999991 ,  0.9396781 ,  0.999999  ,  0.99999636,  0.42162743],
       [-0.84611416, -0.9835101 ,  0.9993398 , -0.99999446, -0.99875456],
       [ 0.99145293,  0.9980312 ,  0.8205256 , -0.99614197, -0.72551084],
       ...,
       [ 0.82006025, -0.57698023, -0.5382508 ,  0.7410189 , -0.57418895],
       [-0.9881709 ,  0.9808874 , -0.7451318 , -0.9312196 ,  0.18285851],
       [ 0.8986607 , -0.506382  ,  0.33877787, -0.63493055, -0.65729547]],
      dtype=float32)

In [36]:
np.square(bus_values - japaneselover[None,:]).sum(1).argsort()

array([ 8157, 11393,  1223, ...,  1690,  3838,  1927])

In [37]:
np.square(bus_values - chipotle[None,:]).sum(1).argsort()

array([  12, 5142,   99, ...,  542, 4045, 1969])

In [38]:
def closest_businesses_to(business = None, user = None, df = None):
    if business is not None:
        target = bus_values[bus_fmap[business]]
    if user is not None:
        target = user_values[u_fmap[user]]
    if df is None:
        df = bus_values
    best_restaurants = np.square(df - target[None,:]).sum(1).argsort()
    return best_restaurants

In [39]:
midtown_japanese_restaurants = bus_values[:30,:]

In [40]:
closest_businesses_to(business = 'cHdJXLlKNWixBXpDwEGb_A')


array([  12, 5142,   99, ...,  542, 4045, 1969])

In [41]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ')

array([11129,  8000,  8024, ...,  2192,  2036,  2638])

In [42]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ', df = midtown_japanese_restaurants)

array([ 0,  6,  9,  4,  8, 24, 29, 17, 22, 27, 12, 26,  2, 14, 16, 23, 10,
       20, 18,  3, 15, 28, 21, 11, 13,  7,  5,  1, 19, 25])

In [None]:
#df_user = pd.read_csv('data/user.csv', nrows = 100)

# df_review =  pd.read_csv('data/review.csv', usecols = ['business_id', 'user_id', 'stars'])
# user_cnts = df_review['user_id'].value_counts()
# top_users = user_cnts.loc[user_cnts>2].index
# df_review = df_review.loc[df_review['user_id'].isin(top_users)]
# df_review.to_csv('data/filtered_reviews.csv')

In [43]:
df_reviewSample = pd.read_csv('data/review_sample.csv')

In [44]:
df_reviewSample.head()

Unnamed: 0.1,Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,0,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,Q1sbwvVQXV2734tPgoKj4Q,1.0,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
1,13,LUN6swQYa4xJKaM_UEUOEw,0,2018-04-27 20:25:26,0,qlXw1JQ0UodW7qrmVgwCXw,4.0,Michael from Red Carpet VIP is amazing ! I rea...,0,bAhqAPoWaZYcyYi7bs024Q
2,48,OVTZNSkSfbl3gVB9XQIJfw,0,2010-01-08 04:28:23,0,u1jQX0KfnG3AHty4ifEpFA,4.0,"notes: 1 visit, breakfast/lunch\noverall: Exce...",0,8vIK6ndl8yzIdmSDnGp0tw
3,62,SU56w479vUfFHsvmvQIf7A,6,2016-07-25 03:55:20,5,E4LqIZ7DJd_R4ZHSNKx4RQ,4.0,So good! They didn't make it to 5 stars due to...,7,DoRCeCcJbrsM2BiAKj3trA
4,84,sk0stgY4NDJYOX1MbNJ3Pg,0,2018-09-25 03:58:48,0,SneRds-D0MCMWbIitd0AHg,1.0,At least the Pinks concession stand by Section...,0,WThN-4czFfMs1vYhvaf_9A


In [45]:
df_reviewSample.count()

Unnamed: 0     668330
business_id    668330
cool           668330
date           668330
funny          668330
review_id      668330
stars          668330
text           668330
useful         668330
user_id        668330
dtype: int64

In [46]:
df_userSample = pd.read_csv('data/user_sample.csv')

In [47]:
df_userSample.head()

Unnamed: 0.1,Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,...,cool,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since
0,0,4.03,1,0,1,2,0,0,1,0,...,25,201520162017,5,"c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g...",17,Rashmi,95,84,l6BmjZMeQD3rDxWUbiAiow,2013-10-08 23:11:33
1,13,3.95,4,0,4,0,0,1,2,1,...,12,,0,"_5p_nO7OczVP7czj_7T-RA, vux89VHowK-AlTpw_0J-wQ...",17,Emily,22,31,US0VOSMFs9U0Nkl5Vx1EzA,2010-01-23 04:55:04
2,48,3.66,822,9,822,480,5,46,419,338,...,9594,201320142015201620172018,231,"SxSeTYznS0YKSISgs5hj_Q, VtnCudEFLBjnxblLUAPYhA...",6997,Regina,1053,11769,d7D4dYzF6THtOx9imf-wPw,2008-12-05 07:16:22
3,62,4.44,0,0,0,0,0,0,0,0,...,1,,2,"xbcwzWSFu_OaYVPHrEV4IA, 1vv8nEe5Sxj1EVwKF6vbrA...",3,Lindsay,16,9,Z_ZkQgFtL2skSyZ9_9NUaQ,2015-10-26 01:06:15
4,84,3.22,0,0,0,0,0,0,0,0,...,4,,0,"ve7sBZHI7y6buFY3KzQ9sA, UAzP14mLa8GkWzVUMoL5hg...",5,Priscilla,16,34,on3cA28Qu-Eh50IuS2iq6w,2013-07-25 02:48:09


In [49]:
def find_ftres_with_nan(df):
    all_nan = df.columns[df.isnull().all()].tolist()
    some_nan = df.columns[df.isnull().any()].tolist()
    print("All NaN Features: ", len(all_nan), all_nan, "Some NaN Features: ", len(some_nan), some_nan)
    return all_nan, some_nan


In [51]:
df_businessSample = pd.read_csv('data/business_sample.csv',encoding = "ISO-8859-1",index_col=0)
all_nan, some_nan = find_ftres_with_nan(business)


All NaN Features:  0 [] Some NaN Features:  33 ['address', 'attributes', 'average_stars', 'business_id', 'categories', 'city', 'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot', 'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos', 'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool', 'elite', 'fans', 'friends', 'funny', 'hours', 'is_open', 'latitude', 'longitude', 'name', 'postal_code', 'stars', 'state', 'useful', 'user_id', 'yelping_since']


In [53]:
# Number of businesses that have both "food" and "restaurant" in their category:

# create a mask for restaurants
mask_restaurants = df_businessSample['categories'].str.contains('Restaurants')

# create a mask for food
mask_food = df_businessSample['categories'].str.contains('Food')

# apply both masks
restaurants_and_food = df_businessSample[mask_restaurants & mask_food]

# number of businesses that have food and restaurant in their category
restaurants_and_food['categories'].count()


2141

In [76]:
df_reviewSample.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,ujmEBvifdJM6h6RLv4wQIg,0,2013-05-07 04:34:36,1,Q1sbwvVQXV2734tPgoKj4Q,1.0,Total bill for this horrible service? Over $8G...,6,hG7b0MtEbXx5QzbzE6C_VA
13,LUN6swQYa4xJKaM_UEUOEw,0,2018-04-27 20:25:26,0,qlXw1JQ0UodW7qrmVgwCXw,4.0,Michael from Red Carpet VIP is amazing ! I rea...,0,bAhqAPoWaZYcyYi7bs024Q
48,OVTZNSkSfbl3gVB9XQIJfw,0,2010-01-08 04:28:23,0,u1jQX0KfnG3AHty4ifEpFA,4.0,"notes: 1 visit, breakfast/lunch\noverall: Exce...",0,8vIK6ndl8yzIdmSDnGp0tw
62,SU56w479vUfFHsvmvQIf7A,6,2016-07-25 03:55:20,5,E4LqIZ7DJd_R4ZHSNKx4RQ,4.0,So good! They didn't make it to 5 stars due to...,7,DoRCeCcJbrsM2BiAKj3trA
84,sk0stgY4NDJYOX1MbNJ3Pg,0,2018-09-25 03:58:48,0,SneRds-D0MCMWbIitd0AHg,1.0,At least the Pinks concession stand by Section...,0,WThN-4czFfMs1vYhvaf_9A


In [75]:
df_userSample.head()

Unnamed: 0.1,Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,...,cool,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since
0,0,4.03,1,0,1,2,0,0,1,0,...,25,201520162017,5,"c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g...",17,Rashmi,95,84,l6BmjZMeQD3rDxWUbiAiow,2013-10-08 23:11:33
1,13,3.95,4,0,4,0,0,1,2,1,...,12,,0,"_5p_nO7OczVP7czj_7T-RA, vux89VHowK-AlTpw_0J-wQ...",17,Emily,22,31,US0VOSMFs9U0Nkl5Vx1EzA,2010-01-23 04:55:04
2,48,3.66,822,9,822,480,5,46,419,338,...,9594,201320142015201620172018,231,"SxSeTYznS0YKSISgs5hj_Q, VtnCudEFLBjnxblLUAPYhA...",6997,Regina,1053,11769,d7D4dYzF6THtOx9imf-wPw,2008-12-05 07:16:22
3,62,4.44,0,0,0,0,0,0,0,0,...,1,,2,"xbcwzWSFu_OaYVPHrEV4IA, 1vv8nEe5Sxj1EVwKF6vbrA...",3,Lindsay,16,9,Z_ZkQgFtL2skSyZ9_9NUaQ,2015-10-26 01:06:15
4,84,3.22,0,0,0,0,0,0,0,0,...,4,,0,"ve7sBZHI7y6buFY3KzQ9sA, UAzP14mLa8GkWzVUMoL5hg...",5,Priscilla,16,34,on3cA28Qu-Eh50IuS2iq6w,2013-07-25 02:48:09


In [78]:
#task 1
def get_restaurants(keyword):
    return restaurants_df.loc[restaurants_df['categories'].str.contains(keyword)]

In [95]:

get_restaurants('Japanese')

Unnamed: 0,address,attributes,average_stars,business_id,categories,city,compliment_cool,compliment_cute,compliment_funny,compliment_hot,...,latitude,longitude,name,postal_code,review_count,stars,state,useful,user_id,yelping_since
1638819,"4215 Spring Mountain Rd, Ste B106A",,,bZs7FJ7fHIJxVecfp1qvug,"Ramen, Restaurants, Japanese, Tapas/Small Plat...",Las Vegas,,,,,...,36.125381,-115.19598,Takopa,89102,51,4.0,NV,,,
1639220,300 Borough Drive,"{'BusinessParking': ""{'garage': False, 'street...",,a7mTbEi2N8Zd-r-8jlReww,"Japanese, Korean, Restaurants, Barbeque, Fast ...",Toronto,,,,,...,43.77521,-79.257088,Koryo Korean Barbeque,M1P 4P5,7,3.5,ON,,,
1645072,5209 Boulevard DÃÂ©carie,"{'WheelchairAccessible': 'None', 'BusinessPark...",,5dzXrZr7MyTB92_MSqV3Lw,"Hawaiian, Restaurants, Japanese, Poke, Food",MontrÃÂ©al,,,,,...,45.484839,-73.630869,Le Poke Shop,H3W 3C2,7,3.5,QC,,,
1648686,1101 Rutherford Road,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",,QIn3nMX3ywY8a2bCpyQcRA,"Food, Japanese, Restaurants, Seafood Markets, ...",Thornhill,,,,,...,43.84486,-79.470476,Aroowha Sushi & Sake Bar,L4J 0E2,90,4.0,ON,,,
1654446,5904 Bryant St,"{'BusinessParking': ""{'garage': False, 'street...",,GDXftPS1_fTJUtZkaEUDhA,"Restaurants, Juice Bars & Smoothies, Food, Jap...",Pittsburgh,,,,,...,40.475584,-79.919675,Plate & Bowl,15206,22,4.0,PA,,,
1661360,7510 Pineville Matthews Rd,"{'GoodForMeal': ""{'dessert': False, 'latenight...",,19fdSca3MUoaGFNX2BrjTQ,"Food, Japanese, Restaurants, Sushi Bars, Poke",Charlotte,,,,,...,35.088049,-80.845964,Umami PokÃÂ©Rito,28226,115,4.5,NC,,,
1677832,2900 Markham Road,"{'RestaurantsReservations': 'True', 'Restauran...",,fB-v-caPf0o3eUkp_2ua2Q,"Restaurants, Food, Seafood Markets, Specialty ...",Toronto,,,,,...,43.82465,-79.24908,Sakana-Ya,M1X 1E6,5,3.5,ON,,,
1684215,417 Rue Saint-Pierre,"{'GoodForMeal': ""{'dessert': False, 'latenight...",,OSXpi4u9RZfWq1FZt5qG0w,"Coffee & Tea, Restaurants, Japanese, Cafes, Food",MontrÃÂ©al,,,,,...,45.501661,-73.556605,Flyjin,H2Y 2M3,55,3.5,QC,,,
1684610,"7780 S Jones Blvd, Ste 103","{'HasTV': 'False', 'RestaurantsReservations': ...",,eR7Iw_X54m3YLWkXp9y35Q,"Hawaiian, Specialty Food, Japanese, Ethnic Foo...",Las Vegas,,,,,...,36.047985,-115.224432,Tonkatsu Kiyoshi,89139,536,4.5,NV,,,
1693914,300 John Street,"{'RestaurantsReservations': 'False', 'Restaura...",,lu7vtrp_bE9PnxWfA8g4Pg,"Restaurants, Fast Food, Food Court, Japanese",Thornhill,,,,,...,43.820492,-79.398466,Banzai Sushi,L3T 5W4,6,4.5,ON,,,


In [96]:
#task 2
def get_reviews_for(rest_id):
    return df_reviewSample.loc[df_reviewSample['business_id']==rest_id]

In [97]:
get_reviews_for('19fdSca3MUoaGFNX2BrjTQ')


Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
679434,19fdSca3MUoaGFNX2BrjTQ,0,2018-06-09 14:13:04,0,bDhAERq8suVl40xdk00zKQ,3.0,I was super excited to go here but was slightl...,0,6aklT92xV0_duyUI9l84Pw
804232,19fdSca3MUoaGFNX2BrjTQ,0,2018-05-10 18:09:25,0,TVzDDSiC6qQ2A2mjdwwV8w,5.0,This is exactly what this area needs! You want...,0,96s7b2PBjmkzEeQTzmKp7w
814150,19fdSca3MUoaGFNX2BrjTQ,1,2018-01-13 15:48:11,0,MWyp6vwBVbReDbJAYQUCpw,5.0,I happened upon this place randomly and just H...,1,uBHRgjD2xy77JCVm7CwmOA
1000397,19fdSca3MUoaGFNX2BrjTQ,0,2018-08-30 01:30:04,0,FgHiEau2OaB2sDkw4p15Dw,5.0,"Amazing! Quality ingredients, all sauces are h...",1,wlTd-6M1pkO7zs5CgopQOQ
1006603,19fdSca3MUoaGFNX2BrjTQ,0,2018-07-12 19:27:38,0,3HhyFikgJUW-SXWnp-dUtQ,5.0,Incredible. I stopped in today and I'm blown a...,0,oOB27OMd4k5oPTayEPClag
1201834,19fdSca3MUoaGFNX2BrjTQ,0,2018-01-30 19:27:07,0,tmT-drziC2y64jfXH3HTow,5.0,I just stopped in today as I was on my way bac...,0,YYKuS5-8NGiAPNMwX9nO7g
1237683,19fdSca3MUoaGFNX2BrjTQ,0,2018-04-13 02:29:18,0,rWufRpJ9HWivqaa_5EW6uA,5.0,Stopped in hereafter a workout session in the ...,1,7GIcGERUfVvOx_TNYomGcA
1308059,19fdSca3MUoaGFNX2BrjTQ,1,2018-01-06 21:55:16,0,NOe4hvLsJqHHkwE8brfrPQ,5.0,Not one negative thing to say! Service was qui...,3,pz69IIgBrek4B__VV1-HIA


In [83]:
#task fun random crap:
#make word cloud for a user

In [None]:
#task 3
def get_recommendations_for(user_id, latlong or location):
    bids = closest_businesses_to(user = user_id)
    bnames = [bus_invmap[b] for b in bids]
    return restaurants_df.set_index('business_id').loc[bnames]

In [None]:
#dropna needs to occur way above

In [110]:
#task 3
def get_recommendations_for(user_id = None, bus_id = None):
    if user_id is not None:
        bids = closest_businesses_to(user = user_id)
    else:
        bids = closest_businesses_to(business = bus_id)
    bnames = [bus_invmap[b] for b in bids]
    return restaurants_df.set_index('business_id').loc[bnames]

In [117]:
get_recommendations_for(bus_id = 'a7mTbEi2N8Zd-r-8jlReww')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0_level_0,address,attributes,average_stars,categories,city,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,...,latitude,longitude,name,postal_code,review_count,stars,state,useful,user_id,yelping_since
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a7mTbEi2N8Zd-r-8jlReww,300 Borough Drive,"{'BusinessParking': ""{'garage': False, 'street...",,"Japanese, Korean, Restaurants, Barbeque, Fast ...",Toronto,,,,,,...,43.77521,-79.257088,Koryo Korean Barbeque,M1P 4P5,7.0,3.5,ON,,,
_NO1sGYVk4lIib9a2DCtdw,,,,,,,,,,,...,,,,,,,,,,
Rmj0NYF_Id_2rZzocx-Vyw,,,,,,,,,,,...,,,,,,,,,,
Oyo5CWthu2upxxaf69MfIg,,,,,,,,,,,...,,,,,,,,,,
bFlP7uApDamdiVHwX8OwfA,,,,,,,,,,,...,,,,,,,,,,
zcq94VmjgyQ7v3bW_xAfsw,,,,,,,,,,,...,,,,,,,,,,
nSNNl7s0p3hSB3ZLPSVSlA,,,,,,,,,,,...,,,,,,,,,,
ZKJcKRG6fQitU7q_Eq04hg,,,,,,,,,,,...,,,,,,,,,,
IyoklPvMD14zdOxHlX_xHQ,,,,,,,,,,,...,,,,,,,,,,
zWrwVNAvrtVAhcDRWJJMEw,,,,,,,,,,,...,,,,,,,,,,


In [113]:
get_recommendations_for(user_id= '96s7b2PBjmkzEeQTzmKp7w')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0_level_0,address,attributes,average_stars,categories,city,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,...,latitude,longitude,name,postal_code,review_count,stars,state,useful,user_id,yelping_since
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
yWwIUmeenyAO7nwc7U0U5A,,,,,,,,,,,...,,,,,,,,,,
SfP326rcU5sjwGx_dQP2RQ,,,,,,,,,,,...,,,,,,,,,,
DSCbB5lJLKnuxQKadFWrSw,,,,,,,,,,,...,,,,,,,,,,
TMI7DZ7Bng19adXy32E5UQ,10140 Yonge Street,"{'HasTV': 'False', 'BikeParking': 'True', 'Noi...",,"Canadian (New), Restaurants, Diners, Fast Food",Richmond Hill,,,,,,...,43.874637,-79.438233,3 Coins Open Kitchen,L4C 1T6,32.0,4.0,ON,,,
3FB8FiwImxq5eL2GlBF3EQ,,,,,,,,,,,...,,,,,,,,,,
ZUJkwu_NS4L41AXEbGqHBQ,,,,,,,,,,,...,,,,,,,,,,
Be7Mwq06nf1eNLblo1ekow,,,,,,,,,,,...,,,,,,,,,,
lB2bnoWizz44ubSypZcwTA,,,,,,,,,,,...,,,,,,,,,,
Tq_S8F5B2fSeHqLZghOqQg,,,,,,,,,,,...,,,,,,,,,,
2VDzYl-dSl1lqp2Ucxh3cQ,,,,,,,,,,,...,,,,,,,,,,


In [112]:
get_recommendations_for('96s7b2PBjmkzEeQTzmKp7w')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0_level_0,address,attributes,average_stars,categories,city,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,...,latitude,longitude,name,postal_code,review_count,stars,state,useful,user_id,yelping_since
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
yWwIUmeenyAO7nwc7U0U5A,,,,,,,,,,,...,,,,,,,,,,
SfP326rcU5sjwGx_dQP2RQ,,,,,,,,,,,...,,,,,,,,,,
DSCbB5lJLKnuxQKadFWrSw,,,,,,,,,,,...,,,,,,,,,,
TMI7DZ7Bng19adXy32E5UQ,10140 Yonge Street,"{'HasTV': 'False', 'BikeParking': 'True', 'Noi...",,"Canadian (New), Restaurants, Diners, Fast Food",Richmond Hill,,,,,,...,43.874637,-79.438233,3 Coins Open Kitchen,L4C 1T6,32.0,4.0,ON,,,
3FB8FiwImxq5eL2GlBF3EQ,,,,,,,,,,,...,,,,,,,,,,
ZUJkwu_NS4L41AXEbGqHBQ,,,,,,,,,,,...,,,,,,,,,,
Be7Mwq06nf1eNLblo1ekow,,,,,,,,,,,...,,,,,,,,,,
lB2bnoWizz44ubSypZcwTA,,,,,,,,,,,...,,,,,,,,,,
Tq_S8F5B2fSeHqLZghOqQg,,,,,,,,,,,...,,,,,,,,,,
2VDzYl-dSl1lqp2Ucxh3cQ,,,,,,,,,,,...,,,,,,,,,,


In [105]:
# Dropping NaN columns 
df_user96 = df_all.dropna()

In [106]:
df_user96.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,0,1,2,3,4
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON,-0.193446,0.955682,-0.430761,-0.339235,-0.135455
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,-0.999814,0.999504,0.997142,0.999139,-0.999999
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC,0.71132,-0.885359,0.364526,0.424743,-0.867387
7,"4545 E Tropicana Rd Ste 8, Tropicana","{'RestaurantsPriceRange2': '3', 'GoodForKids':...",gbQN7vr_caG_A1ugSmGhWg,"Hair Salons, Hair Stylists, Barbers, Men's Hai...",Las Vegas,"{'Monday': '10:0-19:0', 'Tuesday': '10:0-19:0'...",1,36.099872,-115.074574,Supercuts,89121,3,3.5,NV,-0.046117,0.524608,0.83813,-0.468762,-0.132092
9,21689 Lorain Rd,"{'ByAppointmentOnly': 'False', 'BusinessAccept...",4GBVPIYRvzGh4K4TkRQ_rw,"Beauty & Spas, Nail Salons, Day Spas, Massage",Fairview Park,"{'Tuesday': '9:0-21:0', 'Wednesday': '9:0-21:0...",1,41.440825,-81.854097,Options Salon & Spa,44126,8,4.5,OH,0.302838,0.672846,0.201119,-0.888239,-0.320828


In [109]:
df_user96.count()

address               149
attributes            150
average_stars           0
categories            151
city                  151
compliment_cool         0
compliment_cute         0
compliment_funny        0
compliment_hot          0
compliment_list         0
compliment_more         0
compliment_note         0
compliment_photos       0
compliment_plain        0
compliment_profile      0
compliment_writer       0
cool                    0
elite                   0
fans                    0
friends                 0
funny                   0
hours                 124
is_open               151
latitude              151
longitude             151
name                  151
postal_code           151
review_count          151
stars                 151
state                 151
useful                  0
user_id                 0
yelping_since           0
dtype: int64

In [82]:
get_reviews_for('19fdSca3MUoaGFNX2BrjTQ')

get_restaurants('Japanese')

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
679434,19fdSca3MUoaGFNX2BrjTQ,0,2018-06-09 14:13:04,0,bDhAERq8suVl40xdk00zKQ,3.0,I was super excited to go here but was slightl...,0,6aklT92xV0_duyUI9l84Pw
804232,19fdSca3MUoaGFNX2BrjTQ,0,2018-05-10 18:09:25,0,TVzDDSiC6qQ2A2mjdwwV8w,5.0,This is exactly what this area needs! You want...,0,96s7b2PBjmkzEeQTzmKp7w
814150,19fdSca3MUoaGFNX2BrjTQ,1,2018-01-13 15:48:11,0,MWyp6vwBVbReDbJAYQUCpw,5.0,I happened upon this place randomly and just H...,1,uBHRgjD2xy77JCVm7CwmOA
1000397,19fdSca3MUoaGFNX2BrjTQ,0,2018-08-30 01:30:04,0,FgHiEau2OaB2sDkw4p15Dw,5.0,"Amazing! Quality ingredients, all sauces are h...",1,wlTd-6M1pkO7zs5CgopQOQ
1006603,19fdSca3MUoaGFNX2BrjTQ,0,2018-07-12 19:27:38,0,3HhyFikgJUW-SXWnp-dUtQ,5.0,Incredible. I stopped in today and I'm blown a...,0,oOB27OMd4k5oPTayEPClag
1201834,19fdSca3MUoaGFNX2BrjTQ,0,2018-01-30 19:27:07,0,tmT-drziC2y64jfXH3HTow,5.0,I just stopped in today as I was on my way bac...,0,YYKuS5-8NGiAPNMwX9nO7g
1237683,19fdSca3MUoaGFNX2BrjTQ,0,2018-04-13 02:29:18,0,rWufRpJ9HWivqaa_5EW6uA,5.0,Stopped in hereafter a workout session in the ...,1,7GIcGERUfVvOx_TNYomGcA
1308059,19fdSca3MUoaGFNX2BrjTQ,1,2018-01-06 21:55:16,0,NOe4hvLsJqHHkwE8brfrPQ,5.0,Not one negative thing to say! Service was qui...,3,pz69IIgBrek4B__VV1-HIA


In [74]:
restaurants_df.head()

Unnamed: 0,address,attributes,average_stars,business_id,categories,city,compliment_cool,compliment_cute,compliment_funny,compliment_hot,...,latitude,longitude,name,postal_code,review_count,stars,state,useful,user_id,yelping_since
1637167,1170 Queen Street W,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...",,NDuUMJfrWk52RA-H-OtrpA,"Juice Bars & Smoothies, Food, Restaurants, Fas...",Toronto,,,,,...,43.642889,-79.425429,Bolt Fresh Bar,M6J 1J5,57,3.0,ON,,,
1637311,241 W Charleston Blvd,"{'BusinessParking': ""{'garage': False, 'street...",,v7ZFEEqJBg_r8NCwHRP_2A,"Food, Pizza, Wine Bars, Bars, Restaurants, Nig...",Las Vegas,,,,,...,36.158264,-115.157967,Bistro Divino,89102,3,4.5,NV,,,
1637502,12061 Perry Hwy,"{'RestaurantsTakeOut': 'True', 'GoodForKids': ...",,El6Bum4I-0VWJRgzbtscJQ,"Fast Food, Sandwiches, Restaurants, Food, Food...",Wexford,,,,,...,40.632056,-80.058551,Jimmy John's,15090,11,3.0,PA,,,
1637809,1150 College Street,"{'BusinessParking': ""{'garage': False, 'street...",,4w-q_Wc77JzQZSIehKFTzA,"Juice Bars & Smoothies, Restaurants, Sandwiche...",Toronto,,,,,...,43.652361,-79.433924,Brockton Haunt,M6H 1B6,25,4.0,ON,,,
1638108,541-543 Carnot Rd,"{'BusinessAcceptsCreditCards': 'True', 'GoodFo...",,5G9aIrRiCD4TZCC5j2D39A,"Bagels, Donuts, Coffee & Tea, Breakfast & Brun...",Moon,,,,,...,40.515767,-80.221657,Dunkin' Donuts,15108,8,4.0,PA,,,


In [54]:
# an example row
restaurants_and_food.head(1)['categories'].values

array(['Juice Bars & Smoothies, Food, Restaurants, Fast Food, Vegan'],
      dtype=object)

In [56]:
df_category = restaurants_and_food['categories'].apply(lambda x: x[1:-1].split(',')).apply(pd.Series)
uniqueCategories = pd.DataFrame(df_category.stack().str.strip().unique())

In [57]:
categoriesToRemove = ['Grocery','Drugstores','Convenience Stores','Beauty & Spas','Photography Stores & Services',
                      'Cosmetics & Beauty Supply','Discount Store','Fashion','Department Stores','Gas Stations',
                      'Automotive','Music & Video','Event Planning & Services','Mobile Phones','Health & Medical',
                      'Weight Loss Centers','Home & Garden','Kitchen & Bath','Jewelry',"Children's Clothing",
                      'Accessories','Home Decor','Bus Tours','Auto Glass Services','Auto Detailing',
                      'Oil Change Stations', 'Auto Repair','Body Shops','Car Window Tinting','Car Wash',
                      'Gluten-Free','Fitness & Instruction','Nurseries & Gardening','Wedding Planning',
                      'Embroidery & Crochet','Dance Schools','Performing Arts',
                      'Wholesale Stores','Tobacco Shops','Nutritionists','Hobby Shops','Pet Services',
                      'Electronics','Plumbing','Gyms','Yoga','Walking Tours','Toy Stores','Pet Stores',
                      'Pet Groomers','Vape Shops','Head Shops',
                      'Souvenir Shops','Pharmacy','Appliances & Repair','Wholesalers','Party Equipment Rentals',
                      'Tattoo','Funeral Services & Cemeteries','Sporting Goods','Dog Walkers',
                      'Pet Boarding/Pet Sitting','Scavenger Hunts','Contractors','Trainers', 
                      'Customized Merchandise', 'Dry Cleaning & Laundry', 'Art Galleries'
                      'Tax Law', 'Bankruptcy Law', 'Tax Services', 'Estate Planning Law', 
                      'Business Consulting', 'Lawyers', 'Pet Adoption', 'Escape Games', 
                      'Animal Shelters', 'Commercial Real Estate', 'Real Estate Agents', 
                      'Real Estate Services', 'Home Inspectors']

In [58]:
restaurants_df = restaurants_and_food[~restaurants_and_food['categories'].str.contains('|'.join(categoriesToRemove))]

In [59]:
restaurants_df.to_csv('data/restaurants.csv')
restaurants_df = pd.read_csv('data/restaurants.csv', encoding='ISO-8859-1', index_col=0)
all_nan, some_nan = find_ftres_with_nan(restaurants_df)

All NaN Features:  20 ['average_stars', 'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot', 'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos', 'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool', 'elite', 'fans', 'friends', 'funny', 'useful', 'user_id', 'yelping_since'] Some NaN Features:  24 ['address', 'attributes', 'average_stars', 'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot', 'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos', 'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool', 'elite', 'fans', 'friends', 'funny', 'hours', 'postal_code', 'useful', 'user_id', 'yelping_since']


In [60]:
restaurants_df.head() 

Unnamed: 0,address,attributes,average_stars,business_id,categories,city,compliment_cool,compliment_cute,compliment_funny,compliment_hot,...,latitude,longitude,name,postal_code,review_count,stars,state,useful,user_id,yelping_since
1637167,1170 Queen Street W,"{'WiFi': ""u'no'"", 'BikeParking': 'True', 'Rest...",,NDuUMJfrWk52RA-H-OtrpA,"Juice Bars & Smoothies, Food, Restaurants, Fas...",Toronto,,,,,...,43.642889,-79.425429,Bolt Fresh Bar,M6J 1J5,57,3.0,ON,,,
1637311,241 W Charleston Blvd,"{'BusinessParking': ""{'garage': False, 'street...",,v7ZFEEqJBg_r8NCwHRP_2A,"Food, Pizza, Wine Bars, Bars, Restaurants, Nig...",Las Vegas,,,,,...,36.158264,-115.157967,Bistro Divino,89102,3,4.5,NV,,,
1637502,12061 Perry Hwy,"{'RestaurantsTakeOut': 'True', 'GoodForKids': ...",,El6Bum4I-0VWJRgzbtscJQ,"Fast Food, Sandwiches, Restaurants, Food, Food...",Wexford,,,,,...,40.632056,-80.058551,Jimmy John's,15090,11,3.0,PA,,,
1637809,1150 College Street,"{'BusinessParking': ""{'garage': False, 'street...",,4w-q_Wc77JzQZSIehKFTzA,"Juice Bars & Smoothies, Restaurants, Sandwiche...",Toronto,,,,,...,43.652361,-79.433924,Brockton Haunt,M6H 1B6,25,4.0,ON,,,
1638108,541-543 Carnot Rd,"{'BusinessAcceptsCreditCards': 'True', 'GoodFo...",,5G9aIrRiCD4TZCC5j2D39A,"Bagels, Donuts, Coffee & Tea, Breakfast & Brun...",Moon,,,,,...,40.515767,-80.221657,Dunkin' Donuts,15108,8,4.0,PA,,,


In [69]:
def reduce_review(df, business_list):
    
    #drop columns where business_id or user_id is null
    df.dropna(subset=['business_id','user_id'], how='any',inplace=True)
    
    #restrict to businesses that are restaurants
    df = df[df['business_id'].isin(business_list)]
    
    #only keep user_id's with more than one review
    df = df[df.groupby('user_id').user_id.transform(len) > 1]
    
    #verify this worked by taking the minimum amount of user_id counts
    print('The minimum amount of reviews per user is {}'
          .format(np.min(df.groupby('user_id')['business_id'].count())))
    return df

In [70]:
df_reviewSample = pd.read_csv('data/review_sample.csv',encoding = "ISO-8859-1",index_col=0)
restaurant_reviews = reduce_review(df_reviewSample, restaurants['business_id']) #create dataframe
# _, _ = find_ftres_with_nan(restaurant_reviews) #report if there are null values

# restaurant_reviews['review_date'] = pd.to_datetime(restaurant_reviews['date'])
# restaurant_reviews['review_year'] = restaurant_reviews['review_date'].dt.year
# restaurant_reviews['review_month'] = restaurant_reviews['review_date'].dt.month
# restaurant_reviews['review_weekday'] = restaurant_reviews['review_date'].dt.weekday_name

# rename_cols = {'cool': 'review_cool','funny':'review_funny','stars':'review_stars','useful':'review_useful'}
# restaurant_reviews.rename(columns=rename_cols, inplace=True)
# review_cols_to_drop = ['text', 'review_date', 'date']
# restaurant_reviews.drop(review_cols_to_drop, axis=1, inplace=True)

NameError: name 'restaurants' is not defined

In [65]:
df_reviewSample.count()

business_id    668330
cool           668330
date           668330
funny          668330
review_id      668330
stars          668330
text           668330
useful         668330
user_id        668330
dtype: int64

In [71]:
restaurants.count()

NameError: name 'restaurants' is not defined