In [2]:
import time
import json
import re
import os 
import tensorflow as tf
import pandas as pd
import numpy as np
from itertools import compress
import matplotlib.pyplot as plt


In [2]:
def json_to_csv(directory, fileNames, createSample=False):
    """
    json_to_csv: loops through specified line-delimited JSON files and converts each
                 one to its own csv file, written next to the source file.
                 option to also create a sample csv, which uses np.random.seed 9001 to
                 create a sample dataset with roughly 10% of the observations

                 pandas has a read_json function, but returns a 'Trailing data error'
                 when working with these specific files

    Inputs: -directory: directory containing the JSON files (output is written here too)
            -fileNames: list of JSON filenames inside `directory`
            -createSample: when True, also write '<name>_sample.csv' for every file

    Returns: None. Progress and total run time are printed.

    Note: np.random.seed(9001) resets NumPy's global random state whenever a
          sample is created.
    """

    start = time.time()

    for fileName in fileNames:
        # Start from an empty list for every file; otherwise each csv would also
        # contain the records of all files processed before it.
        jsonData = []

        with open(os.path.join(directory, fileName), encoding="utf8") as file:
            print('{0} opened'.format(fileName))
            for line in file:
                # Some of the files have trailing blank spaces (or blank lines),
                # which json.loads cannot parse.
                line = line.strip()
                if not line:
                    continue
                jsonData.append(json.loads(line))

        df = pd.DataFrame.from_dict(jsonData)

        # splitext removes the real extension instead of assuming it is 5 characters long
        baseName = os.path.splitext(fileName)[0]

        csvFileName = baseName + '.csv'
        df.to_csv(os.path.join(directory, csvFileName))
        print('{0} created'.format(csvFileName))

        if createSample:
            # Fixed seed so the same rows are selected on every run
            np.random.seed(9001)
            msk = np.random.rand(len(df)) <= 0.1
            sample = df[msk]

            csvSampleFileName = baseName + '_sample.csv'
            sample.to_csv(os.path.join(directory, csvSampleFileName))
            print('{0} created'.format(csvSampleFileName))

    print('This function took {} minutes to run'.format((time.time()-start)/60))
    

In [3]:
# Load the business records; lines=True makes pandas parse one JSON object per line.
df_business = pd.read_json('data/business.json', lines=True)

# Disabled exploration steps, kept for reference: drop rows without categories,
# select rows whose categories mention 'Restaurants', count the category strings.
# df_business.dropna(inplace=True, subset = ['categories'], axis=0)

# df_business.loc[df_business['categories'].str.contains('Restaurants')]

# df_business['categories'].value_counts()

In [4]:
df_business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


In [5]:
#df_user = pd.read_csv('data/user.csv', nrows = 100)

# df_review =  pd.read_csv('data/review.csv', usecols = ['business_id', 'user_id', 'stars'])
# user_cnts = df_review['user_id'].value_counts()
# top_users = user_cnts.loc[user_cnts>2].index
# df_review = df_review.loc[df_review['user_id'].isin(top_users)]
# df_review.to_csv('data/filtered_reviews.csv')

In [6]:
# Read the filtered reviews; the first CSV column is used as the index.

df_review = pd.read_csv('data/filtered_reviews.csv', index_col=0)

  mask |= (ar1 == a)


In [7]:
# Enumerate the business_id and user_id values as integer ids (bid and uid)
def build_fmap_invmap(ser):
    """Build forward and inverse lookups for the distinct values of a Series.

    Distinct values are numbered 0, 1, 2, ... in the order `ser.unique()`
    returns them.

    Returns a tuple (fmap, invmap):
        fmap   -- dict mapping each distinct value to its integer position
        invmap -- dict mapping each integer position back to the value
    """
    invmap = dict(enumerate(ser.unique()))
    fmap = {value: position for position, value in invmap.items()}
    return fmap, invmap

In [8]:
# Debug switch: with dbg = 1 only the first 100000 reviews are kept; set dbg = 0 to keep them all.
dbg = 1
if dbg:
    df_review = df_review.head(100000)

In [9]:
# Forward (original id -> integer) and inverse (integer -> original id) lookups
# for the businesses and users that appear in df_review.
bus_fmap, bus_invmap = build_fmap_invmap(df_review['business_id'])
u_fmap, u_invmap = build_fmap_invmap(df_review['user_id'])

In [10]:
df_review['bid'] = df_review['business_id'].map(bus_fmap)

In [11]:
df_review['uid'] = df_review['user_id'].map(u_fmap)

In [12]:
df_review.head()

Unnamed: 0,business_id,stars,user_id,bid,uid
0,ujmEBvifdJM6h6RLv4wQIg,1.0,hG7b0MtEbXx5QzbzE6C_VA,0,0
2,WTqjgwHlXbSFevF32_DJVw,5.0,n6-Gk65cPZL6Uz8qRm3NYw,1,1
3,ikCg8xy5JIg_NGPx-MSIDA,5.0,dacAIZ6fTM6mqwW5uxkskg,2,2
6,3fw2X5bZYeW9xCz_zGhOHg,3.0,jlu4CztcSxrKx56ba1a5AQ,3,3
7,zvO-PJCpNk4fgAVUnExYAA,1.0,d6xvYpyzcfbF_AZ8vMB7QA,4,4


In [13]:
n_users, n_bus = df_review['uid'].nunique(), df_review['bid'].nunique()

In [14]:
n_dim = 5

In [15]:
# Trainable embedding tables: one n_dim-dimensional vector per user and per business,
# initialised uniformly at random in [-1, 1).
# TODO: wrap the code below in a function.

user_vector_raw = tf.Variable(tf.random_uniform([n_users, n_dim], minval = -1., maxval = 1.))
bus_vector_raw = tf.Variable(tf.random_uniform([n_bus, n_dim], minval = -1., maxval = 1.))

# tanh squashes every component into (-1, 1), so the vectors used downstream stay bounded
user_vector = tf.tanh(user_vector_raw)
bus_vector = tf.tanh(bus_vector_raw)

# Input tensors for users, businesses and ratings: one entry per sampled review.
# `shape=(None)` is just `None` (unknown rank), not a one-element tuple; `[None]`
# declares the intended 1-D batch of unknown length.
users = tf.placeholder(tf.int32, shape=[None])
businesses = tf.placeholder(tf.int32, shape=[None])
ratings = tf.placeholder(tf.float32, shape=[None])

# Gather the embedding row of every user / business id in the batch
UserSampled = tf.nn.embedding_lookup(user_vector, users)
BusinessSampled = tf.nn.embedding_lookup(bus_vector, businesses)
UserSampled.set_shape([None, n_dim])
BusinessSampled.set_shape([None, n_dim])

Instructions for updating:
Colocations handled automatically by placer.


In [16]:
# Defining the output
# TODO: move this into a function
# Raw score per (user, business) pair: sum over axis 1 of the element-wise product
# of the two looked-up vectors, i.e. their dot product.
estimatedaffinitiesraw = tf.reduce_sum(UserSampled * BusinessSampled, 1)
# The sigmoid maps the raw score into (0, 1); multiplying by 5 gives a prediction in (0, 5).
# Presumably chosen to match a 5-star rating scale -- TODO confirm.
estimatedaffinities = tf.sigmoid(estimatedaffinitiesraw)*5

In [17]:
# Loss: sum over the fed batch of the squared difference between the predicted
# affinity and `ratings`. `ratings` is the float32 placeholder defined with the other
# inputs; the training loop feeds it the 'stars' values of the sampled reviews.
# TODO: move this into a function
loss = tf.reduce_sum(tf.square(estimatedaffinities - ratings))
# One RMSProp update step (learning rate 0.1) that minimizes the loss
opt = tf.train.RMSPropOptimizer(learning_rate=.1).minimize(loss)

Instructions for updating:
Use tf.cast instead.


In [3]:
# Create the TensorFlow session (variables are initialized in a later cell)

sess = tf.Session()


In [19]:
# Pick 64 random row positions of df_review (np.random.choice samples with replacement
# by default); small batches keep memory use low.
# NOTE(review): the training loop draws its own `rows` on every iteration, so this
# value appears to be unused -- confirm before removing.
rows = np.random.choice(df_review.shape[0], 64)

In [20]:
sess.run(tf.global_variables_initializer())

In [21]:
# Training loop: 10000 steps, each on a fresh random batch of 64 reviews
# (sampled with replacement). The batch loss is printed every 1000 steps.

# Extract the three columns as NumPy arrays once; indexing them per step feeds the
# same values as df_review.iloc[rows] without building a DataFrame slice each iteration.
uid_all = df_review['uid'].values
bid_all = df_review['bid'].values
stars_all = df_review['stars'].values

for i in range(10000):
    rows = np.random.choice(df_review.shape[0], 64)
    fd = {users: uid_all[rows],
          businesses: bid_all[rows],
          ratings: stars_all[rows]}
    _, l2loss = sess.run([opt, loss], fd)
    if i % 1000 == 0:
        print(l2loss)

234.92917
139.65031
127.449135
129.67441
137.12938
105.95755
72.79036
38.120255
50.260414
17.09154


In [22]:
user_values, bus_values = sess.run([user_vector, bus_vector])

In [23]:
bus_vec_df = pd.DataFrame(data = bus_values, index = 
                          [bus_invmap[i] for i in range(n_bus)])



In [24]:
bus_vec_df

Unnamed: 0,0,1,2,3,4
ujmEBvifdJM6h6RLv4wQIg,-0.999168,0.968609,0.999999,-0.999999,-0.999950
WTqjgwHlXbSFevF32_DJVw,-0.999977,0.968243,0.902691,-0.999967,-0.999548
ikCg8xy5JIg_NGPx-MSIDA,0.971469,-0.992822,-0.997513,-0.999425,-0.978776
3fw2X5bZYeW9xCz_zGhOHg,0.323263,-0.985096,-0.228673,0.854754,0.924128
zvO-PJCpNk4fgAVUnExYAA,0.322046,0.908827,-0.860200,0.613535,-0.778720
b2jN2mm9Wf3RcrZCgfo1cg,0.999895,0.949014,0.999095,-0.999683,-0.999955
oxwGyA17NL6c5t1Etg5WgQ,-1.000000,-0.380686,-0.512891,0.999999,0.231811
8mIrX_LrOnAqWsB5JrOojQ,0.999917,1.000000,0.999999,0.999981,-1.000000
FxLfqxdYPA6Z85PFKaqLrg,0.820339,-0.723934,-0.477904,-0.117361,-0.847304
AakkkTuGZA2KBodKi2_u8A,0.125206,-0.958670,-0.675109,0.721851,-0.472924


In [25]:
# Joining df_business + bus_vec_df
df_all = df_business.join(bus_vec_df, on='business_id', how='right')


In [26]:
# Drop every row that has a NaN in any column (DataFrame.dropna defaults to rows, how='any').
# NOTE(review): this also removes businesses that merely lack a value such as
# attributes or hours -- confirm that is intended.
df_all = df_all.dropna()

In [58]:
# Pick the user with integer id 4 and look up the original user_id string via the inverse map
uid = 4
u_invmap[uid]  

'd6xvYpyzcfbF_AZ8vMB7QA'

In [29]:
bname = 'cHdJXLlKNWixBXpDwEGb_A'
bid = bus_fmap[bname]

In [27]:
df_all.loc[df_all['categories'].str.contains('Restaurant') & 
           df_all['categories'].str.contains('Japanese')]

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,0,1,2,3,4
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC,0.999998,0.999048,-0.999907,-0.980016,-0.999999
73,436 Market St,"{'OutdoorSeating': 'False', 'HasTV': 'True', '...",v-scZMU6jhnmV955RSzGJw,"Japanese, Sushi Bars, Restaurants",Pittsburgh,"{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...",1,40.441062,-80.002126,No. 1 Sushi Sushi,15222,106,4.5,PA,0.997832,-0.999996,0.961566,0.999988,-0.999992
80,529-17th Avenue SW,"{'WiFi': ''free'', 'GoodForDancing': 'False', ...",PkDghu4aan2_wxrhXjTEgg,"Nightlife, Italian, Restaurants, Japanese, Lou...",Calgary,"{'Tuesday': '14:0-23:0', 'Wednesday': '14:0-23...",0,51.037778,-114.073351,MiraKuru,T2S 0A9,16,3.5,AB,-0.358603,0.691279,0.827270,0.171682,0.704840
185,2945 Lake Shore Boulevard,"{'RestaurantsDelivery': 'True', 'HasTV': 'True...",SJBzyJDCR_f6dx5tpYAABA,"Sushi Bars, Japanese, Restaurants",Toronto,"{'Monday': '16:0-22:0', 'Tuesday': '11:30-22:0...",1,43.600523,-79.505516,Kibo Sushi House,M8V 1J5,15,4.0,ON,0.921737,0.981064,-0.996601,0.999074,-0.355018
343,10624 S Eastern Ave,"{'RestaurantsTakeOut': 'True', 'NoiseLevel': '...",jX9DocoiY4Bo9EUkaTSqvg,"Restaurants, Hawaiian, Chinese, Japanese, Poke...",Henderson,"{'Monday': '0:0-0:0', 'Tuesday': '11:0-19:0', ...",1,35.998220,-115.102246,China AAA,89052,149,4.5,NV,0.999995,-0.999995,0.999900,0.999986,0.999994
346,1909 E Ray Rd,"{'RestaurantsDelivery': 'False', 'RestaurantsT...",ecJri9ozyke4dOCWulZiRQ,"Asian Fusion, Japanese, Restaurants, Ramen, Ta...",Chandler,"{'Monday': '17:0-21:30', 'Tuesday': '17:0-21:3...",1,33.320006,-111.809675,Nishikawa Ramen,85225,427,4.0,AZ,-0.735649,0.999997,1.000000,-0.999890,0.999997
444,3339 Boulevard des Sources,"{'RestaurantsDelivery': 'False', 'RestaurantsA...",4B8VnRAstRRshxiUzm9yPw,"Restaurants, Sushi Bars, Japanese",Dollard-des-Ormeaux,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-23:0'...",1,45.484315,-73.795652,Maiko Sushi - DDO,H9B 1Z8,51,4.0,QC,-0.999739,-0.999031,-0.989284,-0.999881,-0.999957
507,"6989 N Hayden Rd, Ste A12","{'RestaurantsAttire': ''casual'', 'OutdoorSeat...",QS3QxI7u5PRdtbGgI0-UsA,"Asian Fusion, Restaurants, Seafood, Sushi Bars...",Scottsdale,"{'Monday': '0:0-0:0', 'Tuesday': '17:30-21:0',...",1,33.538029,-111.905676,Sakana Sushi & Grill,85250,347,4.0,AZ,0.999997,-0.999997,-0.840149,0.999967,-0.631454
524,338-8338 18 Street SE,"{'GoodForMeal': '{'dessert': False, 'latenight...",bPBZEDuHbE-I7bxUWIYMhQ,"Japanese, Restaurants, Sushi Bars",Calgary,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,50.977178,-114.013257,Oishii Sushi,T2C 4E4,78,3.5,AB,0.999880,-0.185443,0.975919,-0.299196,0.981666
577,1418 Rue Cartier,"{'RestaurantsPriceRange2': '3', 'OutdoorSeatin...",9ELnhtgMF8_h8Zky4A7BSA,"Restaurants, Japanese",Montréal,"{'Monday': '17:0-22:0', 'Tuesday': '17:0-22:0'...",0,45.523226,-73.551979,Saiken Sushi,H2K 4C8,4,3.0,QC,0.223504,-0.513308,0.426439,0.308187,-0.606921


### Testing (Passing train) 


In [30]:
bid

12

In [31]:
chipotle = bus_values[bid]

In [32]:
chipotle

array([-0.9999997 ,  0.07804164,  0.9999998 , -0.99999785,  0.99974287],
      dtype=float32)

In [33]:
japaneselover = user_values[uid]

In [34]:
bus_values

array([[-0.999168  ,  0.96860886,  0.9999986 , -0.9999991 , -0.9999496 ],
       [-0.9999766 ,  0.9682425 ,  0.9026906 , -0.99996734, -0.99954826],
       [ 0.9714687 , -0.992822  , -0.9975128 , -0.99942505, -0.97877586],
       ...,
       [-0.23577082,  0.65202653, -0.6258598 , -0.7044947 ,  0.3569681 ],
       [-0.99639076,  0.9810038 , -0.9945443 ,  0.9665813 , -0.9906568 ],
       [ 0.9697955 ,  0.28652716, -0.33573326,  0.08056628, -0.14556506]],
      dtype=float32)

In [35]:
np.square(bus_values - japaneselover[None,:]).sum(1).argsort()

array([ 1467,  2335, 10094, ...,  1979,   692,  3971])

In [36]:
np.square(bus_values - chipotle[None,:]).sum(1).argsort()

array([   12,  3771,  8903, ..., 10703,  2192,  4346])

In [37]:
def closest_businesses_to(business = None, user = None, df = None):
    """Rank candidate vectors by squared Euclidean distance to a target embedding.

    Parameters
    ----------
    business : key of `bus_fmap`, optional
        Use this business's row of `bus_values` as the target.
    user : key of `u_fmap`, optional
        Use this user's row of `user_values` as the target. If both `business`
        and `user` are given, `user` takes precedence (unchanged from before).
    df : 2-D array of candidate vectors, optional
        Rows to rank; defaults to all of `bus_values`.

    Returns
    -------
    numpy.ndarray
        Row positions into `df`, ordered from closest to farthest.

    Raises
    ------
    ValueError
        If neither `business` nor `user` is given.
    KeyError
        If the given id is not present in the corresponding map.
    """
    if user is not None:
        target = user_values[u_fmap[user]]
    elif business is not None:
        target = bus_values[bus_fmap[business]]
    else:
        # Previously this fell through and failed later with an UnboundLocalError
        raise ValueError("closest_businesses_to requires a business or a user")
    if df is None:
        df = bus_values
    # target[None, :] broadcasts the target against every candidate row
    best_restaurants = np.square(df - target[None,:]).sum(1).argsort()
    return best_restaurants

In [38]:
# NOTE(review): despite the name, this is simply the first 30 rows of bus_values;
# no category or location filter is applied here.
midtown_japanese_restaurants = bus_values[:30,:]

In [39]:
closest_businesses_to(business = 'cHdJXLlKNWixBXpDwEGb_A')

array([   12,  3771,  8903, ..., 10703,  2192,  4346])

In [40]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ')

array([ 1467,  2335, 10094, ...,  1979,   692,  3971])

In [41]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ', df = midtown_japanese_restaurants)

array([26, 10, 12,  9,  6,  8, 13, 29,  3, 21,  1,  0, 24, 11, 20, 22, 27,
       28, 18,  2, 19, 15,  4, 23,  5, 25, 14, 16,  7, 17])