In [1]:
import time
import json
import re
import os 
import tensorflow as tf
import pandas as pd
import numpy as np
from itertools import compress
import matplotlib.pyplot as plt


In [2]:
def json_to_csv(directory, fileNames, createSample=False):
    """
    json_to_csv: loops through the specified line-delimited JSON files and
                 writes one csv file per input file, next to the input.

                 Each file is parsed line by line (one JSON document per line)
                 because pandas' read_json returns a 'Trailing data error' when
                 working with these specific files.

                 Optionally also writes '<name>_sample.csv', which uses
                 np.random.seed 9001 to pick a reproducible sample holding
                 roughly 10% of the observations.

    Inputs: -directory: folder that holds the JSON files (csv files are written there too)
            -fileNames: list of JSON filenames inside `directory`
            -createSample: flag; when True the sample csv is written as well

    Returns: None. Files are written to disk and progress is printed.
    """

    start = time.time()

    for fileName in fileNames:
        # Start from an empty list for every file; a single shared list would
        # make each csv also contain the rows of all previously processed files.
        jsonData = []

        with open(os.path.join(directory, fileName), encoding="utf8") as file:
            print('{0} opened'.format(fileName))
            for line in file:
                # Some of the files have trailing blank spaces or blank lines;
                # strip the former and skip the latter so json.loads never
                # sees an empty string.
                line = line.strip()
                if not line:
                    continue
                jsonData.append(json.loads(line))

        df = pd.DataFrame.from_dict(jsonData)

        # Drop the extension rather than a fixed number of characters so that
        # names not ending in '.json' are still handled sensibly.
        baseName = os.path.splitext(fileName)[0]

        csvFileName = baseName + '.csv'
        df.to_csv(os.path.join(directory, csvFileName))
        print('{0} created'.format(csvFileName))

        if createSample:
            # Fixed seed so the same rows are selected on every run.
            np.random.seed(9001)
            msk = np.random.rand(len(df)) <= 0.1
            sample = df[msk]

            csvSampleFileName = baseName + '_sample.csv'
            sample.to_csv(os.path.join(directory, csvSampleFileName))
            print('{0} created'.format(csvSampleFileName))

    print('This function took {} minutes to run'.format((time.time()-start)/60))
    

In [None]:
# Load the business table from CSV (presumably produced by json_to_csv — confirm).
df_business = pd.read_csv('data/business.csv')

# Unused exploration snippets kept for reference:
#df_business.dropna(inplace=True, subset = ['categories'], axis=0)

# df_business.loc[df_business['categories'].str.contains('Restaurants')]

# df_business['categories'].value_counts()

In [None]:
# Preview the first rows of the business table.
df_business.head()

In [342]:
#df_user = pd.read_csv('data/user.csv', nrows = 100)

# df_review =  pd.read_csv('data/review.csv', usecols = ['business_id', 'user_id', 'stars'])
# user_cnts = df_review['user_id'].value_counts()
# top_users = user_cnts.loc[user_cnts>2].index
# df_review = df_review.loc[df_review['user_id'].isin(top_users)]
# df_review.to_csv('data/filtered_reviews.csv')

In [20]:
# Read the filtered reviews; the first CSV column is used as the index.

df_review = pd.read_csv('data/filtered_reviews.csv', index_col=0)

In [21]:
def build_fmap_invmap(ser):
    """
    Enumerate the distinct values of a Series (used for business_id -> bid
    and user_id -> uid).

    Inputs: -ser: Series whose distinct values are numbered 0, 1, 2, ... in
             order of first appearance

    Returns: (fmap, invmap) where fmap maps value -> integer and invmap maps
             integer -> value.
    """
    # Number the distinct values once, then invert that numbering.
    invmap = dict(enumerate(ser.unique()))
    fmap = {value: position for position, value in invmap.items()}
    return fmap, invmap

In [22]:
# Debugging switch: set dbg = 1 to keep only the first 100,000 reviews,
# or dbg = 0 to keep every row.
dbg = 1
if dbg:
    df_review = df_review.head(100000)

In [23]:
# Build forward (raw id -> integer) and inverse (integer -> raw id) lookups,
# one pair for businesses and one pair for users.
bus_fmap, bus_invmap = build_fmap_invmap(df_review['business_id'])
u_fmap, u_invmap = build_fmap_invmap(df_review['user_id'])

In [24]:
# Add the integer business index as column 'bid'.
df_review['bid'] = df_review['business_id'].map(bus_fmap)

In [25]:
# Add the integer user index as column 'uid'.
df_review['uid'] = df_review['user_id'].map(u_fmap)

In [53]:
# Preview the review frame, including the 'bid' and 'uid' columns.
df_review.head()

Unnamed: 0,business_id,stars,user_id,bid,uid
0,ujmEBvifdJM6h6RLv4wQIg,1.0,hG7b0MtEbXx5QzbzE6C_VA,0,0
2,WTqjgwHlXbSFevF32_DJVw,5.0,n6-Gk65cPZL6Uz8qRm3NYw,1,1
3,ikCg8xy5JIg_NGPx-MSIDA,5.0,dacAIZ6fTM6mqwW5uxkskg,2,2
6,3fw2X5bZYeW9xCz_zGhOHg,3.0,jlu4CztcSxrKx56ba1a5AQ,3,3
7,zvO-PJCpNk4fgAVUnExYAA,1.0,d6xvYpyzcfbF_AZ8vMB7QA,4,4


In [28]:
# Count the distinct users and businesses; these counts size the embedding tables.
n_users = df_review['uid'].nunique()
n_bus = df_review['bid'].nunique()

In [29]:
# Number of latent dimensions in each user / business embedding vector.
n_dim = 5

In [30]:
# Trainable embedding tables, one row per user / business, initialised
# uniformly in [-1, 1).
user_vector_raw = tf.Variable(tf.random_uniform([n_users, n_dim], minval = -1., maxval = 1.))
bus_vector_raw = tf.Variable(tf.random_uniform([n_bus, n_dim], minval = -1., maxval = 1.))

# tanh keeps every embedding component inside (-1, 1).
user_vector = tf.tanh(user_vector_raw)
bus_vector = tf.tanh(bus_vector_raw)

# Input tensors for a mini-batch: user indices, business indices, ratings.
users = tf.placeholder(tf.int32, shape=(None))
businesses = tf.placeholder(tf.int32, shape=(None))
ratings = tf.placeholder(tf.float32, shape=(None))

# Gather the embedding rows for the batch and declare their static shape
# as (batch, n_dim).
UserSampled = tf.nn.embedding_lookup(user_vector, users)
BusinessSampled = tf.nn.embedding_lookup(bus_vector, businesses)
UserSampled.set_shape([None, n_dim])
BusinessSampled.set_shape([None, n_dim])

Instructions for updating:
Colocations handled automatically by placer.


In [31]:
# Dot product of each (user, business) embedding pair in the batch.
estimatedaffinitiesraw = tf.reduce_sum(UserSampled * BusinessSampled, 1)
# Sigmoid then * 5 maps the raw score into the interval (0, 5).
estimatedaffinities = tf.sigmoid(estimatedaffinitiesraw)*5

In [32]:
# Sum of squared errors between predicted and fed ratings over the batch.
loss = tf.reduce_sum(tf.square(estimatedaffinities - ratings))
# One RMSProp update step (learning rate 0.1) that minimises the loss.
opt = tf.train.RMSPropOptimizer(learning_rate=.1).minimize(loss)

Instructions for updating:
Use tf.cast instead.


In [33]:
# Session used to run the graph ops (initialisation, training, fetching values).
sess = tf.Session()


In [34]:
# Draw 64 random row positions from df_review.
# NOTE(review): `rows` is reassigned at the start of every training-loop
# iteration, so this cell looks like leftover scratch work — confirm before removing.
rows = np.random.choice(df_review.shape[0], 64)

In [35]:
# Initialise all tf.Variables (the raw user and business embedding tables).
sess.run(tf.global_variables_initializer())

In [54]:
# Training loop: 10,000 mini-batch steps of 64 randomly chosen reviews each.
for i in range(10000):
    # Sample 64 row positions and pull the matching reviews.
    rows = np.random.choice(df_review.shape[0], 64)
    dfrows = df_review.iloc[rows]
    # Feed the batch's user indices, business indices and star ratings.
    fd = {users:dfrows['uid'].values,
         businesses:dfrows['bid'].values,
         ratings:dfrows['stars'].values}
    # Run one optimiser step and fetch the batch's summed squared error.
    _, l2loss = sess.run([opt, loss], fd)
    # Report the current batch loss every 1000 steps.
    if i % 1000 == 0:
        print(l2loss)

51.51345
109.24071
30.983854
40.328747
38.272476
14.681086
25.052807
17.334026
29.805984
43.593056


In [37]:
# Evaluate the tanh-squashed user and business embedding tables in the session.
user_values, bus_values = sess.run([user_vector, bus_vector])

In [57]:
# Table of business embeddings, labelled by the raw business id of each row.
bus_vec_df = pd.DataFrame(
    bus_values,
    index=list(map(bus_invmap.__getitem__, range(n_bus))),
)

In [58]:
# Display the business embedding table (one row per business id).
bus_vec_df

Unnamed: 0,0,1,2,3,4
ujmEBvifdJM6h6RLv4wQIg,0.245716,-0.999963,0.958518,-0.996415,-0.004232
WTqjgwHlXbSFevF32_DJVw,0.637304,-0.794876,-0.613295,-0.997671,0.675887
ikCg8xy5JIg_NGPx-MSIDA,0.902912,0.856310,-0.948628,-0.111182,-0.746762
3fw2X5bZYeW9xCz_zGhOHg,-0.734276,-0.999902,0.797500,0.541766,0.030535
zvO-PJCpNk4fgAVUnExYAA,-0.223762,-0.782820,-0.118088,0.958683,-0.634694
b2jN2mm9Wf3RcrZCgfo1cg,-0.911307,-0.496665,0.940446,-0.999844,0.997965
oxwGyA17NL6c5t1Etg5WgQ,-0.956273,-0.793331,0.168674,0.815644,-0.995229
8mIrX_LrOnAqWsB5JrOojQ,-1.000000,1.000000,-0.694195,-0.999999,0.999981
FxLfqxdYPA6Z85PFKaqLrg,0.194422,0.627261,-0.602485,0.552737,0.970568
AakkkTuGZA2KBodKi2_u8A,-0.366006,-0.511513,0.989328,0.999454,-0.254088


In [38]:
# Pick a user by integer index and show the matching raw user id.
uid = 115
u_invmap[uid]

'ri7itn7-CdpsaPxTToK5cQ'

In [39]:
# Pick a business by raw id and look up its integer index.
bname = 'cHdJXLlKNWixBXpDwEGb_A'
bid = bus_fmap[bname]

In [40]:
# Show the integer index of the chosen business.
bid

12

In [41]:
# Embedding vector of the chosen business.
# NOTE(review): the name suggests this business is a Chipotle — not verifiable from here.
chipotle = bus_values[bid]

In [42]:
# Show the chosen business's embedding vector.
chipotle

array([-0.9999191 , -0.9999983 , -0.501293  , -0.21733429,  0.9575534 ],
      dtype=float32)

In [43]:
# Embedding vector of user `uid`.
# NOTE(review): the name suggests a user who favours Japanese food — TODO confirm.
japaneselover = user_values[uid]

In [44]:
# Show the full array of business embeddings.
bus_values

array([[ 0.24571638, -0.99996334,  0.95851815, -0.99641514, -0.00423152],
       [ 0.63730407, -0.7948759 , -0.6132946 , -0.99767065,  0.67588747],
       [ 0.9029117 ,  0.8563105 , -0.9486281 , -0.1111819 , -0.74676174],
       ...,
       [ 0.7618762 , -0.27237293,  0.6265953 ,  0.20628403, -0.04141279],
       [-0.02541569, -0.99514157,  0.01158163,  0.9671731 , -0.11740661],
       [ 0.94820774, -0.46000168, -0.8827787 , -0.16168922, -0.7653428 ]],
      dtype=float32)

In [45]:
# Rank all businesses by squared Euclidean distance to the user's vector, closest first.
np.argsort(np.square(bus_values - japaneselover[np.newaxis, :]).sum(axis=1))

array([12674,  3277,  5170, ...,   287,  4883,  3183])

In [46]:
# Rank all businesses by squared Euclidean distance to the chosen business's vector, closest first.
np.argsort(np.square(bus_values - chipotle[np.newaxis, :]).sum(axis=1))

array([  12, 1851, 3402, ...,   36, 4045, 4252])

In [47]:
def closest_businesses_to(business = None, user = None, df = None):
    """
    Rank candidate embedding rows by squared Euclidean distance to a target.

    The target is the embedding of `business` (looked up through bus_fmap /
    bus_values) or of `user` (looked up through u_fmap / user_values). If both
    are supplied, the user's vector is used.

    Inputs: -business: raw business id, or None
            -user: raw user id, or None
            -df: 2-D array of candidate vectors to rank; despite the name it
             is an array, not a DataFrame. Defaults to all of bus_values.

    Returns: array of row positions into `df`, ordered closest first.

    Raises: ValueError if neither `business` nor `user` is given.
            KeyError if the given id is not in the corresponding lookup.
    """
    # Fail with a clear message instead of an UnboundLocalError on `target`.
    if business is None and user is None:
        raise ValueError("closest_businesses_to requires a 'business' or a 'user'")

    if business is not None:
        target = bus_values[bus_fmap[business]]
    # Checked second so that a supplied user takes precedence over a business.
    if user is not None:
        target = user_values[u_fmap[user]]
    if df is None:
        df = bus_values
    # Squared distance from every candidate row to the target, then sort ascending.
    best_restaurants = np.square(df - target[None,:]).sum(1).argsort()
    return best_restaurants

In [48]:
# Candidate subset: the first 30 business vectors.
# NOTE(review): the name implies midtown Japanese restaurants, but the slice
# simply takes rows 0-29 of bus_values — confirm the intended selection.
midtown_japanese_restaurants = bus_values[:30,:]

In [49]:
# Rank all businesses by closeness to the given business id.
closest_businesses_to(business = 'cHdJXLlKNWixBXpDwEGb_A')

array([  12, 1851, 3402, ...,   36, 4045, 4252])

In [50]:
# Rank all businesses by closeness to the given user id.
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ')

array([12674,  3277,  5170, ...,   287,  4883,  3183])

In [51]:
# Rank only the 30-row candidate subset by closeness to the given user id;
# the returned positions index into that subset, not into bus_values.
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ', df = midtown_japanese_restaurants)

array([17, 26, 18,  0, 11,  2, 29,  1, 21, 23, 10,  4, 28, 13, 25,  6, 14,
        3,  9, 20,  5,  8, 12,  7, 15, 27, 19, 22, 24, 16])

In [52]:
# from surprise.model_selection import train_test_split


In [285]:
# reader = Reader(rating_scale=(0, 5))
# data = Dataset.load_from_df(df_review[[
#     'business_id', 'stars', 'user_id']], reader)
#data2 = Dataset.load_builtin('ml-100k')

In [None]:
# trainset = data.build_full_trainset()

In [None]:
#trainset, testset = train_test_split(data, test_size=.25)

In [None]:
# model = NMF(n_factors = 5, n_epochs = 2)

In [None]:
# modeltrained = model.fit(trainset)

In [None]:
# sim = modeltrained.compute_similarities()

In [None]:
# sim

In [None]:
# import tensorflow as tf

In [None]:
# 3

In [None]:
# ??modeltrained.compute_similarities

In [None]:
# ??NMF

In [None]:
# benchmark = []
# # Iterate over all algorithms
# for algorithm in [NMF()]:
#     # Perform cross validation
#     results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
#     # Get results & append algorithm name
#     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#     benchmark.append(tmp)
    
# pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

In [None]:
#df_review.fillna(df_review.mean(), inplace=True)

In [None]:
#print(df_review.dtypes)