In [231]:
import json
import os
import re
import time
from itertools import compress

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [232]:
def json_to_csv(directory, fileNames, createSample=False):
    """
    json_to_csv: loops through specified JSON files and converts each one to its own csv file.
                 option to also create a sample csv per file, drawn with a fixed seed (9001) so that
                 roughly 10% of the observations are kept and the same rows are kept on every run

                 files are parsed line by line (one JSON document per line) because, per the original
                 author, pandas' read_json returned a 'Trailing data error' on these specific files


    Inputs: -directory: directory holding the JSON files; the csv files are written to the same place
            -fileNames: list of JSON filenames inside `directory`
            -createSample: when True, also write '<name>_sample.csv' next to each '<name>.csv'

    Returns: None (writes files and prints progress / elapsed minutes)
    """

    start = time.time()

    for fileName in fileNames:
        # Reset the accumulator for every file; otherwise each csv after the first
        # would also contain every record of the files processed before it.
        jsonData = []

        with open(os.path.join(directory, fileName), encoding="utf8") as file:
            print('{0} opened'.format(fileName))
            for line in file:
                # Some of the files have trailing blank spaces, and a fully blank
                # line would make json.loads raise, so strip and skip empties.
                line = line.strip()
                if not line:
                    continue
                jsonData.append(json.loads(line))

        df = pd.DataFrame.from_dict(jsonData)

        # splitext drops whatever extension is present instead of assuming
        # the name ends in exactly five characters ('.json').
        baseName = os.path.splitext(fileName)[0]

        csvFileName = baseName + '.csv'
        df.to_csv(os.path.join(directory, csvFileName))
        print('{0} created'.format(csvFileName))

        if createSample:
            # A private RandomState seeded with 9001 yields the same draws as
            # np.random.seed(9001) + np.random.rand, without resetting the
            # global NumPy random state for the rest of the notebook.
            rng = np.random.RandomState(9001)
            msk = rng.rand(len(df)) <= 0.1
            sample = df[msk]

            csvSampleFileName = baseName + '_sample.csv'

            sample.to_csv(os.path.join(directory, csvSampleFileName))
            print('{0} created'.format(csvSampleFileName))

    print('This function took {} minutes to run'.format((time.time()-start)/60))
    

In [233]:
# fileNameList = ['user.json',
#                 'business.json', 
#                 'review.json']

# json_to_csv('data/', fileNameList, createSample=True)

In [234]:
# df_business = pd.read_json('data/business.json',lines=True)

# df_business.dropna(inplace=True, subset = ['categories'], axis=0)

# df_business.loc[df_business['categories'].str.contains('Restaurants')]

# df_business['categories'].value_counts()

In [235]:
#df_user = pd.read_csv('data/user.csv', nrows = 100)

In [236]:
import os

In [237]:
os.listdir('data/')

['filtered_reviews.csv',
 '.DS_Store',
 'business.json',
 'review_sample.csv',
 'business.csv',
 'user.json',
 'review.csv',
 'user.csv',
 'business_sample.csv',
 'user_sample.csv',
 'review.json',
 'restaurants.csv']

In [238]:
# df_review =  pd.read_csv('data/review.csv', usecols = ['business_id', 'user_id', 'stars'])



# user_cnts = df_review['user_id'].value_counts()
# top_users = user_cnts.loc[user_cnts>2].index
# df_review = df_review.loc[df_review['user_id'].isin(top_users)]
# df_review.to_csv('data/filtered_reviews.csv')

In [239]:
# Load the pre-filtered review table. index_col=0 turns the first CSV column into the
# index; presumably that column is the index written by the commented-out filtering
# cell (which kept users with more than 2 reviews) -- verify against how the file was made.
df_review = pd.read_csv('data/filtered_reviews.csv', index_col=0)

In [240]:
# df_review

# df_review['business_id'].value_counts().clip(lower=0,upper=10).hist()

# df_review['user_id'].value_counts().clip(lower=0,upper=10).hist()

# df_review.head()

In [241]:
# def create_matrix(df_review, n_users, n_items):
#     matrix = np.zeros((n_users, n_items))
    
#     for line in df_review.itertuples():
        
#         matrix[line[1]-1, line[2]-1] = line [3]
        
#     return matrix


In [242]:
# R_df = df_review.pivot(index = 'user_id', columns ='business_id', values = 'stars').fillna(0)
# R_df.head()



In [243]:
# from surprise import SVD
# from surprise import Dataset
# from surprise import NormalPredictor
# from surprise import BaselineOnly
# from surprise import KNNBasic
# from surprise import Reader
# from surprise.model_selection import cross_validate
# from surprise import NMF


In [244]:
def build_fmap_invmap(ser):
    """
    Build a pair of lookup tables between a Series' distinct values and dense integers.

    Values are numbered 0, 1, 2, ... in the order `ser.unique()` returns them.

    Returns: (fmap, invmap) where fmap maps value -> integer and invmap maps
             integer -> value, so invmap[fmap[v]] == v for every distinct v.
    """
    # integer -> value, straight from the enumeration of the distinct values
    invmap = dict(enumerate(ser.unique()))
    # value -> integer, obtained by flipping the table above
    fmap = {value: position for position, value in invmap.items()}
    return fmap, invmap

In [245]:
df_review.shape

(5284463, 3)

In [246]:
# Keep only the first 100,000 reviews to make training tractable.
# .copy() detaches the subset from the full table so the in-place column
# assignments in later cells act on an independent frame, not on a slice.
df_review = df_review.head(100000).copy()

In [247]:
# Forward (raw id -> dense integer) and inverse (dense integer -> raw id) lookup
# tables for businesses and for users, built from the distinct ids in df_review.
bus_fmap, bus_invmap = build_fmap_invmap(df_review['business_id'])
u_fmap, u_invmap = build_fmap_invmap(df_review['user_id'])

In [248]:
# Round-trip sanity check: raw id -> integer -> raw id should return the same id.
# NOTE: `[0]` on the Series is a label lookup (index label 0), not a positional one.
u_invmap[
    u_fmap[df_review['user_id'][0]]
  ], df_review['user_id'][0]




('hG7b0MtEbXx5QzbzE6C_VA', 'hG7b0MtEbXx5QzbzE6C_VA')

In [249]:
# Round-trip sanity check for businesses: raw id -> integer -> raw id.
# Both sides must read the SAME row (position 20); comparing against a different
# row would always show a mismatch and verify nothing.
bus_invmap[bus_fmap[df_review['business_id'].iloc[20]]], df_review['business_id'].iloc[20]



('hd5xm20tfSa70-6UqD9-bg', 'hd5xm20tfSa70-6UqD9-bg')

In [250]:
# Replace raw business id strings with their dense integer codes.
# NOTE(review): the column is overwritten in place, so re-running this cell feeds the
# integer codes back through bus_fmap (keyed by raw ids) and yields NaN for every row.
df_review['business_id'] = df_review['business_id'].map(bus_fmap)

In [251]:
# Replace raw user id strings with their dense integer codes.
# NOTE(review): the column is overwritten in place, so re-running this cell feeds the
# integer codes back through u_fmap (keyed by raw ids) and yields NaN for every row.
df_review['user_id'] = df_review['user_id'].map(u_fmap)

In [252]:
df_review.head()

Unnamed: 0,business_id,stars,user_id
0,0,1.0,0
2,1,5.0,1
3,2,5.0,2
6,3,3.0,3
7,4,1.0,4


In [253]:
import tensorflow as tf

In [254]:
# Number of distinct users and businesses; used as the row counts of the two
# embedding tables (ids are dense integer codes at this point).
n_users, n_bus = df_review['user_id'].nunique(), df_review['business_id'].nunique()

In [255]:
# Width of each user / business embedding vector (number of latent factors).
n_dim = 5

In [256]:
import tensorflow as tf

In [257]:
# Trainable embedding tables, one row per user / business and n_dim columns,
# initialised from a uniform distribution between -1 and 1.
user_vector_raw = tf.Variable(tf.random_uniform([n_users, n_dim], minval = -1., maxval = 1.))
bus_vector_raw = tf.Variable(tf.random_uniform([n_bus, n_dim], minval = -1., maxval = 1.))

In [258]:
# Squash every embedding coordinate into (-1, 1); these bounded tensors, not the
# raw variables, are what the model reads downstream.
user_vector = tf.tanh(user_vector_raw)
bus_vector = tf.tanh(bus_vector_raw)

In [259]:
user_vector_raw.shape, bus_vector_raw.shape

(TensorShape([Dimension(72254), Dimension(5)]),
 TensorShape([Dimension(13589), Dimension(5)]))

In [260]:

# Input tensors for users, businesses and ratings (fed per mini-batch).
# NOTE(review): `shape=(None)` is plain `None`, not a 1-tuple, so the placeholders
# are declared with an unconstrained shape -- `(None,)` was possibly intended.
users = tf.placeholder(tf.int32, shape=(None))
businesses = tf.placeholder(tf.int32, shape=(None))
ratings = tf.placeholder(tf.float32, shape=(None))

# Gather the embedding row of every user / business id in the batch.
UserSampled = tf.nn.embedding_lookup(user_vector, users)
BusinessSampled = tf.nn.embedding_lookup(bus_vector, businesses)
# Pin the static shape of the gathered rows to (batch, n_dim); presumably needed
# because the placeholders above carry no static shape -- TODO confirm.
UserSampled.set_shape([None, n_dim])
BusinessSampled.set_shape([None, n_dim])

In [261]:
# Raw affinity: dot product of the user and business embeddings, one value per row.
estimatedaffinitiesraw = tf.reduce_sum(UserSampled * BusinessSampled, 1)
# Sigmoid maps the raw score into (0, 1); scaling by 5 gives a prediction in (0, 5)
# that is compared against the star ratings.
estimatedaffinities = tf.sigmoid(estimatedaffinitiesraw)*5

In [262]:
# Sum (not mean) of squared errors over the fed batch, so the value grows with batch size.
loss = tf.reduce_sum(tf.square(estimatedaffinities - ratings))
# Training op: running `opt` applies one RMSProp step that minimises `loss`.
opt = tf.train.RMSPropOptimizer(learning_rate=.1).minimize(loss)

In [263]:
# TensorFlow 1.x session used for every sess.run call below.
# NOTE(review): no matching sess.close() is visible in this part of the notebook.
sess = tf.Session()


In [264]:
# Draw 64 random row positions (np.random.choice samples with replacement by default).
# NOTE(review): the training loop re-draws `rows` on every iteration before using it,
# so the value assigned here is never consumed by training.
rows = np.random.choice(df_review.shape[0], 64)

In [265]:
# Initialise the embedding variables (and optimizer state); re-running this cell
# resets them to fresh random values.
sess.run(tf.global_variables_initializer())

In [266]:
# Mini-batch training: 10,000 optimizer steps, each on 64 randomly drawn reviews.
# NOTE(review): no random seed is set in this cell, so the batches differ between runs.
for i in range(10000):
    # Row positions for this batch (sampled with replacement).
    rows = np.random.choice(df_review.shape[0], 64)
    dfrows = df_review.iloc[rows]
    # Feed the batch's integer ids and star ratings into the graph placeholders.
    fd = {users:dfrows['user_id'].values,
         businesses:dfrows['business_id'].values,
         ratings:dfrows['stars'].values}
    # One optimizer step; also fetch the loss of this batch in the same call.
    _, l2loss = sess.run([opt, loss], fd)
    # Print the single-batch loss every 1000 steps (noisy, since it is one batch only).
    if i % 1000 == 0:
        print(l2loss)

213.85464
191.96246
106.979126
70.572464
68.17617
89.1447
80.94121
37.172424
38.754475
13.047323


In [267]:
# Evaluate the tanh-squashed embedding tensors and keep the results as arrays;
# row i of each array is the vector for integer code i.
user_values, bus_values = sess.run([user_vector, bus_vector])

In [268]:
# Pick an example user by integer code and show the raw user id it stands for.
uid = 115
u_invmap[uid]

'ri7itn7-CdpsaPxTToK5cQ'

In [269]:
# Pick an example business by raw id and translate it to its integer code.
bname = 'cHdJXLlKNWixBXpDwEGb_A'
bid = bus_fmap[bname]

In [270]:
bid

12

In [271]:
# Embedding vector of the example business.
# NOTE(review): the name suggests this business is a Chipotle -- not verifiable here.
chipotle = bus_values[bid]

In [272]:
chipotle

array([ 0.05631671, -0.9999996 ,  0.99999964,  0.99999875, -0.56979346],
      dtype=float32)

In [273]:
# Embedding vector of the example user.
# NOTE(review): the name implies a taste profile that nothing in this notebook checks.
japaneselover = user_values[uid]

In [274]:
bus_values

array([[ 0.309441  ,  0.99999636, -0.9999997 , -0.03689282,  0.9999293 ],
       [ 0.99989724,  0.99419105, -0.99857646, -0.99953157,  0.9925304 ],
       [ 0.9893647 ,  0.8844984 , -0.95377344,  0.9483867 ,  0.5740688 ],
       ...,
       [ 0.2902269 , -0.33366165, -0.7967299 ,  0.80802816, -0.06317686],
       [-0.9701469 , -0.22255358,  0.99214655, -0.94130254,  0.9685952 ],
       [ 0.10152043,  0.64527667,  0.6886667 ,  0.91604906, -0.44579545]],
      dtype=float32)

In [275]:
# Business codes ordered by squared Euclidean distance to the example user's vector,
# closest first (the [None,:] broadcasts the single vector over every business row).
np.square(bus_values - japaneselover[None,:]).sum(1).argsort()

array([ 9872,   523, 11100, ...,  2571,  1882,  1671])

In [276]:
# Business codes ordered by squared Euclidean distance to the example business's
# vector; the business itself comes first because its distance to itself is 0.
np.square(bus_values - chipotle[None,:]).sum(1).argsort()

array([   12, 11899,    91, ...,  1652,   663,  1805])

In [277]:
def closest_businesses_to(business = None, user = None, df = None):
    """
    Rank candidate business vectors by distance to a business or to a user.

    Relies on the notebook-level arrays `bus_values` / `user_values` and the
    lookup dicts `bus_fmap` / `u_fmap` being defined before it is called.

    Inputs: -business: raw business id to measure distances from (optional)
            -user: raw user id to measure distances from (optional); if both
                   `business` and `user` are given, the user takes precedence
            -df: 2-D array of candidate vectors, one per row; defaults to
                 `bus_values` (every business)

    Returns: array of row positions into `df`, sorted by increasing squared
             Euclidean distance to the target vector. With the default `df`
             these positions are the businesses' integer codes; with a custom
             subset they are positions within that subset only.

    Raises: ValueError if neither `business` nor `user` is given
            KeyError if the given id is not in the corresponding lookup dict
    """
    # Fail early with a clear message instead of hitting an unbound `target` below.
    if business is None and user is None:
        raise ValueError('closest_businesses_to needs a business or a user id')

    if business is not None:
        target = bus_values[bus_fmap[business]]
    if user is not None:
        # Deliberately evaluated second so a user overrides a business.
        target = user_values[u_fmap[user]]

    candidates = bus_values if df is None else df

    # target[None, :] broadcasts the single vector across every candidate row.
    squared_distances = np.square(candidates - target[None, :]).sum(1)
    return squared_distances.argsort()

In [278]:
# Candidate subset passed to closest_businesses_to via its `df` argument.
# NOTE(review): this is simply the first 30 rows of bus_values; no location or
# category filter is applied here despite the name -- looks like a placeholder.
midtown_japanese_restaurants = bus_values[:30,:]

In [279]:
closest_businesses_to(business = 'cHdJXLlKNWixBXpDwEGb_A')

array([   12, 11899,    91, ...,  1652,   663,  1805])

In [280]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ')

array([ 9872,   523, 11100, ...,  2571,  1882,  1671])

In [281]:
closest_businesses_to(user = 'ri7itn7-CdpsaPxTToK5cQ', df = midtown_japanese_restaurants)

array([ 4, 18,  3,  9, 21, 26, 13, 11, 24, 27, 12,  8, 22, 10, 28,  6,  5,
       15, 16,  1, 25,  7, 17,  0,  2, 23, 14, 19, 29, 20])

In [284]:
# from surprise.model_selection import train_test_split


In [285]:
# reader = Reader(rating_scale=(0, 5))
# data = Dataset.load_from_df(df_review[[
#     'business_id', 'stars', 'user_id']], reader)
#data2 = Dataset.load_builtin('ml-100k')

In [None]:
# trainset = data.build_full_trainset()

In [None]:
#trainset, testset = train_test_split(data, test_size=.25)

In [None]:
# model = NMF(n_factors = 5, n_epochs = 2)

In [None]:
# modeltrained = model.fit(trainset)

In [None]:
# sim = modeltrained.compute_similarities()

In [None]:
# sim

In [None]:
# import tensorflow as tf

In [None]:
# 3

In [None]:
# ??modeltrained.compute_similarities

In [None]:
# ??NMF

In [None]:
# benchmark = []
# # Iterate over all algorithms
# for algorithm in [NMF()]:
#     # Perform cross validation
#     results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
#     # Get results & append algorithm name
#     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#     benchmark.append(tmp)
    
# pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

In [None]:
#df_review.fillna(df_review.mean(), inplace=True)

In [None]:
#print(df_review.dtypes)