In [2]:
import os
import io
import json
import gzip
from collections import defaultdict
#default dict if you access a random nonexisting key, it will return a 0 instead of generating an error
import numpy as np
import random
#from google.colab import drive
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
#users not likely to review lukewarm experiences on business
#users not likely to review bad businesses
business_df_filepath = 'drive/My Drive/GoogleLocal/places.clean.json.gz'
users_df_filepath = 'drive/My Drive/GoogleLocal/users.clean.json.gz'

In [4]:
#Read in zipped json files
def readGz(fname):
    gz = gzip.open(fname, 'rb')
    f = io.BufferedReader(gz)
    data = []
    
    for l in f.readlines():
        data.append(eval(l))
        
    gz.close()
    return data
#     for l in gzip.open(f):
#         yield eval(l)
    
#Full Data Sets    
#business_df = readGz(business_df_filepath)
#users_df = readGz(users_df_filepath)

In [6]:
#Work with sample of data due to memory limitations
#Local directory
reviews_samp_df = pd.read_json('C:/Users/buwen/Documents/sample.reviews.json')

In [8]:
#Get a list of people who have written more than 5 reviews
filtered_names = reviews_samp_df['reviewerName'].value_counts()[reviews_samp_df['reviewerName'].value_counts() > 5].index.tolist()

In [9]:
#Subet data for the people who have written more than 5 reviews
filtered_df = reviews_samp_df.loc[reviews_samp_df['reviewerName'].isin(filtered_names)]

In [14]:
#In order to make similarity matrix, we remove duplicate names and take the maximum rating of the duplicates. 
cleaned_df = filtered_df.groupby(['gPlusPlaceId', 'reviewerName']).max()['rating'].reset_index()

In [16]:
cleaned_df.head()

Unnamed: 0,gPlusPlaceId,reviewerName,rating
0,1.000009e+20,Athena Chang,4
1,1.000012e+20,Michael W.,4
2,1.000017e+20,Joseph Lee,1
3,1.000021e+20,Kevin Smith,1
4,1.000023e+20,Sylvain DRUELLE,5


In [17]:
#Pivot the dataframe to have the businesses be the columns and the users to be the observations and fill the values
#with the ratings 
pivoted_df = cleaned_df.pivot(index = "reviewerName", columns = "gPlusPlaceId", values = "rating").fillna(0)

In [19]:
pivoted_df.head()

gPlusPlaceId,1.0000085086337006e+20,1.0000119333884451e+20,1.000017139460348e+20,1.0000214164783343e+20,1.0000230763784749e+20,1.0000238528695655e+20,1.0000241406365308e+20,1.0000275999952539e+20,1.0000278495849454e+20,1.0000311449986875e+20,...,1.1844290647923281e+20,1.1844433322236784e+20,1.1844497328533448e+20,1.1844498470531131e+20,1.1844506475157212e+20,1.1844513412139749e+20,1.1844581880995194e+20,1.184462207652354e+20,1.1844639133267388e+20,1.1844663719467372e+20
reviewerName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Google User,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Smith,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AHsuan Chen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALEXANDER PRIME,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALOK KUMAR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
cosine_sim = 1 - pairwise_distances(pivoted_df, metric = 'cosine')

In [28]:
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [29]:
#Create cosine simliarity matrix 
cos_sim_df = cosine_similarity(pivoted_df)

In [30]:
cos_sim_df = pd.DataFrame(cos_sim_df)

In [31]:
#Check the length of cosine similarity matrix
len(cos_sim_df)

3937

In [32]:
cos_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3927,3928,3929,3930,3931,3932,3933,3934,3935,3936
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
#Function that finds similar rows 
def find_similar_rows(dataframe):
  index_list = []
  for index in dataframe.columns:
    if len(dataframe.loc[(dataframe[index] > 0) & dataframe[index] != 1]) > 1:
      index_list.append(index)
  return index_list

In [34]:
#Function that uses kNN to find the 3 nearest neighbors (3 similar users)
def findsimilar(user_id, ratings, metric = 'cosine', k=3):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute') 
    model_knn.fit(ratings)

    distances, indices = model_knn.kneighbors(ratings.iloc[user_id-1, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1-distances.flatten()
    print (('{0} most similar users for User {1}:\n').format(k,user_id))
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i]+1 == user_id:
            continue;

        else:
            print (('{0}: User {1}, with similarity of {2}').format(i, indices.flatten()[i]+1, similarities.flatten()[i]))
            
    return similarities,indices

In [35]:
#Test: Find 3 similar users for a specified user using the kNN function written above
similarities,indices = findsimilar(120,test)

3 most similar users for User 120:

1: User 391, with similarity of 0.22958996631814121
2: User 984, with similarity of 0.015250371939424023
3: User 2648, with similarity of 0.006343127991661146


In [36]:
#Function that recommends similar businesses 
def recommend_similar_businesses(similarities, indices):
  similar_user_counter = 0
  if similarities[1] > .20:
    correct_user_index = indices[0][1] - 1 
    google_user_name = pivoted_df.iloc[correct_user_index].name
    #finding list of business IDs
    recommended_businesses = cleaned_df.loc[(cleaned_df['reviewerName'] == google_user_name) & (cleaned_df['rating'] >= 4)]['gPlusPlaceId'].tolist()
    
    #finding business categories
    return_df = filtered_df.loc[filtered_df['gPlusPlaceId'].isin(recommended_businesses)][['gPlusPlaceId', 'categories']]
    similar_user_counter +=1 
    if similarities[2] > .20:
        correct_user_index = indices[0][2] - 1 
        google_user_name = pivoted_df.iloc[correct_user_index].name
        #finding list of business IDs
        recommended_businesses = cleaned_df.loc[(cleaned_df['reviewerName'] == google_user_name) & (cleaned_df['rating'] >= 4)]['gPlusPlaceId'].tolist()

        #finding business categories
        return_df2 = filtered_df.loc[filtered_df['gPlusPlaceId'].isin(recommended_businesses)][['gPlusPlaceId', 'categories']]
        return_df = pd.concat([preturn_df, return_df2])
        similar_user_counter +=1 
        return return_df
    else:
        return return_df
  else:
    print('No Similar Businesses Found')

In [41]:
#Test business recommending function 
test = recommend_similar_businesses(similarities, indices)

In [43]:
test 

Unnamed: 0,gPlusPlaceId,categories
4845,1.11385e+20,"[Fish & Chips Restaurant, Fast Food Restaurant..."
13482,1.150198e+20,"[Sushi Restaurant, Japanese Restaurant, Takeou..."
111994,1.129165e+20,"[Vietnamese Restaurant, Asian Restaurant, Sout..."
239870,1.119537e+20,[Steak House]
330633,1.142326e+20,"[Cave, Tourist Attraction]"


In [53]:
#Grab a random user's name
google_user_name = pivoted_df.iloc[119].name

In [54]:
google_user_name

'Alex Henderson'

In [63]:
#Create dataframe that has all the reviews that are  or 5 stars from this particular user 
test2 = filtered_df.loc[(filtered_df['reviewerName'] == google_user_name) & (filtered_df['rating'] > 3)]

In [64]:
test2

Unnamed: 0,categories,gPlusPlaceId,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime
161073,"[Gastropub, Wine Bar, American Restaurant]",1.147667e+20,1.062486e+20,5,,"May 5, 2012",Alex Henderson,1336249000.0
179930,[Pub],1.156924e+20,1.062486e+20,5,"Really great cocktails, served by attentive an...","Aug 28, 2013",Alex Henderson,1377693000.0
191816,[Restaurant],1.038538e+20,1.062486e+20,5,"Food almost as good as Truffles next door, but...","Nov 2, 2010",Alex Henderson,1288693000.0
285652,[Bar],1.075091e+20,1.062486e+20,4,"Imaginative cocktails, and a lively place. A b...","Aug 28, 2013",Alex Henderson,1377693000.0
291200,"[Italian Restaurant, European Restaurant, Bar]",1.094746e+20,1.062486e+20,5,This place is fantastic. They are so focussed ...,"May 14, 2011",Alex Henderson,1305402000.0
376451,"[SCUBA Instructor, SCUBA Tour Agency, Vacation...",1.132213e+20,1.062486e+20,5,These wonderful people looked after us during ...,"Oct 3, 2011",Alex Henderson,1317708000.0
464362,"[Asian Restaurant, South Asian Restaurant, Ind...",1.097801e+20,1.062486e+20,5,"Awesome. Delicious, imaginative, tasty, subtle...","Dec 17, 2010",Alex Henderson,1292615000.0
487810,[Cafes and Snack Bars],1.053797e+20,1.062486e+20,4,"Interesting music/gigs, good range of (largely...","Nov 2, 2010",Alex Henderson,1288694000.0


In [65]:
#Explode the list array in categories to create a tall dataframe
tall_1 = pd.concat([pd.Series(row['gPlusPlaceId'], row['categories']) for _, row in test2.iterrows()]).reset_index()

In [66]:
tall_1

Unnamed: 0,index,0
0,Gastropub,1.147667e+20
1,Wine Bar,1.147667e+20
2,American Restaurant,1.147667e+20
3,Pub,1.156924e+20
4,Restaurant,1.038538e+20
5,Bar,1.075091e+20
6,Italian Restaurant,1.094746e+20
7,European Restaurant,1.094746e+20
8,Bar,1.094746e+20
9,SCUBA Instructor,1.132213e+20


In [68]:
#Explode list array in categories to create tall dataframe 
tall_2 = pd.concat([pd.Series(row['gPlusPlaceId'], row['categories']) for _, row in test.iterrows()]).reset_index()

In [69]:
tall_2

Unnamed: 0,index,0
0,Fish & Chips Restaurant,1.11385e+20
1,Fast Food Restaurant,1.11385e+20
2,Fish and Chips Takeaway,1.11385e+20
3,Sushi Restaurant,1.150198e+20
4,Japanese Restaurant,1.150198e+20
5,Takeout Restaurant,1.150198e+20
6,Vietnamese Restaurant,1.129165e+20
7,Asian Restaurant,1.129165e+20
8,Southeast Asian Restaurant,1.129165e+20
9,Steak House,1.119537e+20


In [71]:
#This function checks to see how similar the categories are
#Then it stores the businesses that have high cosine similarity in a list
similar_list = []
similar_category = []
tfidf_vectorizer = TfidfVectorizer()
for category_1 in tall_1['index']:
  for category_2 in tall_2['index']:
    string_list = [category_1, category_2]
    tfidf_matrix = tfidf_vectorizer.fit_transform(string_list)
    result_cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    if (result_cos[0][1] > .25):
      bus_id = tall_1.loc[tall_1['index'] == category_1][0].item()
      if (bus_id in similar_list):
        break
      else:
        similar_list.append(bus_id)

In [60]:
similar_list

[1.1476669018661673e+20,
 1.0385381981631395e+20,
 1.0947462732080633e+20,
 1.0978007964337838e+20]

In [61]:
#Show details about the businesses in the "similar list"
filtered_df.loc[filtered_df['gPlusPlaceId'].isin(similar_list)]

Unnamed: 0,categories,gPlusPlaceId,gPlusUserId,rating,reviewText,reviewTime,reviewerName,unixReviewTime
161073,"[Gastropub, Wine Bar, American Restaurant]",1.147667e+20,1.062486e+20,5,,"May 5, 2012",Alex Henderson,1336249000.0
191816,[Restaurant],1.038538e+20,1.062486e+20,5,"Food almost as good as Truffles next door, but...","Nov 2, 2010",Alex Henderson,1288693000.0
291200,"[Italian Restaurant, European Restaurant, Bar]",1.094746e+20,1.062486e+20,5,This place is fantastic. They are so focussed ...,"May 14, 2011",Alex Henderson,1305402000.0
464362,"[Asian Restaurant, South Asian Restaurant, Ind...",1.097801e+20,1.062486e+20,5,"Awesome. Delicious, imaginative, tasty, subtle...","Dec 17, 2010",Alex Henderson,1292615000.0
486720,"[Gastropub, Wine Bar, American Restaurant]",1.147667e+20,1.006018e+20,4,,"Oct 1, 2012",Ben Wallace,1349112000.0
