In [1]:
import numpy as np
import pandas as pd
import json 
import matplotlib as plt
%matplotlib inline
import pickle
import operator
from math import sin, cos, sqrt, atan2, radians

### Loading all of the data structures from pickle files

In [10]:
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you produced yourself.

# (user_id, state) -> list of all restaurants in that state
with open('pickle/userStateAllRestaurantdict.pickle', 'rb') as state_pickle:
    userStateAllRestaurantdict = pickle.load(state_pickle)

# user_id -> list of restaurants visited by that user
with open('pickle/userRestaurantDict.pickle', 'rb') as visits_pickle:
    userRestaurantDict = pickle.load(visits_pickle)

### Dict

> **userStateAllRestaurantdict**     
* key : (user_id , state)
* value: [list of all restaurants in that state]


> **userRestaurantDict**
* key : user_id
* value : [list of restaurants visited by that user]

In [33]:
def getDistance(lt1, lg1, lt2, lg2):
    """Return the great-circle distance in km between two points.

    Uses the haversine formula with an approximate Earth radius of 6373 km.

    lt1, lg1 -- latitude and longitude of the first point, in degrees
    lt2, lg2 -- latitude and longitude of the second point, in degrees
    """
    earth_radius_km = 6373.0

    phi1, lam1, phi2, lam2 = [radians(deg) for deg in (lt1, lg1, lt2, lg2)]

    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    # haversine of the central angle between the two points
    hav = sin(delta_phi / 2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

In [12]:
# Function to get restaurants from a state within 'radious' of a given location.
def getNearByRestaurants( lat, lon, radious, state, path='restaurant.json'):
    """Return the restaurant records in `state` that lie within `radious` of (lat, lon).

    lat, lon -- reference point, in degrees
    radious  -- maximum distance, in the unit returned by getDistance (km)
    state    -- value compared against each record's "state" field
    path     -- line-delimited JSON file to scan (one restaurant object per line)

    Returns a list of the parsed JSON objects (dicts) that match.
    Blank lines and records without latitude/longitude are skipped.
    """
    restObjectList = list()
    # `with` guarantees the file is closed even if a line fails to parse
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            restObject = json.loads(line)
            if restObject.get("state") != state:
                continue
            rest_lat = restObject.get('latitude')
            rest_lon = restObject.get('longitude')
            # a null coordinate cannot be converted to radians by getDistance
            if rest_lat is None or rest_lon is None:
                continue
            if getDistance(lat, lon, rest_lat, rest_lon) <= radious:
                restObjectList.append(restObject)
    return restObjectList

In [13]:
def getNotVisitedSet(userId,state):
    """Return the set of restaurants in `state` that `userId` has not visited.

    Looks up (userId, state) in userStateAllRestaurantdict and userId in
    userRestaurantDict. A key missing from either dict is treated as an empty
    collection, so an unknown user/state pair yields an empty set.
    """
    # `or ()` turns a missing key (None) into an empty iterable for set()
    restaurants = set(userStateAllRestaurantdict.get((userId, state)) or ())
    visited = set(userRestaurantDict.get(userId) or ())
    return restaurants - visited

In [14]:
states = ['AZ','BW','EDH','IL','NC','NV','OH','ON','PA','QC','WI']

In [15]:
### Method to get all visited restaurants for a user for given state

def getVisitedFromState(user_id, state):
    """Return, as a list, the restaurants in `state` that `user_id` has visited.

    The result is the intersection of the (user_id, state) entry of
    userStateAllRestaurantdict with the user's entry in userRestaurantDict.
    """
    in_state = set(userStateAllRestaurantdict.get((user_id, state)))
    seen_by_user = set(userRestaurantDict.get(user_id))
    return list(in_state & seen_by_user)

### Building a hashmap of [restaurant_id, (lat,long)]

In [16]:
# business_id -> (latitude, longitude), built from one pass over restaurant.json
restaurantToLocation = dict()
# `with` closes the file even if a line fails to parse
with open('restaurant.json','r') as f:
    for line in f:
        jsonObject = json.loads(line)
        restaurantToLocation[jsonObject.get('business_id')] = (jsonObject.get('latitude'), jsonObject.get('longitude'))

### Now calculate the location for every (user_id, state) key

In [9]:
# First, get all restaurants the user visited in a state, from (user_id, state).
# Merge restaurants into one location if they are closer than a threshold.
# Return the location that collected the largest number of visited restaurants.

In [17]:
# users approx location
# (user_id, state)--> (lat, long)
def getUserLocation(user_id, state, circle=10):
    """Estimate a user's location within a state from the restaurants they visited.

    Each restaurant returned by getVisitedFromState(user_id, state) is merged
    into the first existing cluster whose anchor restaurant is within `circle`
    of it (distance as returned by getDistance); otherwise it starts a new
    cluster anchored at its own coordinates. The grouping is greedy, so it
    depends on the order in which the restaurants are visited by the loop.

    user_id -- user to locate
    state   -- state whose visited restaurants are considered
    circle  -- merge threshold, in the unit returned by getDistance (default 10)

    Returns the (lat, lon) of the anchor of the cluster with the most
    restaurants; ties go to the cluster created first. Returns None when the
    user has no visited restaurant with known coordinates in that state.
    """
    # each cluster is [anchor_lat, anchor_lon, count], kept in creation order
    clusters = []
    for rest_id in getVisitedFromState(user_id, state):
        location = restaurantToLocation.get(rest_id)
        # skip ids missing from restaurantToLocation or stored without coordinates
        if location is None or location[0] is None or location[1] is None:
            continue
        lat, lon = location
        for cluster in clusters:
            if getDistance(lat, lon, cluster[0], cluster[1]) <= circle:
                cluster[2] += 1
                break
        else:
            # no existing cluster is close enough
            clusters.append([lat, lon, 1])
    if not clusters:
        return None
    best = max(clusters, key=lambda cluster: cluster[2])
    return (best[0], best[1])

In [25]:
print getDistance(43.6482347, -79.3795255, 43.6486362, -79.3817439)

0.18404750552


In [12]:
# Show 16 sample (user_id, state) keys with the length of each restaurant list
for i, key in enumerate(userStateAllRestaurantdict.keys(), 1):
    print("%s ----> %s" % (key, len(userStateAllRestaurantdict.get(key))))
    if i > 15:
        break

(u'VLPzREDi_2R0up9dsUct4g', u'NV') ----> 11356
(u'ChTuYk2vSUSQP2H0mLfpEw', u'PA') ----> 4977
(u'eTxVsgUSujTixeZzeR3lJA', u'PA') ----> 4977
(u'qByqJ5q-lo49ohmp9ihsQg', u'IL') ----> 883
(u'zN0yzKhNWULR97tY1nWa8Q', u'WI') ----> 2146
(u'z9JX-It3L3bAkAcjcMsp5A', u'NV') ----> 11356
(u'lEjhjCk3pU64NxgG9YwDMA', u'NV') ----> 11356
(u'JzFUW28qVKJFE9hF8V3MCw', u'NV') ----> 11356
(u'Tk2D_t3PregIambxbil5oA', u'AZ') ----> 16056
(u'4vSwpL6XSvOaJWPT8nyYQg', u'NV') ----> 11356
(u'w6O5aD4F44puF9yS3QBOvQ', u'ON') ----> 17318
(u'm8M1HCzUAGyE7C1T8951Lw', u'AZ') ----> 16056
(u'DhiUUkjSeVdTL8WalNLg6Q', u'ON') ----> 17318
(u'RTi-Sovy8Tm7PJB735m_0w', u'ON') ----> 17318
(u'uPdQLnVf0wDwNW4ScHnA2Q', u'QC') ----> 5970
(u'TF9eVTIQm--1VIQcf9CU2w', u'NV') ----> 11356


In [13]:
# Show 16 sample user ids with the number of restaurants each has visited
for i, key in enumerate(userRestaurantDict.keys(), 1):
    print("%s ----> %s" % (key, len(userRestaurantDict.get(key))))
    if i > 15:
        break

kiQv0q-oOVGEOPa_S_at6w ----> 4
an8MLCN3dYT8J6eSaszWlw ----> 1
TSFDRxY6Fvb8o9ylq-9Jzg ----> 2
Bu1-Nm7zPbWRf14RSCIi6Q ----> 1
yff3Loo9L9gcEDDRxzruTQ ----> 1
509Lnm5epqje7Y5sVk2wMA ----> 1
UB0bwz4k6H-viCSPxl0Y7w ----> 1
jYl1z1AtPuHsrBr8_Twulw ----> 1
LRE39Mt3_FYh1rdlGMvuRg ----> 1
nKYEwQj3ZN9KnISsNyVyTQ ----> 3
BI2D65VWzPWKz4NerdihLg ----> 1
_n9-w5eZJTabiIjvIWkmVQ ----> 2
w2ntisggCzwnu_EuCIZz2g ----> 3
LHSt755HRUHyPY8dn_IdBg ----> 1
j-Goq6j-wqg5rx4BBf8N2g ----> 4
sKi3lekXQFUIOmMKeamcNg ----> 1


In [14]:
# Print the first record of restaurant.json to inspect its schema
with open('restaurant.json','r') as f:
    print(f.readline())

{"business_id": "mLwM-h2YhXl2NCgdS84_Bw", "name": "South Florida Style Chicken & Ribs", "neighborhood": "Eastland", "address": "2824 Milton Rd", "city": "Charlotte", "state": "NC", "postal_code": "28215", "latitude": 35.23687, "longitude": -80.7419759, "stars": 4.5, "review_count": 4, "is_open": 0, "attributes": {"GoodForMeal": {"dessert": false, "latenight": false, "lunch": false, "dinner": false, "breakfast": false, "brunch": false}, "HasTV": false, "RestaurantsGoodForGroups": true, "NoiseLevel": "average", "RestaurantsAttire": "casual", "RestaurantsReservations": false, "OutdoorSeating": false, "BusinessAcceptsCreditCards": false, "RestaurantsPriceRange2": 2, "RestaurantsDelivery": true, "Ambience": {"romantic": false, "intimate": false, "classy": false, "hipster": false, "divey": false, "touristy": false, "trendy": false, "upscale": false, "casual": false}, "RestaurantsTakeOut": true, "GoodForKids": true}, "categories": ["Food", "Soul Food", "Convenience Stores", "Restaurants"], "h

### Restaurant Feature Data Frame

In [15]:
restDf = pd.read_csv('resFeatureDF.csv')

In [16]:
restDf.head()

Unnamed: 0.1,Unnamed: 0,restaurantscounterservice,lunch,cuba,wednesday,street,haiti,coloring,asian,cambodia,...,valet,poland,vietnam,caters,wifi,upscale,karaoke,jukebox,casual,peru
0,wJ-961JWdVhJXhWQf4Jlcw,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,bPDfUbhr5Jab_fPTNTn9oQ,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,9f68hfHK6gNyHWmt9guiJw,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,hXWUEd7I7tmbXdQsfYY5tA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Xe0x73pcUQtVzowF1RC0UA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
(restDf != 0).sum()

Unnamed: 0                   75911
restaurantscounterservice      246
lunch                        17831
cuba                            66
wednesday                     1077
street                       11727
haiti                            9
coloring                         7
asian                          129
cambodia                        33
ireland                        209
acceptsinsurance                52
gluten-free                     24
thailand                      1291
restaurantsgoodforgroups     42734
venezuela                       20
garage                        3093
canada                        1357
latin                          473
byob                            48
caribbean                      509
coatcheck                      860
lot                          22565
drivethru                     2352
sunday                        1181
puertorico                      19
america                       2648
saturday                      4250
portugal            

In [18]:
features = restDf.columns

In [19]:
# Number of non-zero rows in every column
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
cuba ---> 66
wednesday ---> 1077
street ---> 11727
haiti ---> 9
coloring ---> 7
asian ---> 129
cambodia ---> 33
ireland ---> 209
acceptsinsurance ---> 52
gluten-free ---> 24
thailand ---> 1291
restaurantsgoodforgroups ---> 42734
venezuela ---> 20
garage ---> 3093
canada ---> 1357
latin ---> 473
byob ---> 48
caribbean ---> 509
coatcheck ---> 860
lot ---> 22565
drivethru ---> 2352
sunday ---> 1181
puertorico ---> 19
america ---> 2648
saturday ---> 4250
portugal ---> 225
czech ---> 6
taiwan ---> 235
germany ---> 652
belgium ---> 45
dj ---> 1390
drinks ---> 14315
no_music ---> 0
soy-free ---> 6
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
validated ---> 297
hawai ---> 249
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
kosher ---> 4
hipster ---> 952
open24hours ---> 28
perms ---> 4
extensions ---> 6
restaurantsdelivery ---> 10880
mexico ---> 4042
greece ---> 956
curly ---> 5
goodfordancing 

In [20]:
drop_feature_list = ['coloring','honduras','gluten-free','no_music','soy-free','kosher','perms','extensions',
                    'curly','russia','byob','kids','africanamerican','halal','uzbekistan','dairy-free','straightperms']


In [21]:
restDf = restDf.drop(drop_feature_list, axis = 1)

In [22]:
restDf.shape

(75911, 113)

In [23]:
features = restDf.columns

In [24]:
# Number of non-zero rows in every remaining column
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
cuba ---> 66
wednesday ---> 1077
street ---> 11727
haiti ---> 9
asian ---> 129
cambodia ---> 33
ireland ---> 209
acceptsinsurance ---> 52
thailand ---> 1291
restaurantsgoodforgroups ---> 42734
venezuela ---> 20
garage ---> 3093
canada ---> 1357
latin ---> 473
caribbean ---> 509
coatcheck ---> 860
lot ---> 22565
drivethru ---> 2352
sunday ---> 1181
puertorico ---> 19
america ---> 2648
saturday ---> 4250
portugal ---> 225
czech ---> 6
taiwan ---> 235
germany ---> 652
belgium ---> 45
dj ---> 1390
drinks ---> 14315
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
validated ---> 297
hawai ---> 249
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
hipster ---> 952
open24hours ---> 28
restaurantsdelivery ---> 10880
mexico ---> 4042
greece ---> 956
goodfordancing ---> 1725
corkage ---> 140
alcohol ---> 43594
spain ---> 795
happyhour ---> 5761
intimate ---> 567
scandinavian ---> 13
live ---> 1190
busi

In [25]:
south_america = ['cuba','haiti','venezuela','colombia','caribbean','puertorico','hawai','peru','latin']

In [26]:
# south_america is 1 when any column listed in `south_america` is non-zero.
# Testing `!= 0` per column (instead of OR-ing the raw values and comparing to 1)
# stays correct even if a column holds values other than 0/1.
restDf['south_america'] = (restDf[south_america] != 0).any(axis=1).astype(int)


In [27]:
# Refresh the column list, then print the number of non-zero rows per column
features = restDf.columns
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
cuba ---> 66
wednesday ---> 1077
street ---> 11727
haiti ---> 9
asian ---> 129
cambodia ---> 33
ireland ---> 209
acceptsinsurance ---> 52
thailand ---> 1291
restaurantsgoodforgroups ---> 42734
venezuela ---> 20
garage ---> 3093
canada ---> 1357
latin ---> 473
caribbean ---> 509
coatcheck ---> 860
lot ---> 22565
drivethru ---> 2352
sunday ---> 1181
puertorico ---> 19
america ---> 2648
saturday ---> 4250
portugal ---> 225
czech ---> 6
taiwan ---> 235
germany ---> 652
belgium ---> 45
dj ---> 1390
drinks ---> 14315
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
validated ---> 297
hawai ---> 249
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
hipster ---> 952
open24hours ---> 28
restaurantsdelivery ---> 10880
mexico ---> 4042
greece ---> 956
goodfordancing ---> 1725
corkage ---> 140
alcohol ---> 43594
spain ---> 795
happyhour ---> 5761
intimate ---> 567
scandinavian ---> 13
live ---> 1190
busi

In [28]:
restDf = restDf.drop(south_america, axis = 1)

In [29]:
restDf.head()

Unnamed: 0.1,Unnamed: 0,restaurantscounterservice,lunch,wednesday,street,asian,cambodia,ireland,acceptsinsurance,thailand,...,valet,poland,vietnam,caters,wifi,upscale,karaoke,jukebox,casual,south_america
0,wJ-961JWdVhJXhWQf4Jlcw,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,bPDfUbhr5Jab_fPTNTn9oQ,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,9f68hfHK6gNyHWmt9guiJw,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,hXWUEd7I7tmbXdQsfYY5tA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Xe0x73pcUQtVzowF1RC0UA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
restDf.shape

(75911, 105)

In [31]:
# Refresh the column list, then print the number of non-zero rows per column
features = restDf.columns
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
wednesday ---> 1077
street ---> 11727
asian ---> 129
cambodia ---> 33
ireland ---> 209
acceptsinsurance ---> 52
thailand ---> 1291
restaurantsgoodforgroups ---> 42734
garage ---> 3093
canada ---> 1357
coatcheck ---> 860
lot ---> 22565
drivethru ---> 2352
sunday ---> 1181
america ---> 2648
saturday ---> 4250
portugal ---> 225
czech ---> 6
taiwan ---> 235
germany ---> 652
belgium ---> 45
dj ---> 1390
drinks ---> 14315
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
validated ---> 297
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
hipster ---> 952
open24hours ---> 28
restaurantsdelivery ---> 10880
mexico ---> 4042
greece ---> 956
goodfordancing ---> 1725
corkage ---> 140
alcohol ---> 43594
spain ---> 795
happyhour ---> 5761
intimate ---> 567
scandinavian ---> 13
live ---> 1190
businessacceptscreditcards ---> 61629
background_music ---> 1784
dinner ---> 15304
hastv ---> 22330
goodforkids --->

In [32]:
indian_sub = ['india','pakistan','nepal']
rest_asia = ['asian','cambodia','thailand','malaysia','laos','vietnam']
china_nearby = ['taiwan','china','korea','japan']

In [33]:
# Each merged regional flag is 1 when any of its member columns is non-zero.
# Testing `!= 0` per column (instead of OR-ing the raw values and comparing to 1)
# stays correct even if a column holds values other than 0/1.
restDf['indian_sub'] = (restDf[indian_sub] != 0).any(axis=1).astype(int)
restDf['rest_asia'] = (restDf[rest_asia] != 0).any(axis=1).astype(int)
restDf['china_nearby'] = (restDf[china_nearby] != 0).any(axis=1).astype(int)


In [34]:
restDf = restDf.drop(indian_sub, axis = 1)
restDf = restDf.drop(rest_asia, axis = 1)
restDf = restDf.drop(china_nearby, axis = 1)

In [35]:
restDf.shape

(75911, 95)

In [36]:
# Refresh the column list, then print the number of non-zero rows per column
features = restDf.columns
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
wednesday ---> 1077
street ---> 11727
ireland ---> 209
acceptsinsurance ---> 52
restaurantsgoodforgroups ---> 42734
garage ---> 3093
canada ---> 1357
coatcheck ---> 860
lot ---> 22565
drivethru ---> 2352
sunday ---> 1181
america ---> 2648
saturday ---> 4250
portugal ---> 225
czech ---> 6
germany ---> 652
belgium ---> 45
dj ---> 1390
drinks ---> 14315
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
validated ---> 297
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
hipster ---> 952
open24hours ---> 28
restaurantsdelivery ---> 10880
mexico ---> 4042
greece ---> 956
goodfordancing ---> 1725
corkage ---> 140
alcohol ---> 43594
spain ---> 795
happyhour ---> 5761
intimate ---> 567
scandinavian ---> 13
live ---> 1190
businessacceptscreditcards ---> 61629
background_music ---> 1784
dinner ---> 15304
hastv ---> 22330
goodforkids ---> 38752
smoking ---> 7111
ukraine ---> 17
restaurantsattire ---> 448

In [37]:
rest_europe = ['ireland','portugal','czech','germany','belgium','greece','spain','scandinavian','ukraine','hungary','france','england','switzerland','poland']

In [38]:
# rest_europe is 1 when any column listed in `rest_europe` is non-zero.
# Testing `!= 0` per column (instead of OR-ing the raw values and comparing to 1)
# stays correct even if a column holds values other than 0/1.
restDf['rest_europe'] = (restDf[rest_europe] != 0).any(axis=1).astype(int)


In [39]:
restDf = restDf.drop(rest_europe, axis = 1)

In [40]:
restDf.shape

(75911, 82)

In [41]:
# Refresh the column list, then print the number of non-zero rows per column
features = restDf.columns
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
wednesday ---> 1077
street ---> 11727
acceptsinsurance ---> 52
restaurantsgoodforgroups ---> 42734
garage ---> 3093
canada ---> 1357
coatcheck ---> 860
lot ---> 22565
drivethru ---> 2352
sunday ---> 1181
america ---> 2648
saturday ---> 4250
dj ---> 1390
drinks ---> 14315
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
validated ---> 297
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
hipster ---> 952
open24hours ---> 28
restaurantsdelivery ---> 10880
mexico ---> 4042
goodfordancing ---> 1725
corkage ---> 140
alcohol ---> 43594
happyhour ---> 5761
intimate ---> 567
live ---> 1190
businessacceptscreditcards ---> 61629
background_music ---> 1784
dinner ---> 15304
hastv ---> 22330
goodforkids ---> 38752
smoking ---> 7111
restaurantsattire ---> 44801
byappointmentonly ---> 528
video ---> 190
friday ---> 4160
noiselevel ---> 40114
classy ---> 899
middleeast ---> 2344
res ---> 74347
tuesday ---> 

In [35]:
# Drop the seven per-weekday columns.
# NOTE(review): the recorded output of this cell is a NameError ("restDf is not defined")
# and its execution count (35) is out of sequence, so the saved output is stale.
# The shape (75911, 71) shown further down is consistent with this drop having been
# applied (82 - 7 weekdays - 5 parking + 1 has_parking) -- TODO re-run the notebook in order.
feature_drop_list_days = ['monday','tuesday','wednesday','thursday','friday','saturday','sunday']
restDf = restDf.drop(feature_drop_list_days, axis = 1)

NameError: name 'restDf' is not defined

In [43]:
parking_drop_list = ["garage",'street','validated','lot','valet']


In [44]:
# has_parking is 1 when any column listed in `parking_drop_list` is non-zero.
# Testing `!= 0` per column (instead of OR-ing the raw values and comparing to 1)
# stays correct even if a column holds values other than 0/1.
restDf['has_parking'] = (restDf[parking_drop_list] != 0).any(axis=1).astype(int)

In [45]:
restDf = restDf.drop(parking_drop_list, axis = 1)

In [46]:
# Refresh the column list, then print the number of non-zero rows per column
features = restDf.columns
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
acceptsinsurance ---> 52
restaurantsgoodforgroups ---> 42734
canada ---> 1357
coatcheck ---> 860
drivethru ---> 2352
america ---> 2648
dj ---> 1390
drinks ---> 14315
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
hipster ---> 952
open24hours ---> 28
restaurantsdelivery ---> 10880
mexico ---> 4042
goodfordancing ---> 1725
corkage ---> 140
alcohol ---> 43594
happyhour ---> 5761
intimate ---> 567
live ---> 1190
businessacceptscreditcards ---> 61629
background_music ---> 1784
dinner ---> 15304
hastv ---> 22330
goodforkids ---> 38752
smoking ---> 7111
restaurantsattire ---> 44801
byappointmentonly ---> 528
video ---> 190
noiselevel ---> 40114
classy ---> 899
middleeast ---> 2344
res ---> 74347
bikeparking ---> 37092
brunch ---> 3552
outdoorseating ---> 20448
restaurantsreservations ---> 18498
latenight ---> 2478
wheelchairaccessible ---> 24744
to

In [47]:
restDf.shape


(75911, 71)

In [48]:
music_drop_list = ["dj",'background_music','karaoke','live','video','jukebox']
# has_music is 1 when any music-related column is non-zero.
# Testing `!= 0` per column (instead of OR-ing the raw values and comparing to 1)
# stays correct even if a column holds values other than 0/1.
restDf['has_music'] = (restDf[music_drop_list] != 0).any(axis=1).astype(int)

# the individual music columns are replaced by the merged flag
restDf = restDf.drop(music_drop_list, axis = 1)

In [49]:
# Refresh the column list, then print the number of non-zero rows per column
features = restDf.columns
for feature in features:
    nonzero_rows = ( restDf[feature] != 0 ).sum()
    print("%s ---> %s" % (feature, nonzero_rows))

Unnamed: 0 ---> 75911
restaurantscounterservice ---> 246
lunch ---> 17831
acceptsinsurance ---> 52
restaurantsgoodforgroups ---> 42734
canada ---> 1357
coatcheck ---> 860
drivethru ---> 2352
america ---> 2648
drinks ---> 14315
restaurantspricerange2 ---> 66128
vegan ---> 121
vegetarian ---> 72
divey ---> 1085
agesallowed ---> 261
bar ---> 17998
hipster ---> 952
open24hours ---> 28
restaurantsdelivery ---> 10880
mexico ---> 4042
goodfordancing ---> 1725
corkage ---> 140
alcohol ---> 43594
happyhour ---> 5761
intimate ---> 567
businessacceptscreditcards ---> 61629
dinner ---> 15304
hastv ---> 22330
goodforkids ---> 38752
smoking ---> 7111
restaurantsattire ---> 44801
byappointmentonly ---> 528
noiselevel ---> 40114
classy ---> 899
middleeast ---> 2344
res ---> 74347
bikeparking ---> 37092
brunch ---> 3552
outdoorseating ---> 20448
restaurantsreservations ---> 18498
latenight ---> 2478
wheelchairaccessible ---> 24744
touristy ---> 211
italy ---> 4527
romantic ---> 518
dogsallowed ---> 205

In [50]:
restDf.shape

(75911, 66)

In [52]:
# Drop 16 more feature columns in one step.
# NOTE(review): the name suggests these were picked by a count threshold; the columns
# whose counts are visible in the printed output above all have fewer than 1,000
# non-zero rows, but the exact cut-off is not recorded anywhere -- TODO confirm.
drop_list_threshold1 = ['restaurantscounterservice','acceptsinsurance','coatcheck','vegan','vegetarian',
                        'agesallowed','open24hours','corkage','intimate','byappointmentonly','touristy',
                        'romantic','businessacceptsbitcoin','upscale','ethiopia','africa']
restDf = restDf.drop(drop_list_threshold1, axis = 1)

In [53]:
combine_canada_america = ['canada','america']
# canada_america is 1 when either column is non-zero.
# Testing `!= 0` per column (instead of OR-ing the raw values and comparing to 1)
# stays correct even if a column holds values other than 0/1.
restDf['canada_america'] = (restDf[combine_canada_america] != 0).any(axis=1).astype(int)
# the two source columns are replaced by the merged flag
restDf = restDf.drop(combine_canada_america, axis = 1)

In [54]:
restDf.shape

(75911, 49)

## Analysis of collaborative filtering

In [None]:
userid 				 business_id 			 predicted_rating 	 actual_rating

aiXFgj1oU-I3idtifgLRpA 		9xTx4vWlL2kkhUqNE4QaaQ 		4.94454922081 		4.5
aiXFgj1oU-I3idtifgLRpA 		aRvCyCpwxgIqjXSSZ8pt3g 		4.93775708087 		5.0
aiXFgj1oU-I3idtifgLRpA 		8XVAcOZ02IGTQE-pvKhXXw 		4.89796142416 		5.0
aiXFgj1oU-I3idtifgLRpA 		ixfpsy7M6vLAe0Xf-EWH4g 		4.89510587114 		5.0
aiXFgj1oU-I3idtifgLRpA 		WSZL9uQ9JMOrrulMKEl7Tw 		4.87333192135 		4.5
aiXFgj1oU-I3idtifgLRpA 		KPSaUzlYg4JLVbzrbA23uw 		4.86978506394 		4.5

In [134]:
test_user_id = 'aiXFgj1oU-I3idtifgLRpA'
# Scan user.json for the test user and print every attribute of that record.
# `with` closes the file even if a line fails to parse.
with open('user.json','r') as f:
    for line in f:
        jsonObject = json.loads(line)
        if jsonObject.get('user_id') == test_user_id:
            for attribute, value in jsonObject.items():
                print("%s ----> %s" % (attribute, value))
            # stop at the first matching record
            break

yelping_since ----> 2006-12-29
useful ----> 360
compliment_photos ----> 1
compliment_list ----> 0
compliment_funny ----> 5
funny ----> 45
review_count ----> 87
friends ----> [u'wA7QXD9tzUoUWNdwJROlGQ', u'JnA41k0eYeauhb3r3qGlNA', u'dBDNhT3QrMk0AkD-qa_hhw', u'V3O6KszIei-jTAKn9ulPag', u'bWk7c4oFdOVEhVe6FxkIBQ', u'JbK4QNcWCG_7izzW7rinzw', u'w2WTgiok2JJkZaApCw9FCg', u'_rAj8TYvXRH69ymOaNtFMg', u'L9eVgCM-e1lhrwcs82fd3g', u'fjTqua61Lh1PJs_F29IUoQ', u'ucal95umwk86-U8hdAkVpQ', u'8zWLCcLb1jCg27ibbQd-TQ', u'ZMZ8BtpBpCsJKfPvGvCvHg', u'iNVyzq9-Uu5e5ROiBb90nA', u'MwbVIWQNAtO6ZUVj0XCYZQ', u'C4NqiRTC0gDy8OoUfecVYQ', u'WKSY5_w7PhdX0XK0itKuRQ', u'lmUNha6pP99LTjCz8S0uGA', u'1562m-MUmax5bXW-M2c6HA', u'lvNm9qFeWw99OlTVyqUJmw', u'lxr9ilkkwH8rqMMBxttpmg', u'lOcY9Vlv0lYHies660r2Lg', u'eV2Ny_84E0Fp4z-QliRJrg', u'C_l-GUxlJJzfQGYdSeENzQ', u'THRw0P2cpCSOuUwBVthEtQ', u'l8gUDVwVvlHt1fAZw8YU7A', u'7Efjs3sc_rBX4DSmgIueSg', u'vcaeibpNGH2llQBXIo12Hg', u'9-n--WRL-gTBF6uNr0hHrQ', u'3lVvwcC9Jkxqmph66gUV5w', u'geVD5khJ-Ze77

In [145]:
test_user_id = 'aiXFgj1oU-I3idtifgLRpA'
visited_rest_list = userRestaurantDict.get(test_user_id)
visited_restaurant_objects = list()

# Set for constant-time membership tests; `or ()` keeps an unknown user
# (visited_rest_list is None) from raising on the `in` check below.
visited_rest_ids = set(visited_rest_list or ())

# Collect the full restaurant.json record of every visited restaurant and write
# each one to a text file. Both files are closed when the block exits, even if
# a line fails to parse.
with open("test_user_visited_rests.txt", "w") as text_file, open('restaurant.json','r') as f:
    for line in f:
        jsonObject = json.loads(line)
        if jsonObject.get('business_id') in visited_rest_ids:
            visited_restaurant_objects.append(jsonObject)
            text_file.write(str(jsonObject)+"\n")

In [136]:
# business ids of the six predictions listed in the collaborative-filtering table above
rest_list = ['9xTx4vWlL2kkhUqNE4QaaQ','aRvCyCpwxgIqjXSSZ8pt3g','8XVAcOZ02IGTQE-pvKhXXw',
            'ixfpsy7M6vLAe0Xf-EWH4g','WSZL9uQ9JMOrrulMKEl7Tw','KPSaUzlYg4JLVbzrbA23uw']
# Print every attribute of each of those restaurants, with a separator line after each.
# `with` closes the file even if a line fails to parse.
with open('restaurant.json','r') as f:
    for line in f:
        jsonObject = json.loads(line)
        if jsonObject.get('business_id') in rest_list:
            for attribute, value in jsonObject.items():
                print("%s ----> %s" % (attribute, value))
            print("===========================================================")

city ----> Toronto
neighborhood ----> Seaton Village
name ----> Volta Espresso
business_id ----> aRvCyCpwxgIqjXSSZ8pt3g
longitude ----> -79.4120512
hours ----> {u'Monday': u'7:00-17:00', u'Tuesday': u'7:00-17:00', u'Friday': u'7:00-17:00', u'Wednesday': u'7:00-17:00', u'Thursday': u'7:00-17:00', u'Sunday': u'7:00-17:00', u'Saturday': u'7:00-17:00'}
state ----> ON
postal_code ----> M5R 3G3
categories ----> [u'Restaurants', u'Cafes', u'Coffee & Tea', u'Food']
stars ----> 5.0
address ----> 866 Bathurst Street
latitude ----> 43.6669766
review_count ----> 25
attributes ----> {u'GoodForMeal': {u'dessert': False, u'latenight': False, u'lunch': False, u'dinner': False, u'brunch': False, u'breakfast': False}, u'BusinessParking': {u'garage': False, u'street': True, u'validated': False, u'lot': False, u'valet': False}, u'HasTV': False, u'GoodForKids': True, u'WiFi': u'free', u'RestaurantsAttire': u'casual', u'RestaurantsReservations': False, u'OutdoorSeating': True, u'RestaurantsPriceRange2': 1, 

In [135]:
# Print the first record of restaurant.json to inspect its schema
with open('restaurant.json','r') as f:
    print(f.readline())

{"business_id": "mLwM-h2YhXl2NCgdS84_Bw", "name": "South Florida Style Chicken & Ribs", "neighborhood": "Eastland", "address": "2824 Milton Rd", "city": "Charlotte", "state": "NC", "postal_code": "28215", "latitude": 35.23687, "longitude": -80.7419759, "stars": 4.5, "review_count": 4, "is_open": 0, "attributes": {"GoodForMeal": {"dessert": false, "latenight": false, "lunch": false, "dinner": false, "breakfast": false, "brunch": false}, "HasTV": false, "RestaurantsGoodForGroups": true, "NoiseLevel": "average", "RestaurantsAttire": "casual", "RestaurantsReservations": false, "OutdoorSeating": false, "BusinessAcceptsCreditCards": false, "RestaurantsPriceRange2": 2, "RestaurantsDelivery": true, "Ambience": {"romantic": false, "intimate": false, "classy": false, "hipster": false, "divey": false, "touristy": false, "trendy": false, "upscale": false, "casual": false}, "RestaurantsTakeOut": true, "GoodForKids": true}, "categories": ["Food", "Soul Food", "Convenience Stores", "Restaurants"], "h

In [147]:
def extract_star(json):
    """Sort key: return the 'stars' value of a parsed JSON record as a float.

    json -- a dict-like record (the parameter name shadows the json module
            inside this function only; it is kept so existing callers still work)

    Returns 0 when the record has no 'stars' value or the value cannot be
    converted to a number.
    """
    try:
        # float, not int: int() would truncate 4.5 to 4 and make half-star
        # ratings tie with the whole star below them when sorting.
        return float(json.get('stars'))
    except (KeyError, TypeError, ValueError):
        # .get() yields None for a missing key, which float() rejects with TypeError
        return 0

# Sort the visited restaurants in place, highest extract_star value first.
visited_restaurant_objects.sort(key=extract_star, reverse=True)

In [148]:
# Write the sorted restaurant records to a text file, separated by blank lines.
# `with` closes the file even if a write fails.
with open("test_user_visited_sorted_stars.txt", "w") as text_file:
    for jsonObject in visited_restaurant_objects:
        text_file.write(str(jsonObject)+"\n\n\n")

## list of reviews made by test_user

In [149]:
# Collect every review in review.json that was written by the test user.
review_obj_list = list()
count = 0  # not read anywhere in the visible notebook
# `with` closes the file even if a line fails to parse
with open('review.json','r') as f:
    for line in f:
        jsonObject = json.loads(line)
        if jsonObject.get('user_id') == test_user_id:
            review_obj_list.append(jsonObject)

In [151]:
review_obj_list.sort(key=extract_star, reverse=True)

In [152]:
# Write the test user's reviews to a text file, separated by blank lines.
# `with` closes the file even if a write fails.
with open("reviews_by_test_user.txt", "w") as text_file:
    for jsonObject in review_obj_list:
        text_file.write(str(jsonObject)+"\n\n\n")