# Create a CSV file for business features using web code

In [104]:
import os
import sys
import numpy as np
import json
import matplotlib.pylab as plt
import pandas as pd
import getpass
%matplotlib inline 
sys.path.append("..") # go to parent dir
import Code.DataReviewFunctions as DataReviewUtils
import Code.DataCollect as DataCollectUtils
from __future__ import print_function
from tqdm import tqdm
import unicodecsv as csv
import ast
from collections import defaultdict
from datetime import datetime

Path Locations

In [105]:
username = getpass.getuser()
if username == 'sdekel':
    data_path = 'C:\DataSets\yelp_dataset\dataset_restaurants'
    data_path_to_save = 'C:\DataSets\yelp_dataset\processed_data'
elif username == 'zahi':
    data_path = 'D:\\users\\zahi\\OneDrive - Bar-Ilan University\\yelp_dataset_challenge_round9\\yelp2'
    data_path_to_save = '' ##TODO: Add your path for the processed data
else:
    data_path = ''  ## TODO: PUT YOUR DATA PATH HERE

Dataset Names

In [106]:
business_data_name = 'yelp_academic_dataset_business_restaurants.json'
business_data_name_to_save = 'yelp_academic_dataset_business_restaurants.csv'

In [107]:
business_fullpath = os.path.join( data_path, business_data_name )
business_fullpath_to_save = os.path.join( data_path_to_save, business_data_name_to_save )

In [108]:
data_business = pd.read_json(business_fullpath, lines=True )

In [109]:
data_business.columns

Index(['address', 'attributes', 'business_id', 'categories', 'city', 'hours',
       'is_open', 'latitude', 'longitude', 'name', 'neighborhood',
       'postal_code', 'review_count', 'stars', 'state', 'type'],
      dtype='object')

In [110]:
data_business.head(2)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,type
0,979 Bloor Street W,"[Alcohol: none, Ambience: {'romantic': False, ...",EDqCEAGXVGCH4FJXgqtjqg,"[Restaurants, Pizza, Chicken Wings, Italian]",Toronto,"[Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ...",1,43.661054,-79.429089,Pizza Pizza,Dufferin Grove,M6H 1L5,7,2.5,ON,business
1,11072 No Frank Lloyd Wright,"[Alcohol: none, Ambience: {'romantic': False, ...",GDnbt3isfhd57T1QqU6flg,"[Tex-Mex, Mexican, Fast Food, Restaurants]",Scottsdale,"[Monday 10:0-22:0, Tuesday 10:0-22:0, Wednesda...",1,33.58671,-111.83541,Taco Bell,,85259,9,2.5,AZ,business


In [111]:
print("Reading data from JSON...")
data = list()
with open(business_fullpath,encoding="utf8") as f:
    for line in f:
        data.append(json.loads(line))

Reading data from JSON...


In [112]:
print("Calculating all categories represented in the data...")
all_categories = []
for row in data:
    if row['categories']:
        all_categories += row['categories']
all_categories = set(all_categories)
print( 'Number of categories %d' % len(all_categories))

Calculating all categories represented in the data...
Number of categories 607


In [113]:
print("Calculating all attributes represented in the data...")
all_attributes = []
attributes_with_subattributes = defaultdict(lambda: set())
for row in data:
    if row['attributes']:
        for attribute_string in row['attributes']:
            attribute = attribute_string.split(':')[0]
            value = ':'.join(attribute_string.split(':')[1:])
            value = value.strip()
            all_attributes.append(attribute)
            if '{' in value:
                attribute_data = ast.literal_eval(value)
                for subattribute, v in attribute_data.items():
                    attributes_with_subattributes[attribute].add(subattribute)                
all_attributes = set(all_attributes)
print("Found these attributes with sub-attributes:")
for attribute, subattributes in attributes_with_subattributes.items():
    print('%s: %s' % (attribute, ', '.join(subattributes)))

Calculating all attributes represented in the data...
Found these attributes with sub-attributes:
Ambience: romantic, divey, classy, intimate, touristy, trendy, upscale, hipster, casual
BusinessParking: lot, garage, validated, valet, street
GoodForMeal: latenight, dessert, dinner, brunch, breakfast, lunch
BestNights: sunday, monday, saturday, wednesday, tuesday, friday, thursday
Music: background_music, no_music, video, jukebox, live, karaoke, dj
DietaryRestrictions: kosher, halal, vegan, gluten-free, dairy-free, vegetarian, soy-free
HairSpecializesIn: perms, africanamerican, asian, coloring, kids, straightperms, extensions, curly


In [114]:
print("Writing attributes to file...")
with open('yelp_academic_dataset_business-attributes.txt', 'w') as f:
    f.write('\n'.join(sorted(list(all_attributes))))

print("Writing categories to file...")
with open('yelp_academic_dataset_business-categories.txt', 'w') as f:
    f.write('\n'.join(sorted(list(all_categories))))

Writing attributes to file...
Writing categories to file...


In [115]:
# differnt between times example
def DifferntBetweenTimes(data_in):
    time_str = data_in['hours'][1][0].split( ' ')[1]
    time_splt = time_str.split('-')
    s0 = time_splt[0]
    s1 = time_splt[1]
    FMT = '%H:%M'
    tdelta = datetime.strptime(s1, FMT) - datetime.strptime(s0, FMT)
    return tdelta

tdelta = DifferntBetweenTimes(data_business)
print('time1: %s , time2: %s, delta = %f' % (s1,s0, tdelta.seconds/3600.0))

time1: 20:0 , time2: 11:0, delta = 12.000000


In [116]:
print("Processing data...")
new_data = list()
FMT = '%H:%M'
for row in tqdm(data):
    new_row = {}
    for k, v in row.items():
        if k not in ['hours', 'categories', 'attributes']:
            new_row[k] = v
        else:
            if k == 'hours' and row['hours']:
                for hour in row['hours']:
                    day, time_string = hour.split(' ')
                    time_splt = time_string.split('-')
                    s0 = time_splt[0]
                    s1 = time_splt[1]  
                    tdelta = datetime.strptime(s1, FMT) - datetime.strptime(s0, FMT)
                    new_row['hours_' + day] = tdelta.seconds/3600.0
            elif k == 'attributes' and row['attributes']:
                attributes = defaultdict(lambda: dict())
                for attribute_string in row['attributes']:
                    attribute = attribute_string.split(':')[0]
                    value = ':'.join(attribute_string.split(':')[1:])
                    value = value.strip()
                    if '{' in value:
                        data = ast.literal_eval(value)
                        for subattribute, subattribute_value in data.items():
                            attributes[attribute][subattribute] = subattribute_value
                    else:
                        attributes[attribute] = value
                for attribute in all_attributes:
                    if attribute in attributes_with_subattributes:
                        for subattribute in attributes_with_subattributes[attribute]:
                            subattributes = attributes.get(attribute, dict())
                            new_row['attribute_' + attribute + '_' + subattribute] = subattributes.get(subattribute, False)
                    else:
                        new_row['attribute_' + attribute] = attributes.get(attribute, False)
            elif k == 'categories' and row['categories']:
                new_row['categories'] = ', '.join(sorted(row['categories']))
#                for category in all_categories:
#                    new_row['category_' + category] = (category in row['categories'])
    new_data.append(new_row)

Processing data...


100%|██████████████████████████████████| 48485/48485 [00:16<00:00, 3011.49it/s]


In [117]:
print("Writing data to CSV file...")
with open(business_fullpath_to_save, 'wb') as f:
    fieldnames = sorted(new_data[0].keys())
    dw = csv.DictWriter(f, fieldnames=fieldnames, encoding='UTF-8')
    dw.writeheader()
    dw.writerows(new_data)

Writing data to CSV file...


In [118]:
business_data = pd.read_csv(business_fullpath_to_save)

In [119]:
business_data.head()

Unnamed: 0,address,attribute_AcceptsInsurance,attribute_AgesAllowed,attribute_Alcohol,attribute_Ambience_casual,attribute_Ambience_classy,attribute_Ambience_divey,attribute_Ambience_hipster,attribute_Ambience_intimate,attribute_Ambience_romantic,...,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,type
0,979 Bloor Street W,False,False,none,False,False,False,False,False,False,...,1,43.661054,-79.429089,Pizza Pizza,Dufferin Grove,M6H 1L5,7,2.5,ON,business
1,11072 No Frank Lloyd Wright,False,False,none,False,False,False,False,False,False,...,1,33.58671,-111.83541,Taco Bell,,85259,9,2.5,AZ,business
2,"1500 N Green Valley Pkwy, Ste 230",False,False,none,True,False,False,False,False,False,...,0,36.029596,-115.085821,Ohana Hawaiian BBQ,,89074,38,4.0,NV,business
3,"1052 Lionel-Daunais, Suite 302",False,False,False,False,False,False,False,False,False,...,1,45.590227,-73.430235,Chez Lionel,,J4B 0B2,7,3.5,QC,business
4,"2000 Mansfield Street, Suite 104",False,False,False,False,False,False,False,False,False,...,1,45.502346,-73.573807,La Prep,Ville-Marie,H3A 2Z6,3,4.0,QC,business


In [120]:
business_data.columns.item

<bound method IndexOpsMixin.item of Index(['address', 'attribute_AcceptsInsurance', 'attribute_AgesAllowed',
       'attribute_Alcohol', 'attribute_Ambience_casual',
       'attribute_Ambience_classy', 'attribute_Ambience_divey',
       'attribute_Ambience_hipster', 'attribute_Ambience_intimate',
       'attribute_Ambience_romantic',
       ...
       'is_open', 'latitude', 'longitude', 'name', 'neighborhood',
       'postal_code', 'review_count', 'stars', 'state', 'type'],
      dtype='object', length=102)>

In [121]:
save_json = False
if save_json:
    business_data_name_to_save = 'yelp_academic_dataset_business_resteraunts_prepared.json'
    business_fullpath = os.path.join( data_path, business_data_name_to_save )
    business_data.to_json(business_fullpath, orient='split')