In [2]:
import collections
import sqlite3
import json
from datetime import datetime, timedelta
import re
import pandas as pd

Generic function that reads a JSON file containing one JSON document per line and returns a list with one parsed record per line

In [3]:
def read_json_file(file_name):
    """Read a file holding one JSON document per line and return the records.

    Parameters
    ----------
    file_name : str
        Path of a UTF-8 encoded file with one JSON document per line.

    Returns
    -------
    list
        The parsed documents, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    json_data = []
    with open(file_name, 'r', encoding='utf-8') as f:
        # Iterate the file line by line instead of loading it whole. Blank
        # lines are skipped, so the last record is kept whether or not the
        # file ends with a newline.
        for line in f:
            if line.strip():
                json_data.append(json.loads(line))
    print('%s is done...' % file_name)
    return json_data

In [28]:
# Load the sample file and the full business file; each call returns a list
# with one parsed JSON record per line of the file.
business_sample = read_json_file('yz_business.json')
business_data = read_json_file('yelp_academic_dataset_business.json')

yz_business.json is done...
yelp_academic_dataset_business.json is done...


**cleanup_string:**
 replaces characters ' ()&,-/\'' or a cluster thereof with a single underscore ('_')
    
**flatten_hours:**
takes "hours" dictionary as a parameter, such as

    {"Friday": {"close": "21:00", "open": "11:00"}, 
    "Tuesday": {"close": "21:00", "open": "11:00"}, 
    "Thursday": {"close": "21:00", "open": "11:00"}, 
    "Wednesday": {"close": "21:00", "open": "11:00"}, 
    "Monday": {"close": "21:00", "open": "11:00"}}

and converts it into: a set of key / value pairs like this:

    'Friday_hours': '11:00 - 21:00', 
    'Tuesday_hours': '11:00 - 21:00', 
    'Thursday_hours': '11:00 - 21:00', 
    'Wednesday_hours': '11:00 - 21:00', 
    'Monday_hours': '11:00 - 21:00'

**Flatten_list:**
takes a list as a parameter and converts it into a set of key / value pairs, with each key being an item of the list and each value being 1.
Example:

    input: ['Fast Food', 'Restaurants']
    output : {'Fast_Food': 1, 'Restaurants': 1}
    
Note that space is replaced by underscore
When the optional prefix is provided, it is prepended to each list value (separated by an underscore).
Example:

    input: ['Fast Food', 'Restaurants']. 'Category'
    output : {'Category_Fast_Food': 1, 'Category_Restaurants': 1}
    
**flatten_dict:**
Takes a dictionary as a parameter and converts it into a flat (un-nested) dictionary with binary values.
1. True/False values are converted to 1/0 respectively
2. When the value is anything other than True/False, the value is appended to the key and the value of the new key becomes 1
3. When the value is a dictionary, the nested dictionary is flattened using the logic above and the key of the parent dictionary is used as a prefix for the keys of the nested dictionary

Note: characters  ' ()&,-/\'' in keys are replaced by underscore
Example of unnested dictionary:

    input: 
    {'Take-out': True, 
     'Drive-Thru': False, 
     'Caters': False, 
     'Noise Level': 'average'}
    output: 
    {'Take_out': 1, 
     'Drive_Thru': 0, 
     'Caters': 0, 
     'Noise_Level_average':1}
     
Example of nested dictionary:

    input:
    {'Ambience': {
         'romantic': False, 
         'intimate': False, 
         'trendy': True, 
         'upscale': True}}
    output:
    {'Ambience_romantic': 0, 
     'Ambience_intimate': 0, 
     'Ambience_trendy': 1, 
     'Ambience_upscale': 1}
            
**prepare_business_data:**
Converts Yelp business data in JSON form into a flattened dictionary (with the exception of categories) using the functions above


In [94]:
def cleanup_string(input_str):
    """Turn every run of the characters `` ()&,-/'`` into a single underscore.

    Underscores left at either end of the result are trimmed before it is
    returned, so the output never starts or ends with ``_``.
    """
    # The character class lists exactly the special characters; "+" makes a
    # whole cluster of them collapse into one underscore.
    collapsed = re.sub(r"[ ()&,\-/']+", '_', input_str)
    return collapsed.strip('_')

def flatten_hours(hours):
    """Flatten a per-day hours dictionary into ``'<day>_hours'`` entries.

    ``hours`` maps a day name to a dictionary with ``'open'`` and ``'close'``
    strings; each day becomes one ``'<day>_hours': '<open> - <close>'`` pair.
    """
    schedule = {}
    for day, times in hours.items():
        schedule[day + '_hours'] = times['open'] + ' - ' + times['close']
    return schedule

def flatten_list(source_list, key_prefix=''):
    """Turn a list of labels into a dictionary of ``cleaned_label: 1`` flags.

    Each item is joined to ``key_prefix`` with a space and passed through
    ``cleanup_string``, so special characters become underscores and an empty
    prefix leaves no leading underscore.
    """
    flags = {}
    for item in source_list:
        flags[cleanup_string(key_prefix + ' ' + item)] = 1
    return flags

def flatten_dict(source_dict, key_prefix=''):
    """Flatten a (possibly nested) attribute dictionary into binary flags.

    Rules, applied to every key / value pair:

    * a nested dictionary is flattened recursively, with the parent key used
      as a prefix for the nested keys;
    * a value whose text is ``True`` / ``False`` (the booleans, or the strings
      ``'True'`` / ``'False'``) becomes ``1`` / ``0``;
    * any other value is appended to the key and the new key is set to ``1``.

    All resulting keys are normalised with ``cleanup_string``.

    Parameters
    ----------
    source_dict : dict
        Dictionary to flatten.
    key_prefix : str, optional
        Text placed in front of every generated key.

    Returns
    -------
    dict
        Flat dictionary mapping cleaned keys to ``0`` or ``1``.
    """
    flatten = {}
    for k, v in source_dict.items():
        key = key_prefix + ' ' + k
        if isinstance(v, dict):
            # update() rather than dict(**a, **b): the latter raises TypeError
            # as soon as two cleaned keys collide.
            flatten.update(flatten_dict(v, key))
        elif str(v) in ('True', 'False'):
            # Compare the text, not the truthiness: the string 'False' is
            # truthy and would otherwise be stored as 1.
            flatten[cleanup_string(key)] = 1 if str(v) == 'True' else 0
        else:
            flatten[cleanup_string(key + ' ' + str(v))] = 1

    return flatten
                
def prepare_business_data(data):
    """Build one flat record per business from the raw business records.

    Scalar fields are copied as they are, ``attributes`` is replaced by its
    flattened form (still stored as a single dictionary), and the ``hours``
    dictionary is expanded into one ``'<day>_hours'`` key per day.
    """
    copied_fields = (
        'business_id', 'name', 'type', 'open', 'latitude', 'longitude',
        'full_address', 'city', 'state', 'review_count', 'stars',
        'neighborhoods',
    )

    output = []
    for info in data:
        record = {field: info[field] for field in copied_fields}
        record['attributes'] = flatten_dict(info['attributes'])
        # Per-day hour keys are added last, at the top level of the record.
        record.update(flatten_hours(info['hours']))
        output.append(record)

    return output

def prepare_business_data_cats(data):
    """Build one record per business holding its id and one flag per category.

    Every entry of the business's ``categories`` list becomes a key (cleaned
    by ``flatten_list``) with the value 1.
    """
    output = []
    for info in data:
        record = {'business_id': info['business_id']}
        record.update(flatten_list(info['categories']))
        output.append(record)

    return output

In [58]:
# One row per business: scalar fields, a flattened 'attributes' dictionary and
# one '<day>_hours' column for each day that has opening hours.
df_business = pd.DataFrame(prepare_business_data(business_data))
df_business.head()

Unnamed: 0,Friday_hours,Monday_hours,Saturday_hours,Sunday_hours,Thursday_hours,Tuesday_hours,Wednesday_hours,attributes,business_id,city,full_address,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,11:00 - 21:00,11:00 - 21:00,,,11:00 - 21:00,11:00 - 21:00,11:00 - 21:00,"{'Take_out': 1, 'Drive_Thru': 0, 'Good_For_des...",5UmKMjUEUNdYWqANhGckJw,Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business
1,,,,,,,,"{'Happy_Hour': 1, 'Accepts_Credit_Cards': 1, '...",UsFtqoBl7naz8AVUBZMjQQ,Dravosburg,"202 McClure St\nDravosburg, PA 15034",40.350553,-79.886814,Clancy's Pub,[],True,5,3.0,PA,business
2,,,,,,,,{'Good_for_Kids': 1},cE27W9VPgO88Qxe4ol6y_g,Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",40.354115,-80.01466,Cool Springs Golf Center,[],False,5,2.5,PA,business
3,10:00 - 20:00,,10:00 - 16:00,,10:00 - 19:00,10:00 - 19:00,10:00 - 19:00,"{'Alcohol_full_bar': 1, 'Noise_Level_average':...",mVHrayjG3uZ_RLHkLj-AMg,Braddock,"414 Hawkins Ave\nBraddock, PA 15104",40.40883,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business
4,11:00 - 20:00,,11:00 - 20:00,10:00 - 15:00,11:00 - 20:00,,11:00 - 20:00,"{'Parking_garage': 0, 'Parking_street': 0, 'Pa...",mYSpR_SLPgUVymYOvTQd_Q,Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104",40.403405,-79.855782,Grand View Golf Club,[],True,3,5.0,PA,business


# Exercise 1: Create a new column that contains only the zipcode.

**get_postal:**
Uses regular expressions to parse the postal code out of the full address string.
Expected postal code formats are:
    
    nnnnn
    nnnnn-nnnn
    AnA nAn
    AAn(n) nAA

Where _A_ is an upper-case alphabetic character and _n_ is a digit


In [59]:
def get_postal(address):
    """Extract the postal code from a full address string.

    Recognised formats (``A`` = upper-case letter, ``n`` = digit):

    * ``nnnnn`` and ``nnnnn-nnnn`` (US ZIP / ZIP+4, other 5-digit codes)
    * ``AnA nAn`` (Canada, e.g. ``N2G 2L5``)
    * ``A(A)n(n|A) nAA`` (UK, e.g. ``EH2 4QS``)

    A code must be delimited by whitespace or the start / end of the string.
    When several candidates are present the last one is returned, because the
    postal code sits at the end of the address while 5-digit street numbers
    sit at the beginning.

    Parameters
    ----------
    address : str
        Full address text; any non-string value (``None``, ``NaN``) yields
        ``None``.

    Returns
    -------
    str or None
        The postal code, or ``None`` when no supported format is found.
    """
    if not isinstance(address, str):
        return None

    # Raw strings keep "\d" / "\S" from being read as (invalid) string escapes.
    # Look-arounds are used for the delimiters so that the surrounding
    # whitespace is not consumed and never becomes part of the match.
    pattern = (
        r'(?<!\S)(?:'
        r'\d{5}(?:-\d{4})?'                     # nnnnn or nnnnn-nnnn
        r'|[A-Z]\d[A-Z] \d[A-Z]\d'              # Canadian AnA nAn
        r'|[A-Z]{1,2}\d[A-Z\d]? \d[A-Z]{2}'     # UK outward + inward code
        r')(?!\S)'
    )
    candidates = re.findall(pattern, address)
    return candidates[-1] if candidates else None

In [60]:
# Exercise 1: derive the 'postal' column from each business's full address.
df_business['postal'] = df_business['full_address'].apply(lambda x: get_postal(x))
df_business[['business_id','full_address','postal']].head()

Unnamed: 0,business_id,full_address,postal
0,5UmKMjUEUNdYWqANhGckJw,"4734 Lebanon Church Rd\nDravosburg, PA 15034",15034
1,UsFtqoBl7naz8AVUBZMjQQ,"202 McClure St\nDravosburg, PA 15034",15034
2,cE27W9VPgO88Qxe4ol6y_g,"1530 Hamilton Rd\nBethel Park, PA 15234",15234
3,mVHrayjG3uZ_RLHkLj-AMg,"414 Hawkins Ave\nBraddock, PA 15104",15104
4,mYSpR_SLPgUVymYOvTQd_Q,"1000 Clubhouse Dr\nBraddock, PA 15104",15104


**My test data**

In [61]:
# Hand-picked business ids used to spot-check the derived columns.
test_ids = [
    '1IxCNVVW9aKbK3BfQpI7eg','5UmKMjUEUNdYWqANhGckJw','5dUDkaxm5FdQanoNkP86KQ',
    'u9Ns-n_3zaZItaAbxjMidQ','W2RFJg1N_MzuI8I42MQ2VA','W3THJEcoHZZKgdJJdiFIWw',
    'DaE0cV5m3yGaPEaMEUnzZg','d_JAhjTX4RojS1Bj1uNXGg','HvP4Tvx9mihceKb2gnbhgw',
    'i9JeI7bd1qXGi15FoD45GA']
# Boolean mask selecting the rows whose business_id is one of the test ids.
test_data = (df_business['business_id'].isin(test_ids))
df_test = df_business[test_data]

Unnamed: 0,business_id,full_address,Monday_hours,Tuesday_hours,Wednesday_hours,Thursday_hours,Friday_hours,Saturday_hours,Sunday_hours,postal
0,5UmKMjUEUNdYWqANhGckJw,"4734 Lebanon Church Rd\nDravosburg, PA 15034",11:00 - 21:00,11:00 - 21:00,11:00 - 21:00,11:00 - 21:00,11:00 - 21:00,,,15034
1661,1IxCNVVW9aKbK3BfQpI7eg,"7617 Mineral Point Rd\nSte 202\nMadison, WI 53717",09:00 - 17:00,09:00 - 17:00,09:00 - 17:00,09:00 - 17:00,09:00 - 17:00,,,53717
1890,i9JeI7bd1qXGi15FoD45GA,"4801 Annamark Drive\nMadison, WI 53704",00:00 - 00:00,00:00 - 00:00,00:00 - 00:00,00:00 - 00:00,00:00 - 00:00,00:00 - 00:00,00:00 - 00:00,53704
32356,u9Ns-n_3zaZItaAbxjMidQ,"376 Kings Street E\nKitchener, ON N2G 2L5",08:00 - 21:00,08:00 - 21:00,08:00 - 21:00,08:00 - 21:00,08:00 - 21:00,07:00 - 21:00,08:00 - 21:00,N2G 2L5
33494,W3THJEcoHZZKgdJJdiFIWw,39 Queensferry St\nWest End\nEdinburgh EH2 4QS,,,,,,,,EH2 4QS


# Exercise 2: The table contains a column called 'categories' and each entry in this column is populated by a list. We are interested in those businesses that are restaurants. Create a new column 'Restaurant_type' that contains a description of the restaurant based on the other elements of 'categories'. 
## That is, if we have '[Sushi Bars, Japanese, Restaurants]' in categories, the 'Restaurant_type' will be '{'SushiBars': 1, 'Japanese': 1, 'Mexican': 0, ...}'

In [113]:
# One row per business and one column per category; a cell is 1 when the
# business has that category and NaN otherwise.
df_restaurants = pd.DataFrame(prepare_business_data_cats(business_data))
df_restaurants.head()

Unnamed: 0,3D_Printing,ATV_Rentals_Tours,Accessories,Accountants,Active_Life,Acupuncture,Addiction_Medicine,Adult,Adult_Education,Adult_Entertainment,...,Wine_Tasting_Room,Wine_Tours,Wineries,Wok,Women_s_Clothing,Yelp_Events,Yoga,Ziplining,Zoos,business_id
0,,,,,,,,,,,...,,,,,,,,,,5UmKMjUEUNdYWqANhGckJw
1,,,,,,,,,,,...,,,,,,,,,,UsFtqoBl7naz8AVUBZMjQQ
2,,,,,1.0,,,,,,...,,,,,,,,,,cE27W9VPgO88Qxe4ol6y_g
3,,,,,,,,,,,...,,,,,,,,,,mVHrayjG3uZ_RLHkLj-AMg
4,,,,,1.0,,,,,,...,,,,,,,,,,mYSpR_SLPgUVymYOvTQd_Q


In [114]:
# Keep only the businesses flagged with the 'Restaurants' category and print
# the row count before and after the filter.
only_restaurants = df_restaurants['Restaurants'] == 1
print(len(df_restaurants)) # 85901
df_restaurants = df_restaurants[only_restaurants]
print(len(df_restaurants)) # 26729

85901
26729


In [115]:
def remove_unused_cats(df):
    """Return ``df`` without the columns whose values sum to zero.

    In the category frame a column sums to zero when no remaining row carries
    that category (all NaN), so this removes categories that are unused after
    filtering. The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one flag column per category.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the columns with a non-zero sum.
    """
    unused = [column for column in df.columns if df[column].sum() == 0]
    # The columns= keyword replaces the positional axis argument
    # (df.drop(labels, 1)), which pandas 2.0 no longer accepts.
    return df.drop(columns=unused)

In [116]:
# Drop the category columns that no remaining restaurant uses.
df_restaurants = remove_unused_cats(df_restaurants)
df_restaurants.head()


Unnamed: 0,Active_Life,Adult_Entertainment,Afghan,African,Alsatian,Amateur_Sports_Teams,American_New,American_Traditional,Amusement_Parks,Antiques,...,Vinyl_Records,Vitamins_Supplements,Waffles,Whiskey_Bars,Wine_Bars,Wine_Tasting_Room,Wineries,Wok,Yoga,business_id
0,,,,,,,,,,,...,,,,,,,,,,5UmKMjUEUNdYWqANhGckJw
3,,,,,,,1.0,,,,...,,,,,,,,,,mVHrayjG3uZ_RLHkLj-AMg
5,,,,,,,,1.0,,,...,,,,,,,,,,KayYbHCt-RkbGcPdGOThNg
12,,,,,,,,1.0,,,...,,,,,,,,,,wJr6kSA5dchdgOdwH6dZ2w
17,,,,,,,,1.0,,,...,,,,,,,,,,fNGIbpazjTRdXgwRY_NIXA


In [126]:
def reduce_cats(df):
    """Collapse the per-category flag columns into one ``Category`` column.

    For every row a dictionary ``{category: 1 or 0}`` is built from all
    columns except ``business_id`` and ``Restaurants`` (1 when the cell equals
    1, otherwise 0). The result keeps only ``business_id``, ``Restaurants``
    and the new ``Category`` column.

    The input frame is left untouched: the result is built on a copy, which
    also avoids SettingWithCopyWarning when ``df`` is a filtered slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``business_id``, ``Restaurants`` and one flag column per
        category.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``business_id``, ``Restaurants`` (when
        present in ``df``) and ``Category``.
    """
    excluded = ('business_id', 'Restaurants')

    def categories(row):
        # row.items() yields (column, value) pairs directly, avoiding a label
        # lookup for every single cell.
        return {name: 1 if value == 1 else 0
                for name, value in row.items() if name not in excluded}

    category = df.apply(categories, axis=1)

    kept = [column for column in df.columns
            if column in ('business_id', 'Restaurants', 'Category')]
    result = df[kept].copy()
    result['Category'] = category
    return result

# Replace the many category columns with a single dictionary-valued column.
df_restaurants = reduce_cats(df_restaurants)
df_restaurants.head()

Unnamed: 0,Restaurants,business_id,Category
0,1.0,5UmKMjUEUNdYWqANhGckJw,"{'Active_Life': 0, 'Adult_Entertainment': 0, '..."
3,1.0,mVHrayjG3uZ_RLHkLj-AMg,"{'Active_Life': 0, 'Adult_Entertainment': 0, '..."
5,1.0,KayYbHCt-RkbGcPdGOThNg,"{'Active_Life': 0, 'Adult_Entertainment': 0, '..."
12,1.0,wJr6kSA5dchdgOdwH6dZ2w,"{'Active_Life': 0, 'Adult_Entertainment': 0, '..."
17,1.0,fNGIbpazjTRdXgwRY_NIXA,"{'Active_Life': 0, 'Adult_Entertainment': 0, '..."


# Exercise 3: Lets clean the 'attributes' column. The entries in this column are dictionaries. We need to do two things: 
## 1) Turn all the True or False in the dictionary to 1s and 0s.
## 2) There are some entries within dictionaries that are dictionaries themselves, lets turn the whole entry into just one dictionary, for example if we have 
### '{'Accepts Credit Cards': True, 'Alcohol': 'none','Ambience': {'casual': False,'classy': False}}' 
### then turn it into
### '{'Accepts Credit Cards':1, 'Alcohol_none': 1, 'Ambience_casual': 0, 'Ambience_classy': 0}'. 
### There might be other entries like {'Price Range': 1} where the values are numerical so we might want to change that into {'Price_Range_1': 1}.

In [264]:
# Nothing left to do in this cell: prepare_business_data already stores
# flatten_dict(info['attributes']) in the 'attributes' column.
print ('Done...')

Done...


# Exercise 4: Create a new column for every day of the week and fill it with the amount of hours the business is open that day.

In [127]:
def get_hour_diff(value):
    """Return how long a business is open for an ``'HH:MM - HH:MM'`` string.

    Parameters
    ----------
    value : str or missing
        Opening hours formatted as ``'<open> - <close>'``. A missing value
        (``None`` / ``NaN``) is returned unchanged.

    Returns
    -------
    datetime.timedelta or the missing value
        Time between opening and closing. A closing time that is not later
        than the opening time is taken to fall on the next day, so
        ``'11:00 - 02:00'`` is 15 hours and ``'00:00 - 00:00'`` is 24 hours.
    """
    if pd.isnull(value):
        return value

    # Split on the separator instead of slicing fixed positions so that
    # non zero-padded times such as '9:00 - 17:00' are parsed correctly.
    opens_text, closes_text = value.split(' - ')
    opens = datetime.strptime(opens_text, '%H:%M')
    closes = datetime.strptime(closes_text, '%H:%M')

    # Closing at or before the opening time means the business closes after
    # midnight; without this the difference would be negative (or zero).
    if closes <= opens:
        closes += timedelta(days=1)
    return closes - opens

def add_open_hours(df):
    """Return ``df`` joined with one open-duration column per weekday.

    For every row the seven ``'<Day>_hours'`` strings are converted with
    ``get_hour_diff``; the results are stored in ``Mon_open_hrs`` through
    ``Sun_open_hrs`` and joined to the original frame on its index.
    """
    source_columns = ['Monday_hours', 'Tuesday_hours', 'Wednesday_hours',
                      'Thursday_hours', 'Friday_hours', 'Saturday_hours',
                      'Sunday_hours']
    target_columns = ['Mon_open_hrs','Tue_open_hrs','Wed_open_hrs','Thu_open_hrs',
                      'Fri_open_hrs','Sat_open_hrs','Sun_open_hrs']

    def row_durations(row):
        # One duration per weekday, in Monday-to-Sunday order.
        return pd.Series([get_hour_diff(row[column]) for column in source_columns])

    durations = df.apply(row_durations, axis=1)
    durations.columns = target_columns
    return df.join(durations)

In [128]:
# Exercise 4: add the seven '<Day>_open_hrs' columns and preview them.
df_business = add_open_hours(df_business)
df_business[['Mon_open_hrs','Tue_open_hrs','Wed_open_hrs','Thu_open_hrs',
            'Fri_open_hrs','Sat_open_hrs','Sun_open_hrs']].head()

Unnamed: 0,Mon_open_hrs,Tue_open_hrs,Wed_open_hrs,Thu_open_hrs,Fri_open_hrs,Sat_open_hrs,Sun_open_hrs
0,10:00:00,10:00:00,10:00:00,10:00:00,10:00:00,NaT,NaT
1,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,NaT,09:00:00,09:00:00,09:00:00,10:00:00,06:00:00,NaT
4,NaT,NaT,09:00:00,09:00:00,09:00:00,09:00:00,05:00:00


# Exercise 5: Create a table with the average review for a business.

In [129]:
def read_review_json(file_name):
    """Read a one-JSON-document-per-line review file, keeping two fields.

    Only the ``business_id`` and ``stars`` keys of every record are retained,
    which keeps memory use low for a large review file.

    Parameters
    ----------
    file_name : str
        Path of a UTF-8 encoded file with one JSON document per line.

    Returns
    -------
    list of dict
        One dictionary per review, holding whichever of ``business_id`` and
        ``stars`` the record contains, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    wanted = ('business_id', 'stars')
    json_data = []
    with open(file_name, 'r', encoding='utf-8') as f:
        # Stream the file line by line rather than reading it whole. Blank
        # lines are skipped, so the last review is kept whether or not the
        # file ends with a newline.
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            json_data.append({k: v for k, v in record.items() if k in wanted})
    print('%s is done...' % file_name)
    return json_data

# Load the business_id / stars pair of every review into a DataFrame.
# The loader in this notebook is named read_review_json (read_review is not defined).
df_review = pd.DataFrame(read_review_json('yelp_academic_dataset_review.json'))

yelp_academic_dataset_review.json is done...


In [132]:
# Explore the raw reviews: one row per review with its business_id and stars.
len(df_review) # 2685066 rows in the original run
df_review.head()

# Make an actual table: one row per business holding the mean of its stars.
df_review = df_review.groupby('business_id').mean()
df_review = df_review.reset_index()  # business_id becomes a regular column again
len(df_review) # 85539 businesses in the original run
df_review.head()

# Exercise 6: Create a new table that only contains restaurants with the following schema:
## Business_Name | Restaurant_type | Friday hours | Saturday hours | Attributes | Zipcode | Average Rating

In [159]:
# Inner join: keeps only the businesses that are present in df_restaurants.
restaurant_master = pd.merge(
    df_business[['business_id','name','Friday_hours','Saturday_hours','attributes','postal']],
    df_restaurants[['business_id','Category']], 
    how = 'inner',
    on = 'business_id')

restaurant_master.head()
len(restaurant_master) # 26729

# Left join: restaurants without a row in df_review are kept (rating is NaN).
restaurant_master = pd.merge(restaurant_master, df_review, how = 'left', on = 'business_id')

print(len(restaurant_master)) # 26729
# Columns are renamed by position, so this list must follow the order of the
# merged columns: business_id, name, Friday_hours, Saturday_hours, attributes,
# postal, Category, stars.
restaurant_master.columns = ['business_id','Business Name', 'Friday hours','Saturday hours',
                             'Attributes','Postal','Restaurant type','Average Rating']
restaurant_master.head()


26729


Unnamed: 0,business_id,Business Name,Friday hours,Saturday hours,Attributes,Postal,Restaurant type,Average Rating
0,5UmKMjUEUNdYWqANhGckJw,Mr Hoagie,11:00 - 21:00,,"{'Take_out': 1, 'Drive_Thru': 0, 'Good_For_des...",15034,"{'Active_Life': 0, 'Adult_Entertainment': 0, '...",3.428571
1,mVHrayjG3uZ_RLHkLj-AMg,Emil's Lounge,10:00 - 20:00,10:00 - 16:00,"{'Alcohol_full_bar': 1, 'Noise_Level_average':...",15104,"{'Active_Life': 0, 'Adult_Entertainment': 0, '...",4.68
2,KayYbHCt-RkbGcPdGOThNg,Alexion's Bar & Grill,11:00 - 02:00,12:00 - 02:00,"{'Alcohol_full_bar': 1, 'Noise_Level_loud': 1,...",15106,"{'Active_Life': 0, 'Adult_Entertainment': 0, '...",3.894737
3,wJr6kSA5dchdgOdwH6dZ2w,Kings Family Restaurant,08:00 - 02:00,08:00 - 02:00,"{'Take_out': 1, 'Drive_Thru': 0, 'Good_For_des...",15106,"{'Active_Life': 0, 'Adult_Entertainment': 0, '...",3.25
4,fNGIbpazjTRdXgwRY_NIXA,Rocky's Lounge,11:00 - 23:00,11:00 - 23:00,"{'Alcohol_full_bar': 1, 'Noise_Level_average':...",15106,"{'Active_Life': 0, 'Adult_Entertainment': 0, '...",3.8
