In [85]:
import collections
import sqlite3
import json
from datetime import datetime, timedelta
import re
import pandas as pd

business_json_file = '../data/unit_06/yelp/yelp_academic_dataset_business.json'
review_json_files = ['../data/unit_06/yelp/yelp_academic_dataset_reviewaa',
                    '../data/unit_06/yelp/yelp_academic_dataset_reviewab',
                    '../data/unit_06/yelp/yelp_academic_dataset_reviewac']

**read_json_file:** Generic function that reads a json file and returns a list of lists

In [39]:
def read_json_file(file_name):
    json_data = []
    with open(file_name,'r', encoding='utf-8') as f:
        data = f.read()
        for line in data.split('\n')[:-1]:
            json_data.append(json.loads(line))
    print('done reading file...')
    return json_data

**read_review_json_file:** function that reads just *business_id* and *stars* from yelp review json file and returns a list of lists. Also for MacOS had to split original file because of this bug: https://bugs.python.org/issue24658
Used this command to split the original file:

     split -l 1000000 yelp_academic_dataset_review.json yelp_academic_dataset_review

In [82]:
def read_review_json_file(file_names):
    json_data = []
    for file_name in file_names:
        with open(file_name,'r', encoding='utf-8') as f:
            data = f.read()
            for line in data.split('\n')[:-1]:
                json_data.append({k: v for k, v in json.loads(line).items() if k in ['business_id','stars']})
    print('done reading review file...')
    return json_data

**cleanup_string:** takes string as a parameter and replaces characters ' ()&,-/\'' or a cluster thereof with a single underscore ('_')

In [41]:
def cleanup_string(input_str): 
    output = ''
    special = ' ()&,-/\''
    replaced = False
    for letter in input_str:
        if letter not in special:
            output += letter
            replaced = False
        else:
            if not replaced:
                output += '_'
                replaced = True
    return output.strip('_')

**flatten_hours:**
takes "hours" nested dictionary as a parameter, such as

    {"Friday": {"close": "21:00", "open": "11:00"}, 
    "Tuesday": {"close": "21:00", "open": "11:00"}, 
    "Thursday": {"close": "21:00", "open": "11:00"}, 
    "Wednesday": {"close": "21:00", "open": "11:00"}, 
    "Monday": {"close": "21:00", "open": "11:00"}}

and converts it into a flattened set of key / value pairs like this:

    {'Friday_hours': '11:00 - 21:00', 
    'Tuesday_hours': '11:00 - 21:00', 
    'Thursday_hours': '11:00 - 21:00', 
    'Wednesday_hours': '11:00 - 21:00', 
    'Monday_hours': '11:00 - 21:00'}


In [42]:
def flatten_hours (hours):
    flatten = {k+'_hours': v['open']+ ' - ' + v['close'] for k, v in hours.items()}
    return flatten

**flatten_list:**
takes a list as a parameters and converts it into a dictionary with list item as key value being 1.
Example:

    input: ['Fast Food', 'Restaurants']
    output : {'Fast_Food': 1, 'Restaurants': 1}

When optional key prefix is provided, then prefix is appended at the beginning of the key.
Example:

    input: ['Fast Food', 'Restaurants']. 'Category'
    output : {'Category_Fast_Food': 1, 'Category_Restaurants': 1}
*cleanup_string* function applied to a key.

In [43]:
def flatten_list (source_list, key_prefix=''):
    flatten = {cleanup_string(key_prefix+' '+item): 1 for item in source_list}
    return flatten

**flatten_dict:**
Takes a dictionary as a parameters and converts it into another dictionary with binary values.
1. True/False values are converted to 1/0 respectively
2. When value is other than True/False, then value is appended to a key and value of a new key becomes 1
3. When value is a dictionary, then nested dictionary is flattened using logic above and parent key is used as a prefix to a key of a nested dictionary.
4. *cleanup_string* function applied to all newly generated keys.

Example of a dictionary:

    input: 
    {'Take-out': True, 
     'Drive-Thru': False, 
     'Caters': False, 
     'Noise Level': 'average'}
    output: 
    {'Take_out': 1, 
     'Drive_Thru': 0 
     'Caters': 0, 
     'Noise_Level_average':1}
     
Example of nested dictionary:

    input:
    {'Ambience': {
         'romantic': False, 
         'intimate': False, 
         'trendy': True, 
         'upscale': True}}
    output:
    {'Ambience_romantic': 0, 
     'Ambience_intimate': 0, 
     'Ambience_trendy': 1, 
     'Ambience_upscale': 1}

In [44]:
def flatten_dict (source_dict, key_prefix=''):
    flatten = {}
    for k, v in source_dict.items():
        if type(v) == type({}):
            flatten = dict(**flatten, **flatten_dict(v, key_prefix + ' ' + k))
        else:
            if str(v) in ['True', 'False']:
                flatten[cleanup_string(key_prefix + ' ' +k)] = (1 if v else 0)
            else:
                flatten[cleanup_string(key_prefix + ' ' +k + ' ' + str(v))] = 1

    return flatten

**prepare_business_data:**
Converts yelp business json data into flattened dictionary with :
- business_id, name, type, open, latitude, longitude, full_address, city, state, review_count, stars, and neighborhoods keys as is
- hours of operation keys per available weekday using *flatten_hours* function
- attribute key with one value carrying all coresponding attributes using *flatten_dict* function

In [45]:
def prepare_business_data(data):
    
    output = [{
    'business_id': info['business_id'],
    'name': info['name'],
    'type': info['type'],
    'open': info['open'],
    'latitude': info['latitude'],
    'longitude': info['longitude'],
    'full_address': info['full_address'],
    'city': info['city'],
    'state': info['state'],
    'review_count': info['review_count'],
    'stars': info['stars'],
    'neighborhoods': info['neighborhoods'],
    'attributes': flatten_dict(info['attributes']),
    **flatten_hours(info['hours'])}
    for info in data]
    
    return output

**prepare_business_data_cats:**
Converts yelp business json data into flattened dictionaries with business_id and categories with binary values


In [46]:
def prepare_business_data_cats(data):
    
    output = [{
    'business_id': info['business_id'],
    ** flatten_list(info['categories'])}
    for info in data]
    
    return output

**remove_unused_cats:** now that I have enormous amount of columns for each possible category with values (0, 1, NaN), which makes about over 1K in this particular dataset, I want to drop the ones that don't have any 1's to make my dataframe narrower. 

In [47]:
def remove_unused_cats(df):
    drop = [column for column in df.columns if df[column].sum() == 1] 
    df = df.drop(drop, 1)
    return df

**reduce_cats:** now I want to make the dataframe even narrower consolidating all categories into one column, where value of this new column is a dictionary 

    {category_1: < 1 if column value is 1 otherwise 0 >, 
    category_2: <1 or 0>, 
    ...}

Why did I not do it at the beginning? Because it's impossible to hardcode all 1K+ categories, whereas expanding the dataframe and collapsing it back pretty much takes care of sparse categories.

In [48]:
def reduce_cats(df):
    categories = lambda x: {idx: 1 if x[idx]==1 else 0 for idx in x.index if idx not in ['business_id','Restaurants']}

    df['Category'] = df.apply(categories, axis=1)
    drop = [column for column in df.columns if column not in ['business_id','Restaurants','Category']]
    df = df.drop(drop, 1)
    return df

**get_postal:**
Using regex parses postal code information from full address string. 
Expected postal code formats are:
    
    nnnnn
    nnnnn-nnnn
    AAn nAn
    AAn(n) nAA

Where _A_ is upper-case aplpha character and _n_ is a digit


In [49]:
def get_postal(address):
    
    output = None
    pattern_zip5 = '(^|\s)\d{5}($|\s)'
    pattern_zip9 = '(^|\s)\d{5}-\d{4}($|\s)'
    pattern_CAN_EURO = '(^|\s)[A-Z0-9]{3}([0-9]?) [A-Z0-9]{3}($|\s)'
    
    if re.search(pattern_zip5, address)!= None:
        if re.search(pattern_zip9, address)!= None:
            output = re.search(pattern_zip9, address).group(0).strip()
        else:
            output = re.search(pattern_zip5, address).group(0).strip()
            
    if re.search(pattern_CAN_EURO, address)!= None:
        output = re.search(pattern_CAN_EURO, address).group(0).strip()
        
    return output

**get_hour_diff:** takes "HH:MM - HH:MM" as a parameter and returns time difference

**add_open_hours:** adds *< Weekday>_hours* columns to a dataframe with the time difference calculated by *get_hour_diff*

In [50]:
def get_hour_diff(value):
    return value if pd.isnull(value) else datetime.strptime(value[8:], '%H:%M') - datetime.strptime(value[:5], '%H:%M')

def add_open_hours (df):
    hours_series = lambda x: pd.Series([
        get_hour_diff(x['Monday_hours']),
        get_hour_diff(x['Tuesday_hours']),
        get_hour_diff(x['Wednesday_hours']),
        get_hour_diff(x['Thursday_hours']),
        get_hour_diff(x['Friday_hours']),
        get_hour_diff(x['Saturday_hours']),
        get_hour_diff(x['Sunday_hours'])])

    newcols = df.apply(hours_series, axis=1)
    newcols.columns = ['Mon_open_hrs','Tue_open_hrs','Wed_open_hrs','Thu_open_hrs',
                   'Fri_open_hrs','Sat_open_hrs','Sun_open_hrs']
    df = df.join(newcols)
    return df

# Exercise 1: Create a new column that contains only the zipcode.

In [51]:
## OK, let's do this...

In [52]:
business_data = read_json_file(business_json_file)

df_business = pd.DataFrame(prepare_business_data(business_data))

df_business['postal'] = df_business['full_address'].apply(lambda x: get_postal(x))
df_business[['business_id','full_address','postal']].head()

done reading file...


Unnamed: 0,business_id,full_address,postal
0,5UmKMjUEUNdYWqANhGckJw,"4734 Lebanon Church Rd\nDravosburg, PA 15034",15034
1,UsFtqoBl7naz8AVUBZMjQQ,"202 McClure St\nDravosburg, PA 15034",15034
2,cE27W9VPgO88Qxe4ol6y_g,"1530 Hamilton Rd\nBethel Park, PA 15234",15234
3,mVHrayjG3uZ_RLHkLj-AMg,"414 Hawkins Ave\nBraddock, PA 15104",15104
4,mYSpR_SLPgUVymYOvTQd_Q,"1000 Clubhouse Dr\nBraddock, PA 15104",15104


** my test data**

In [53]:
test_ids = [
    '1IxCNVVW9aKbK3BfQpI7eg','5UmKMjUEUNdYWqANhGckJw','5dUDkaxm5FdQanoNkP86KQ',
    'u9Ns-n_3zaZItaAbxjMidQ','W2RFJg1N_MzuI8I42MQ2VA','W3THJEcoHZZKgdJJdiFIWw',
    'DaE0cV5m3yGaPEaMEUnzZg','d_JAhjTX4RojS1Bj1uNXGg','HvP4Tvx9mihceKb2gnbhgw',
    'i9JeI7bd1qXGi15FoD45GA']
test_data = (df_business['business_id'].isin(test_ids))
df_test = df_business[test_data]

# Exercise 2: The table contains a column called 'categories' and each entry in this column is populated by a list. We are interested in those businesses that restaurants. Create a new column 'Restaurant_type' that contains a description of the restaurant based on the other elements of 'categories. 
## That is, if we have '[Sushi Bars, Japanese, Restaurants]' in categories the 'Restaurant_type will be '{'SushiBars': 1, 'Japanese': 1, 'Mexican': 0, ...}'

In [54]:
df_restaurants = pd.DataFrame(prepare_business_data_cats(business_data))

only_restaurants = df_restaurants['Restaurants'] == 1

#len(df_restaurants) before = 85901

df_restaurants = df_restaurants[only_restaurants]
#len(df_restaurants) after = 26729

df_restaurants = remove_unused_cats(df_restaurants)
df_restaurants = reduce_cats(df_restaurants)

In [55]:
df_restaurants.head()

Unnamed: 0,Restaurants,business_id,Category
0,1.0,5UmKMjUEUNdYWqANhGckJw,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ..."
3,1.0,mVHrayjG3uZ_RLHkLj-AMg,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ..."
5,1.0,KayYbHCt-RkbGcPdGOThNg,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ..."
12,1.0,wJr6kSA5dchdgOdwH6dZ2w,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ..."
17,1.0,fNGIbpazjTRdXgwRY_NIXA,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ..."


# Exercise 3: Lets clean the 'attributes' column. The entries in this column are dictionaries. We need to do two things: 
## 1) Turn all the True or False in the dictionary to 1s and 0s.
## 2) There are some entries within dictionaries that are dictionaries themselves, lets turn the whole entry into just one dictionary, for example if we have 
### '{'Accepts Credit Cards': True, 'Alcohol': 'none','Ambience': {'casual': False,'classy': False}}' 
### then turn it into
### '{'Accepts Credit Cards':1, 'Alcohol_none': 1, 'Ambience_casual': 0, 'Ambience_classy': 0}'. 
### There might be other entries like {'Price Range': 1} where the values are numerical so we might want to change that into {'Price_Range_1': 1}.

In [264]:
print ('Done...')

Done...


# Exercise 4: Create a new column for every day of the week and fill it with the amount of hours the business is open that day.

In [57]:
df_business = add_open_hours(df_business)

In [58]:
df_business[['Mon_open_hrs','Tue_open_hrs','Wed_open_hrs','Thu_open_hrs',
            'Fri_open_hrs','Sat_open_hrs','Sun_open_hrs']].head()

Unnamed: 0,Mon_open_hrs,Tue_open_hrs,Wed_open_hrs,Thu_open_hrs,Fri_open_hrs,Sat_open_hrs,Sun_open_hrs
0,10:00:00,10:00:00,10:00:00,10:00:00,10:00:00,NaT,NaT
1,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,NaT,09:00:00,09:00:00,09:00:00,10:00:00,06:00:00,NaT
4,NaT,NaT,09:00:00,09:00:00,09:00:00,09:00:00,05:00:00


# Exercise 5: Create a table with the average review for a business.

In [88]:
## This might take a while...
df_review = pd.DataFrame(read_review_json_file(review_json_files))

# explore...
len(df_review) # 2685066
df_review.head()

# Make an actual table
df_review = df_review.groupby('business_id').mean()
df_review = df_review.reset_index()
len(df_review) # 85539

done reading review file...


85539

In [89]:
df_review.head()

Unnamed: 0,business_id,stars
0,--0ZoBTQWQra1FxD4rBWmg,2.0
1,--1emggGHgoG6ipd_RMb-g,3.333333
2,--4Pe8BZ6gj57VFL5mUE8g,2.75
3,--5jkZ3-nUPZxUvtcbr8Uw,4.580357
4,--7PRjnsjMA6uhPK8mW13Q,2.666667


# Exercise 6: Create a new table that only contains restaurants with the following schema:
## Business_Name | Restaurant_type | Friday hours | Saturday hours | Attributes | Zipcode | Average Rating

In [90]:
restaurant_master = pd.merge(
    df_business[['business_id','name','Friday_hours','Saturday_hours','attributes','postal']],
    df_restaurants[['business_id','Category']], 
    how = 'inner',
    on = 'business_id')

restaurant_master = pd.merge(restaurant_master, df_review, how = 'left', on = 'business_id')

restaurant_master.columns = ['business_id','Business Name', 'Friday hours','Saturday hours',
                             'Attributes','Postal','Restaurant type','Average Rating']

print(len(restaurant_master)) 
#Should be the same as len(df_restaurants), which is 26729

26729


In [91]:
restaurant_master.head()

Unnamed: 0,business_id,Business Name,Friday hours,Saturday hours,Attributes,Postal,Restaurant type,Average Rating
0,5UmKMjUEUNdYWqANhGckJw,Mr Hoagie,11:00 - 21:00,,"{'Price_Range_1': 1, 'Drive_Thru': 0, 'Parking...",15034,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ...",3.428571
1,mVHrayjG3uZ_RLHkLj-AMg,Emil's Lounge,10:00 - 20:00,10:00 - 16:00,"{'Music_dj': 0, 'Caters': 1, 'Ambience_tourist...",15104,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ...",4.68
2,KayYbHCt-RkbGcPdGOThNg,Alexion's Bar & Grill,11:00 - 02:00,12:00 - 02:00,"{'Music_dj': 0, 'Caters': 0, 'Ambience_tourist...",15106,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ...",3.894737
3,wJr6kSA5dchdgOdwH6dZ2w,Kings Family Restaurant,08:00 - 02:00,08:00 - 02:00,"{'Drive_Thru': 0, 'Parking_street': 0, 'Takes_...",15106,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ...",3.25
4,fNGIbpazjTRdXgwRY_NIXA,Rocky's Lounge,11:00 - 23:00,11:00 - 23:00,"{'Music_dj': 0, 'Caters': 0, 'Ambience_tourist...",15106,"{'Hearing_Aid_Providers': 0, 'Sushi_Bars': 0, ...",3.8
