In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import pickle
import re
import string
from fuzzywuzzy import fuzz
from merge_charlotte import parse_address, fuzz_comparisons, append_fuzz_scores
import merge_charlotte as lib

%matplotlib inline
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Location of the Yelp business file (relative path); it is read line by line as JSON in the next cell.
fname_business = '../data/yelp/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json'

In [3]:
# The business file holds one JSON object per line; decode every line
# and assemble the resulting dicts into a single DataFrame.
with open(fname_business) as business_file:
    B = pd.DataFrame([json.loads(record) for record in business_file])

In [4]:
# Summary statistics (count, mean, quartiles, ...) for the business table.
B.describe()

Unnamed: 0,latitude,longitude,open,review_count,stars
count,61184.0,61184.0,61184,61184.0,61184.0
mean,37.326077,-97.491803,0.878089,28.272506,3.673305
std,5.741085,29.624473,0.327186,88.65205,0.891207
min,32.871923,-115.38655,False,3.0,1.0
25%,33.509601,-115.052224,1,4.0,3.0
50%,35.994636,-111.932944,1,8.0,3.5
75%,36.24043,-80.863329,1,21.0,4.5
max,56.036545,8.549249,True,4578.0,5.0


In [5]:
# Column names, dtypes and non-null counts of the business table.
B.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61184 entries, 0 to 61183
Data columns (total 15 columns):
attributes       61184 non-null object
business_id      61184 non-null object
categories       61184 non-null object
city             61184 non-null object
full_address     61184 non-null object
hours            61184 non-null object
latitude         61184 non-null float64
longitude        61184 non-null float64
name             61184 non-null object
neighborhoods    61184 non-null object
open             61184 non-null bool
review_count     61184 non-null int64
stars            61184 non-null float64
state            61184 non-null object
type             61184 non-null object
dtypes: bool(1), float64(3), int64(1), object(10)
memory usage: 7.1+ MB


In [6]:
# First five businesses, transposed so that each field is shown as a row.
B.head().T

Unnamed: 0,0,1,2,3,4
attributes,{u'By Appointment Only': True},"{u'Happy Hour': True, u'Accepts Credit Cards':...",{u'Good for Kids': True},{},"{u'Alcohol': u'full_bar', u'Noise Level': u'av..."
business_id,vcNAWiLM4dR7D2nwwJ7nCA,UsFtqoBl7naz8AVUBZMjQQ,cE27W9VPgO88Qxe4ol6y_g,HZdLhv6COCleJMo7nPl-RA,mVHrayjG3uZ_RLHkLj-AMg
categories,"[Doctors, Health & Medical]",[Nightlife],"[Active Life, Mini Golf, Golf]","[Shopping, Home Services, Internet Service Pro...","[Bars, American (New), Nightlife, Lounges, Res..."
city,Phoenix,Dravosburg,Bethel Park,Pittsburgh,Braddock
full_address,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","202 McClure St\nDravosburg, PA 15034","1530 Hamilton Rd\nBethel Park, PA 15234","301 S Hills Vlg\nPittsburgh, PA 15241","414 Hawkins Ave\nBraddock, PA 15104"
hours,"{u'Thursday': {u'close': u'17:00', u'open': u'...",{},{},"{u'Monday': {u'close': u'21:00', u'open': u'10...","{u'Tuesday': {u'close': u'19:00', u'open': u'1..."
latitude,33.4993,40.3505,40.3569,40.3576,40.4087
longitude,-111.984,-79.8869,-80.0159,-80.06,-79.8664
name,"Eric Goldberg, MD",Clancy's Pub,Cool Springs Golf Center,Verizon Wireless,Emil's Lounge
neighborhoods,[],[],[],[],[]


In [7]:
# Number of businesses per value of the `state` field.
B.state.value_counts()

AZ     25230
NV     16485
NC      4963
QC      3921
PA      3041
EDH     2971
WI      2307
BW       934
IL       627
ON       351
SC       189
MLN      123
RP        13
ELN       10
FIF        4
CA         3
SCB        3
MN         1
MA         1
KHL        1
XGL        1
WA         1
NW         1
OR         1
NTH        1
HAM        1
Name: state, dtype: int64

In [8]:
# Restrict the table to businesses whose state code is one of these six US states.
B_keep = B[B.state.isin({'AZ','NV','PA','WI','IL','NC'})]

In [9]:
# Per-state summary statistics of review_count and stars, laid out one row per state.
B_keep.groupby('state').describe()[['review_count', 'stars']].unstack(-1)

Unnamed: 0_level_0,review_count,review_count,review_count,review_count,review_count,review_count,review_count,review_count,stars,stars,stars,stars,stars,stars,stars,stars
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
state,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
AZ,25230,25.238962,55.871449,3,4,8,21,1512,25230,3.684503,0.940502,1,3,4.0,4.5,5
IL,627,20.987241,33.924552,3,5,9,22,450,627,3.569378,0.897042,1,3,3.5,4.0,5
NC,4963,20.651823,40.02815,3,4,8,19,983,4963,3.588858,0.844681,1,3,3.5,4.0,5
NV,16485,45.672066,150.342727,3,5,10,29,4578,16485,3.658477,0.908426,1,3,3.5,4.5,5
PA,3041,23.810917,43.332042,3,4,9,24,695,3041,3.682999,0.829634,1,3,3.5,4.5,5
WI,2307,20.669267,37.064819,3,4,8,20,723,2307,3.643693,0.857121,1,3,3.5,4.5,5


In [10]:
# Tally of the `type` field among the kept businesses.
B_keep['type'].value_counts()

business    52653
Name: type, dtype: int64

# Merge with Health Inspections

In [4]:
def save_to_pickle(data, fname):
    """Pickle ``data`` into the file at path ``fname``.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten. Nothing is returned.
    """
    with open(fname, 'wb') as out_file:
        pickle.dump(data, out_file)


def open_pickle(f_name):
    """Load and return the object stored in the pickle file at ``f_name``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(f_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)

### Phoenix, AZ

In [626]:
# Load a previously pickled Phoenix frame (presumably an earlier Yelp/inspection merge result - confirm).
test_AZ = open_pickle('../data/phx/phoenix_yelp_merge.pkl')

In [627]:
# Column names, dtypes and non-null counts of the loaded Phoenix frame.
test_AZ.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3984 entries, (1130 the restaurant, 455, north 3rd st, phoenix, 85004) to (9990, north 90th st, scottsdale, 85258)
Data columns (total 43 columns):
address                    3984 non-null object
addressy_                  3984 non-null object
attributes                 3984 non-null object
avg_2                      2250 non-null float64
avg_w_2                    2250 non-null float64
business_id                3984 non-null object
categories                 3984 non-null object
city_                      3984 non-null object
complex                    3984 non-null object
complexy_                  3984 non-null object
cutting_edge               3984 non-null object
full_address               3984 non-null object
fuzz_partial_ratio         2250 non-null float64
fuzz_partial_ratio_2       2250 non-null float64
fuzz_ratio                 2250 non-null float64
fuzz_ratio_2               2250 non-null float64
fuzz_token_set_ratio       2250 n

### Charlotte, NC

In [196]:
def split_address(x):
    """Normalise a raw street address and split it into number, street and suite.

    The address is lower-cased, stripped of line breaks and punctuation
    (``&`` is kept) and common words are mapped to one canonical spelling
    (``road`` -> ``rd``, ``w`` -> ``west``, ``suite`` -> ``ste``, ...).

    Parameters
    ----------
    x : str
        Raw address line, e.g. ``'1318 W MOREHEAD ST'``.

    Returns
    -------
    pandas.Series
        With keys ``'num'`` (leading house number, ``''`` when there is
        none), ``'street'`` and ``'suite'`` (trailing ``ste ...`` token,
        ``''`` when there is none or when there is no house number).
    """
    x = x.lower().replace('\r','').replace('\n','')
    x = x.replace(' ste-',' ste ')
    x = re.sub('[%s]' % re.escape(string.punctuation.replace('&','')), '', x)
    abbr = {'road':'rd', 'street':'st', 'avenue':'av', 'ave':'av', 'drive':'dr', 'boulevard':'blvd',
            'n':'north', 'e':'east', 's':'south', 'w':'west', 'suite':'ste','bv':'blvd', 'suit':'ste',
            'pky':'pkwy', 'parkway':'pkwy',
            'first':'1st', 'second':'2nd', 'third':'3rd', 'fourth':'4th', 'fifth':'5th', 'sixth':'6th',
            'seventh':'7th', 'eighth':'8th', 'ninth':'9th', 'tenth':'10th'}
    # .items() works on Python 2 and 3 (.iteritems() is Python 2 only). No
    # replacement value is itself a key, so the iteration order is irrelevant.
    for key, value in abbr.items():
        x = re.sub(r'\b(%s)\b' % key, value, x)
    x = x.strip()

    num_pattern = r'^(\d+\D?)\b'
    suite_pattern = r'\b(ste\W?\D?\W?\d*\W?\D?)\Z'
    n = re.findall(num_pattern, x)
    s = re.findall(suite_pattern, x)

    if not n:
        # No leading house number: keep the whole cleaned address as the
        # street. (Indexing x[0] here kept only the first character and
        # raised IndexError on an empty address.)
        return pd.Series({'num':'', 'street':x, 'suite':''})

    address = {'num': n[0].strip()}
    if s:
        # The spaces inside this pattern are literal: they consume the blank
        # after the number and the blank before the suite token.
        street = re.sub(r'^(\d+\D?)\b | \b(ste\W?\D?\W?\d*\W?\D?)\Z', '', x)
        suite = s[-1]
    else:
        street = re.sub(num_pattern, '', x)
        suite = ''
    # Strip so both branches yield a street without stray edge blanks
    # (e.g. '123a main st' used to give ' main st' when no suite was present).
    address['street'] = street.strip()
    address['suite'] = suite
    return pd.Series(address)
    
def standard_name(x):
    """Return a normalised business name suitable for use as a join key.

    Steps, in order: lower-case; turn ``@`` into the word ``at`` and ``-``/``/``
    into spaces; map a few spelling variants to one form (``grille`` ->
    ``grill``, ``ristorante`` -> ``restaurant``, ...); drop a trailing
    ``#<digits>`` store number; keep only the text before the first
    standalone word ``at``; finally remove all punctuation.

    Parameters
    ----------
    x : str or float
        Raw name. Floats (e.g. NaN for a missing name) are converted with
        ``str`` first, so NaN becomes ``'nan'``.

    Returns
    -------
    str
        The normalised name. A name whose first word is ``at`` yields ``''``.
    """
    rep_list = {'@':' at ', '-':' ', '/':' '}
    sub_list = {'grille':'grill', 'ristorante':'restaurant','restaurante':'restaurant',
                'italiano':'italian', 'mexicano':'mexican', 'mexicana':'mexican'}
    # isinstance also covers float subclasses such as numpy.float64 NaN values.
    if isinstance(x, float):
        x = str(x)
    x = x.lower()
    # .items() works on Python 2 and 3 (.iteritems() is Python 2 only).
    for key, value in rep_list.items():
        x = x.replace(key, value)
    for key, value in sub_list.items():
        x = re.sub(r'\b(%s)\b' % key, value, x)

    x = x.strip()
    # Drop a trailing store number such as '#12'.
    x = re.sub(r'(\#\d+)\Z','',x)
    # Keep only what precedes a standalone 'at' ("cafe at the park" -> "cafe").
    x = re.split(r'\b(at)\b', x)[0].strip()
    return re.sub('[%s]' % re.escape(string.punctuation), '', x)

In [274]:
# Yelp businesses in North Carolina whose category list contains 'Restaurants'.
# .copy() makes B_NC an independent frame, so the column added below is not
# written into a slice of B (the chained-assignment warning is silenced at the
# top of the notebook, which would otherwise hide this).
B_NC = B[(B.state=='NC')& (B.categories.apply(lambda x: 'Restaurants' in x))].copy()
# One-line address: newlines -> spaces, the 'Ste' token -> 'Suite', commas and
# the ' NC ' state code removed. 'Ste' is only rewritten when it starts a word
# and is not followed by a lowercase letter, so street names such as
# 'Steele Creek Rd' are left intact (a plain substring replace produced
# 'Suiteele Creek Rd').
B_NC['address'] = B_NC.full_address.apply(lambda x: re.sub(r'\bSte(?![a-z])', 'Suite', x.replace('\n',' '))\
                                                     .replace(',','')\
                                                     .replace(' NC ', ' '))

In [6]:
# Load the pickled Charlotte inspection records (displayed as a table in the next cell).
H = open_pickle('../data/char/char_FULL_04.pkl')

In [218]:
# Build a frame from the loaded records and transpose it so that each
# establishment is one row (indexed by its id) and each field is a column.
NC = pd.DataFrame.from_records(H).T
NC.head(1)

Unnamed: 0,address,city,county,inspections,name,state,type,zip
122131,1318 W MOREHEAD ST \r\n,CHARLOTTE,Mecklenburg,"[[2/26/2015, 90, A, 2398], [10/3/2014, 93, A, ...",OPEN KITCHEN,NC,1 - Restaurant,28208


In [219]:
# Normalised helper columns used as join keys further down:
# lower-cased, stripped city and cleaned-up establishment name.
NC['city_'] = NC.city.apply(lambda x: x.lower().strip())
NC['name_'] = NC.name.apply(standard_name)
# Keep the record id as a regular column so it is still available after set_index/join.
NC['id_'] = NC.index

# Append the num/street/suite columns produced by split_address.
# NOTE: NC is rebound to itself plus the new columns, so re-running this cell adds them a second time.
NC = pd.concat([NC, NC.address.apply(split_address)], axis=1)
NC.head(1)

Unnamed: 0,address,city,county,inspections,name,state,type,zip,city_,name_,id_,num,street,suite
122131,1318 W MOREHEAD ST \r\n,CHARLOTTE,Mecklenburg,"[[2/26/2015, 90, A, 2398], [10/3/2014, 93, A, ...",OPEN KITCHEN,NC,1 - Restaurant,28208,charlotte,open kitchen,122131,1318,west morehead st,


In [220]:
# Parse every Yelp address, together with its neighborhood list, into
# components. The Series is built directly on B_NC's index (and from a list,
# so it does not depend on zip returning a list) so the parsed rows line up
# with B_NC in the concat below.
B_address = pd.Series(list(zip(B_NC.full_address, B_NC.neighborhoods)), index=B_NC.index)\
              .apply(lambda x: lib.parse_address(*x))
# Rename the first parsed column to 'city_' by assigning a fresh label list;
# writing into `columns.values` mutates the array behind an immutable Index.
B_address.columns = ['city_'] + list(B_address.columns[1:])
# NOTE: B_NC is rebound to itself plus the parsed columns, so re-running this
# cell appends those columns a second time.
B_NC = pd.concat([B_NC, B_address], axis=1)
B_NC['name_'] = B_NC.name.apply(standard_name)

In [242]:
# Spot-check the raw Yelp address of one business.
B_NC[B_NC.business_id=='C3QfGqD3qjWUbwBOkEjqSQ'].full_address.tolist()

[u'9630 University City Blvd\nSte F\nUniversity City\nCharlotte, NC 28213']

In [221]:
# First pass: exact match on normalised name plus address components.
NC_ind = ['name_','num','street','city_','zip']
# Inner join keeps only key combinations present on both sides; columns of
# B_NC whose names collide with NC's get the suffix 'y_'.
temp = NC.set_index(NC_ind).join(B_NC.set_index(NC_ind), how='inner', rsuffix = 'y_')
temp.shape

(469, 27)

In [222]:
# Rows on each side whose id does not appear in the exact-match result `temp`.
H_x = NC[~NC.id_.isin(temp.id_)]
B_x = B_NC[~B_NC.business_id.isin(temp.business_id)]

In [223]:
# Second pass on the leftovers: join on the address components only (name is not part of the key).
NC_ind2 = ['num','street','city_','zip']
temp2 = H_x.set_index(NC_ind2).join(B_x.set_index(NC_ind2), how='inner', rsuffix = 'y_')

In [224]:
# (rows, columns) of the address-only candidate pairs.
temp2.shape

(1057, 29)

In [None]:
# Write the address-only candidate pairs to a UTF-8 CSV file.
temp2.to_csv('../data/char/merge_dump_01.csv', encoding='utf-8')

In [225]:
# Run merge_charlotte.append_fuzz_scores on the candidates; the result is later
# filtered on its 'max', 'avg_w_2' and 'avg_w_3' columns (presumably added by
# this call - the helper is defined outside this notebook).
temp_fuzz = lib.append_fuzz_scores(temp2)

In [226]:
# Write the scored candidate pairs to a UTF-8 CSV file.
temp_fuzz.to_csv('../data/char/merge_dump_06.csv', encoding='utf-8')

In [216]:
# Re-import merge_charlotte so edits to the module take effect in this kernel.
# `reload` is a builtin in Python 2 only.
reload(lib)

<module 'merge_charlotte' from 'merge_charlotte.py'>

In [227]:
# Accept an address-only candidate when either rule holds:
#   A: 'max' and 'avg_w_2' are both at least 75;
#   B: 'max' and 'avg_w_2' are both at least 60 and 'avg_w_3' is at least 80.
ind_A = (temp_fuzz['max'] >= 75) & (temp_fuzz.avg_w_2 >= 75)
ind_B = (temp_fuzz['max'] >= 60) & (temp_fuzz.avg_w_2 >= 60) & (temp_fuzz.avg_w_3 >= 80)
# Stack the exact matches (temp) with the accepted address-only matches.
NC_level2 = pd.concat([temp, temp_fuzz[ind_A | ind_B]])

In [228]:
# (rows, columns) of the matches accepted so far.
NC_level2.shape

(832, 52)

In [229]:
# Rows on each side whose id does not appear among the matches in NC_level2.
H_x2 = NC[~NC.id_.isin(NC_level2.id_)]
B_x2 = B_NC[~B_NC.business_id.isin(NC_level2.business_id)]

In [230]:
# Third pass: pair every remaining inspection row with every remaining Yelp
# row that shares the same city and zip (a many-to-many join).
NC_ind3 = ['city_','zip']
temp3 = H_x2.set_index(NC_ind3).join(B_x2.set_index(NC_ind3), how='inner', rsuffix = 'y_')

In [231]:
# (rows, columns) of the city/zip candidate pairs.
temp3.shape

(91324, 33)

In [232]:
# Score the city/zip candidate pairs with merge_charlotte.append_fuzz_scores (defined outside this notebook).
temp_fuzz3 = lib.append_fuzz_scores(temp3)

In [233]:
# Same two acceptance rules as for the address-only pass:
#   A: 'max' and 'avg_w_2' both at least 75;
#   B: 'max' and 'avg_w_2' both at least 60 and 'avg_w_3' at least 80.
ind_A3 = (temp_fuzz3['max'] >= 75) & (temp_fuzz3.avg_w_2 >= 75)
ind_B3 = (temp_fuzz3['max'] >= 60) & (temp_fuzz3.avg_w_2 >= 60) & (temp_fuzz3.avg_w_3 >= 80)
# Write the accepted pairs to a UTF-8 CSV file.
temp_fuzz3[ind_A3 | ind_B3].to_csv('../data/char/merge_dump_11.csv', encoding='utf-8')

In [239]:
# List the Yelp businesses that were paired with inspection id 166283.
temp_fuzz3.loc[temp_fuzz3.id_==166283,['full_address','business_id']]

Unnamed: 0_level_0,Unnamed: 1_level_0,full_address,business_id
city_,zip,Unnamed: 2_level_1,Unnamed: 3_level_1
charlotte,28213,10235 University City Blvd\nUniversity City\nC...,zKlU3yE_5RqNd00LVJE1oQ
charlotte,28213,"10900 University City Blvd\nCharlotte, NC 28213",NzUNtIOb7S2H1ymfp1gesA
charlotte,28213,9518 University City Blvd\nUniversity City\nCh...,Z6GFHuQXqLmlabISj9uI3Q
charlotte,28213,8500 University City Blvd\nUniversity City\nCh...,EssfLSLRxUou5uoj0LX_Kw
charlotte,28213,9510 University City Blvd\nUniversity City\nCh...,a_INgnfDhfsLaMG5O5QR8A
charlotte,28213,9630 University City Blvd\nUniversity City\nCh...,nu0giYqJOGLVsZLUdPwQOw
charlotte,28213,8428 University City Blvd\nUniversity City\nCh...,LH1cMyzVY6a2GpNAUCvh7g
charlotte,28213,8552 University City Blvd\nUniversity City\nCh...,Yolbffkso5NzevoEdIBEwA
charlotte,28213,10223 University City Blvd\nUniversity City\nC...,SNqqdZNsC1w8mL15SHJEWw
charlotte,28213,8420 University City Blvd\nUniversity City\nCh...,8-gfKVHSaBHHCmab4dUZWg


In [282]:
# Re-import merge_charlotte so edits to the module take effect in this kernel.
# `reload` is a builtin in Python 2 only.
reload(lib)

<module 'merge_charlotte' from 'merge_charlotte.py'>

In [283]:
# Check how lib.parse_address splits an address that contains a suite line and a neighborhood line.
a = u'9630 University City Blvd\nSte F\nUniversity City\nCharlotte, NC 28213'
lib.parse_address(a,['University City'])

University City


city                  charlotte
complex                        
num                        9630
street     university city blvd
suite                         f
zip                       28213
dtype: object

In [281]:
# Scratch check: collapse every run of whitespace (including the newline) into one space.
# The pattern is a raw string so that \s reaches the regex engine as written
# instead of relying on an invalid string escape.
a = 'hey  there how  \n are  you?'
re.sub(r'\s+',' ',a)

'hey there how are you?'