# IBM Data Science Capstone 


## Project Definition 
### Houston Restaurant Opening
A business owner would like to open a restaurant in Houston, Texas. He has two questions:
> 1) What type of restaurant/cuisine he should consider? 

> 2) Where in Houston should he open his restaurant? He would like a Data Scientist to help him identify the most popular cuisine in Houston and the location/neighborhood with the potential for the highest traffic and the most customers.


### Data
> 1) A list of all restaurants in Houston, which can be obtained from Yelp or other sources, to analyze and gain insight into the restaurants that are currently open  
> 2) The location of every restaurant, which can be obtained from Foursquare, to gain spatial information about the restaurants  
> 3) Reviews/ratings of all restaurants, which can be obtained from Foursquare or Yelp  
> 4) Houston district information, which can be obtained from Wikipedia or the City of Houston government website

### Step 1: Scrape list of all restaurants in Houston from Yelp 

In [294]:
from lxml import html
import unicodecsv as csv
import requests
from time import sleep
import re
import argparse
import json
import pandas as pd


def _new_ui_records(raw_script_text):
    """Build result dicts from the JSON payload embedded in the new Yelp UI."""
    cleaned_json = raw_script_text.replace('<!--', '').replace('-->', '').strip()
    json_loaded = json.loads(cleaned_json)
    search_results = json_loaded['searchPageProps']['searchResultsProps']['searchResults']

    records = []
    for entry in search_results:
        result = entry.get('searchResultBusiness')
        if not result:
            # Entry carries no business payload; nothing to record.
            continue
        # Filtering out ads: a result is kept only when it is explicitly
        # flagged as not being an ad (a missing flag is treated as an ad,
        # as the original `is_ad == False` test did).
        if result.get('isAd') is not False:
            continue

        # `categories` / `businessUrl` may be absent; fall back to empty
        # values instead of raising on None.
        titles = [item['title'] for item in (result.get('categories') or [])]
        business_url = "https://www.yelp.com" + (result.get('businessUrl') or '')

        records.append({
            'business_name': result.get('name'),
            'rank': result.get('ranking'),
            'review_count': result.get('reviewCount'),
            'categories': ','.join(titles),
            'rating': result.get('rating'),
            'address': result.get('formattedAddress'),
            'price_range': result.get('priceRange'),
            'url': business_url
        })
    return records


def _old_ui_records(listing):
    """Build result dicts from the <li> result nodes of the old Yelp UI."""
    records = []
    for item in listing:
        raw_position = item.xpath(".//span[@class='indexed-biz-name']/text()")
        raw_name = item.xpath(".//span[@class='indexed-biz-name']/a//text()")
        raw_ratings = item.xpath(".//div[contains(@class,'rating-large')]//@title")
        raw_review_count = item.xpath(".//span[contains(@class,'review-count')]//text()")
        raw_price_range = item.xpath(".//span[contains(@class,'price-range')]//text()")
        category_list = item.xpath(".//span[contains(@class,'category-str-list')]//a//text()")
        raw_address = item.xpath(".//address//text()")
        hrefs = item.xpath(".//span[@class='indexed-biz-name']/a/@href")

        cleaned_reviews = ''.join(raw_review_count).strip()
        cleaned_ratings = ''.join(raw_ratings).strip()
        # The decimal part is optional so a title such as "4 star rating"
        # still yields a rating; no match at all falls back to 0.
        rating_matches = re.findall(r"\d+(?:[.,]\d+)?", cleaned_ratings)

        records.append({
            'business_name': ''.join(raw_name).strip(),
            'rank': ''.join(raw_position).replace('.', '').strip(),
            # Keep only the digits of the review-count text.
            'review_count': re.sub(r"\D+", "", cleaned_reviews),
            'categories': ','.join(category_list),
            'rating': rating_matches[0] if rating_matches else 0,
            # Collapse runs of whitespace in the address to single spaces.
            'address': ' '.join(' '.join(raw_address).split()),
            # Price range is the length of the price text (0 when absent).
            'price_range': len(''.join(raw_price_range)),
            'url': "https://www.yelp.com" + (hrefs[0] if hrefs else '')
        })
    return records


def parse(url):
    """Scrape one Yelp search-results page.

    The page is requested up to 10 times until an HTTP 200 is received.
    Results are then read either from the JSON embedded in the new UI
    (ads are filtered out) or, failing that, from the old UI's HTML list.

    Parameters
    ----------
    url : str
        URL of the Yelp search-results page.

    Returns
    -------
    list of dict
        One dict per business with the keys 'business_name', 'rank',
        'review_count', 'categories', 'rating', 'address', 'price_range'
        and 'url'. An empty list is returned when the page could not be
        fetched or when neither layout is recognised, so callers can always
        iterate over the result.
    """
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chrome/70.0.3538.77 Safari/537.36'}

    for retry in range(10):
        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept as in the original, but consider removing it.
            response = requests.get(url, verify=False, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print("Request failed: %s. Retrying : %s" % (exc, url))
            continue
        if response.status_code == 200:
            break
        print("Response received: %s. Retrying : %s"%(response.status_code, url))
    else:
        # Every attempt failed: do not try to parse an error page.
        print("Failed to process the URL: ", url)
        return []

    parser = html.fromstring(response.text)

    # Case 1: Getting data from new UI
    raw_json = parser.xpath("//script[contains(@data-hypernova-key,'yelp_main__SearchApp')]//text()")
    if raw_json:
        print('Grabbing data from new UI')
        return _new_ui_records(raw_json[0])

    # Case 2: Getting data from OLD UI
    listing = parser.xpath("//li[@class='regular-search-result']")
    if listing:
        print('Grabbing data from OLD UI')
        return _old_ui_records(listing)

    # Neither layout matched.
    return []



In [297]:
# Search parameters for the Yelp query.
place = "Houston"
search_query = "Restaurant"
result_columns = ['business_name','rank','review_count','categories','rating','address','price_range','url']

# Yelp paginates through the `start` offset; 33 pages of 30 results are requested.
PAGE_COUNT = 33
PAGE_SIZE = 30

# Collect the plain dicts first and build the DataFrame once at the end:
# appending one row at a time copies the frame on every call, and
# DataFrame.append no longer exists in pandas 2.0.
scraped_rows = []
for page in range(PAGE_COUNT):
    yelp_url = "https://www.yelp.com/search?find_desc=%s&find_loc=%s&start=%s" % (search_query, place, str(page * PAGE_SIZE))
    print ("Retrieving :", yelp_url)
    # `or []` keeps the loop going if a page yields no result list.
    scraped_rows.extend(parse(yelp_url) or [])

df = pd.DataFrame(scraped_rows, columns=result_columns)

# Clean up names: drop the 'amp;' left over from HTML-escaped ampersands ('&amp;' -> '&').
df['business_name'] = df['business_name'].str.replace('amp;', '')
df['categories'] = df['categories'].str.replace('amp;', '')



Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=0




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=30




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=60




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=90




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=120




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=150




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=180




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=210




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=240




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=270




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=300




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=330




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=360




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=390




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=420




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=450




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=480




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=510




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=540




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=570




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=600




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=630




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=660




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=690




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=720




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=750




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=780




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=810




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=840




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=870




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=900




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=930




Grabbing data from new UI
Retrieving : https://www.yelp.com/search?find_desc=Restaurant&find_loc=Houston&start=960




Grabbing data from new UI


In [595]:
# keep=False discards every row whose business name occurs more than once
# (no copy of a duplicated name survives); the index is then renumbered from 0.
df1 = df.drop_duplicates(subset=['business_name'], keep=False).reset_index(drop=True)
df1.head()

Unnamed: 0,business_name,rank,review_count,categories,rating,address,price_range,url
0,BCK Kitchen & Cocktail Adventures,1,453,"American (New),American (Traditional),Cocktail...",4.5,933 Studewood St,$$,https://www.yelp.com/biz/bck-kitchen-and-cockt...
1,Ritual,2,1087,"Southern,American (New)",4.0,602 Studewood St,$$,https://www.yelp.com/biz/ritual-houston?osq=Re...
2,The Pit Room,3,1135,Barbeque,4.5,1201 Richmond Ave,$$,https://www.yelp.com/biz/the-pit-room-houston?...
3,The Breakfast Klub,4,3704,"Breakfast & Brunch,Southern",4.5,3711 Travis St,$$,https://www.yelp.com/biz/the-breakfast-klub-ho...
4,Elliot’s Table,5,38,American (New),4.0,465 T C Jester Blvd,,https://www.yelp.com/biz/elliots-table-houston...


#### Get all unique cuisine types

In [598]:

# Distinct category strings (each one is a comma-separated list of cuisines).
allcat = df1.categories.unique()

# Glue them into a single comma-separated string; `a` is reused by later cells.
a = ','.join(allcat).lstrip(',')
unique_cuisine = set(a.split(","))

# Alphabetical list of the cuisine labels with leading spaces removed.
list_unique_cuisine = sorted(label.lstrip(' ') for label in unique_cuisine)
  

#### Count the number of restaurants per cuisine

In [299]:
# Tally how often each cuisine label occurs in the combined string `a`.
# NOTE(review): `a` is built from the *unique* category strings, so restaurants
# sharing an identical category string contribute only once - confirm intended.
cuisine_count = {}
for cuisine in a.split(","):
    cuisine_count[cuisine] = cuisine_count.get(cuisine, 0) + 1

# Highest count first; ties keep their first-seen order.
cuisine_count_sorted = sorted(cuisine_count.items(), key=lambda pair: -pair[1])

print("Ranking of No. of restaurnat for Cuisine: ")
cuisine_count_sorted[:20]

Ranking of No. of restaurnat for Cuisine: 


[('Breakfast & Brunch', 75),
 ('Seafood', 70),
 ('American (New)', 66),
 ('American (Traditional)', 57),
 ('Bars', 51),
 ('Burgers', 48),
 ('Sandwiches', 47),
 ('Coffee & Tea', 37),
 ('Food Trucks', 34),
 ('Mexican', 33),
 ('Cocktail Bars', 32),
 ('Southern', 29),
 ('Cajun/Creole', 29),
 ('Wine Bars', 28),
 ('Italian', 26),
 ('Pizza', 25),
 ('Steakhouses', 23),
 ('Cafes', 23),
 ('Mediterranean', 22),
 ('Bakeries', 22)]

#### Select the most popular restaurants (rating >= 4, review_count >= 100), sorted by rating and review count

In [300]:
# "Popular" = rated at least 4 with at least 100 reviews; best rated and most reviewed first.
meets_rating = df1['rating'] >= 4
meets_reviews = df1['review_count'] >= 100
df_mostpop_rest = df1.loc[meets_rating & meets_reviews]
df_mostpop_rest = df_mostpop_rest.sort_values(by=['rating', 'review_count'], ascending=False)

In [301]:
# Distinct category strings among the popular restaurants, glued into one
# comma-separated string.
allcat_mostpop_rest = df_mostpop_rest.categories.unique()
a_mostpop_rest = ','.join(allcat_mostpop_rest).lstrip(',')
unique_cuisine_mostpop_rest = set(a_mostpop_rest.split(","))

# Alphabetical list of the cuisine labels with leading spaces removed.
list_unique_cuisine_mostpop_rest = sorted(
    label.lstrip(' ') for label in unique_cuisine_mostpop_rest
)

# Tally how often each cuisine label occurs.
cuisine_count_mostpop_rest = {}
for cuisine in a_mostpop_rest.split(","):
    cuisine_count_mostpop_rest[cuisine] = cuisine_count_mostpop_rest.get(cuisine, 0) + 1

# Highest count first; ties keep their first-seen order.
cuisine_count_mostpop_rest_sorted = sorted(
    cuisine_count_mostpop_rest.items(), key=lambda pair: -pair[1]
)

print("Most popular restaurant cuisine type:")
cuisine_count_mostpop_rest_sorted[:20]

Most popular restaurant cuisine type:


[('American (New)', 41),
 ('Breakfast & Brunch', 39),
 ('Seafood', 28),
 ('Coffee & Tea', 27),
 ('Sandwiches', 24),
 ('Wine Bars', 22),
 ('American (Traditional)', 20),
 ('Burgers', 18),
 ('Mexican', 17),
 ('Italian', 17),
 ('Pizza', 17),
 ('Bars', 16),
 ('Cocktail Bars', 16),
 ('Mediterranean', 14),
 ('Steakhouses', 14),
 ('Sushi Bars', 13),
 ('Southern', 12),
 ('Food Trucks', 12),
 ('Vietnamese', 12),
 ('Cafes', 12)]

### Step 2: Get Houston Neighbourhood from Wikipedia 

In [246]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [262]:
# Download the Wikipedia list of Houston neighborhoods. A timeout keeps the
# cell from hanging forever, and raise_for_status() stops on an HTTP error
# instead of parsing an error page.
wiki_response = requests.get('https://en.wikipedia.org/wiki/List_of_Houston_neighborhoods', timeout=30)
wiki_response.raise_for_status()
weburl = wiki_response.text

soup = BeautifulSoup(weburl, 'lxml')

# The first table with class "wikitable" is used as the neighborhood table.
My_table = soup.find('table', {'class': 'wikitable'})
if My_table is None:
    raise ValueError("No table with class 'wikitable' found on the Wikipedia page")

# All table rows; the first one is skipped as the header by the next cell.
neigh = My_table.find_all('tr')

In [271]:

# Build the neighborhood table from the scraped rows, skipping row 0.
# contents[3] and contents[5] are read as the name and relative-location
# cells; newlines inside the cell text are removed.
neighbor_rows = [
    [row.contents[3].text.replace('\n', ''), row.contents[5].text.replace('\n', '')]
    for row in neigh[1:]
]

# Index starts at 1 so it matches the row position in the scraped table.
df_neighbor = pd.DataFrame(
    neighbor_rows,
    columns=['Name', 'RelativeLocation'],
    index=range(1, len(neighbor_rows) + 1),
)

df_neighbor.head()

Unnamed: 0,Name,RelativeLocation
1,Willowbrook,Northwest
2,Greater Greenspoint,North
3,Carverdale,Northwest
4,Fairbanks / Northwest Crossing,Northwest
5,Greater Inwood,Northwest


In [None]:
#### Get Lat/Longitude of Neighborhood

### Step 3: Get Lat/Longitude of all restaurant 

In [305]:
from geopy.geocoders import Nominatim
address = '401 McIlhenny St,Houston,TX'
# Nominatim needs an application-specific user agent: relying on the default
# one is deprecated in geopy 1.x and raises an error in geopy 2.x.
geolocator = Nominatim(user_agent="houston_restaurant_capstone")


  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Willowbrook are 29.74966, -95.37813.


In [349]:
# df2 is a second name for the df1 frame (no copy is made), so the empty-string
# placeholder columns added here also show up on df1.
df2 = df1
for coord_column in ('Latitude', 'Longitude'):
    df2[coord_column] = ''


In [354]:
# One geocoder for the whole loop, created with an explicit user agent
# (the default user agent is deprecated in geopy 1.x and rejected in 2.x).
geolocator = Nominatim(user_agent="houston_restaurant_capstone")

# Column positions for positional writes; `.ix` was removed in pandas 1.0.
lat_pos = df2.columns.get_loc('Latitude')
lon_pos = df2.columns.get_loc('Longitude')

for i, street in enumerate(df2['address']):
    print(i)  # progress indicator
    # A missing or blank street address cannot be geocoded meaningfully;
    # leave the '' placeholders in place for that row.
    if not isinstance(street, str) or not street.strip():
        continue
    address = street + ',Houston,TX'
    location = geolocator.geocode(address)
    # geocode() returned nothing for this address: keep the '' placeholders.
    if location is not None:
        df2.iloc[i, lat_pos] = location.latitude
        df2.iloc[i, lon_pos] = location.longitude

750


  after removing the cwd from sys.path.


751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785


In [378]:
# Preview the first rows of df2 to check the filled Latitude/Longitude columns.
df2.head()

Unnamed: 0,business_name,rank,review_count,categories,rating,address,price_range,url,Latitude,Longitude
0,BCK Kitchen & Cocktail Adventures,1,453,"American (New),American (Traditional),Cocktail...",4.5,933 Studewood St,$$,https://www.yelp.com/biz/bck-kitchen-and-cockt...,29.7887,-95.388
1,Ritual,2,1087,"Southern,American (New)",4.0,602 Studewood St,$$,https://www.yelp.com/biz/ritual-houston?osq=Re...,29.7818,-95.3876
2,The Pit Room,3,1135,Barbeque,4.5,1201 Richmond Ave,$$,https://www.yelp.com/biz/the-pit-room-houston?...,29.7344,-95.3938
3,The Breakfast Klub,4,3704,"Breakfast & Brunch,Southern",4.5,3711 Travis St,$$,https://www.yelp.com/biz/the-breakfast-klub-ho...,29.7385,-95.3804
4,Elliot’s Table,5,38,American (New),4.0,465 T C Jester Blvd,,https://www.yelp.com/biz/elliots-table-houston...,29.7795,-95.4182


### Step 4: Create Map of Houston

In [380]:
# Geocode the city itself; latitude/longitude are used to centre the map.
address = 'Houston, TX'
# Explicit user agent: the default one is deprecated in geopy 1.x and
# rejected in geopy 2.x.
geolocator = Nominatim(user_agent="houston_restaurant_capstone")
location = geolocator.geocode(address)
if location is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise RuntimeError("Could not geocode %r" % address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Houston are {}, {}.'.format(latitude, longitude))


  


The geograpical coordinate of Houston are 29.7589382, -95.3676974.


In [591]:

# Create a folium map of Houston centred on the `latitude`/`longitude` values,
# with an initial zoom level of 10.
import folium
from geopy.geocoders import Nominatim

map_Houston = folium.Map(location=[latitude, longitude], zoom_start=10)
# Bare expression so the notebook displays the map.
map_Houston

#### Plot Neighbourhood on Map

In [592]:
# Draw one large blue circle per neighborhood on the Houston map; the popup
# shows the neighborhood name.
neighborhood_points = zip(
    df_neighbor['Latitude'],
    df_neighbor['Longitude'],
    df_neighbor['Name'],
)
for lat, lng, name in neighborhood_points:
    # Rows whose latitude is still the '' placeholder have no coordinates.
    if lat == '':
        continue
    marker = folium.CircleMarker(
        [lat, lng],
        radius=20,
        popup=folium.Popup('{}'.format(name), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
    )
    marker.add_to(map_Houston)

map_Houston

#### Get most popular restaurant 

In [542]:
# Popular restaurants (rating >= 4 and at least 100 reviews), best rated and
# most reviewed first ...
meets_rating = df2['rating'] >= 4
meets_reviews = df2['review_count'] >= 100
df_mostpop_rest = df2.loc[meets_rating & meets_reviews]
df_mostpop_rest = df_mostpop_rest.sort_values(by=['rating', 'review_count'], ascending=False)

# ... restricted to the rows whose Latitude is no longer the '' placeholder.
df_mostpop_rest = df_mostpop_rest.loc[df_mostpop_rest['Latitude'] != '']
df_mostpop_rest

Unnamed: 0,business_name,rank,review_count,categories,rating,address,price_range,url,Latitude,Longitude
76,Diced Poke,89,287,"Hawaiian,Poke",5.0,917 Winbern St,$$,https://www.yelp.com/biz/diced-poke-houston?os...,29.739,-95.3802
191,Local Poke,278,131,"Poke,Hawaiian",5.0,4500 Washington Ave,$$,https://www.yelp.com/biz/local-poke-houston-2?...,29.7706,-95.4063
3,The Breakfast Klub,4,3704,"Breakfast & Brunch,Southern",4.5,3711 Travis St,$$,https://www.yelp.com/biz/the-breakfast-klub-ho...,29.7385,-95.3804
14,JINYA Ramen Bar,15,2540,Ramen,4.5,3201 Louisiana St,$$,https://www.yelp.com/biz/jinya-ramen-bar-houst...,29.743,-95.3797
61,Uchi,67,1697,"Sushi Bars,Japanese",4.5,904 Westheimer Rd,$$$,https://www.yelp.com/biz/uchi-houston?osq=Rest...,29.745,-95.3907
19,Peli Peli,21,1399,"South African,American (New)",4.5,5085 Westheimer Rd,$$$,https://www.yelp.com/biz/peli-peli-houston-5?o...,29.7397,-95.462
2,The Pit Room,3,1135,Barbeque,4.5,1201 Richmond Ave,$$,https://www.yelp.com/biz/the-pit-room-houston?...,29.7344,-95.3938
5,Bosscat Kitchen & Libations,6,1008,"American (New),Whiskey Bars,Southern",4.5,4310 Westheimer Rd,$$,https://www.yelp.com/biz/bosscat-kitchen-and-l...,29.738,-95.4784
24,Stanton’s City Bites,26,987,"Burgers,American (Traditional),Sandwiches",4.5,1420 Edwards St,$,https://www.yelp.com/biz/stantons-city-bites-h...,29.7709,-95.3721
36,Steak 48,41,823,"Steakhouses,Seafood,Bars",4.5,4444 Westheimer Rd,$$$$,https://www.yelp.com/biz/steak-48-houston-2?os...,29.7435,-95.4536


#### Assign each restaurant to its nearest neighborhood

In [578]:
import numpy as np

# Work on an independent copy so adding a column does not write into a
# filtered view of df2.
df_mostpop_rest = df_mostpop_rest.copy()

# Neighborhood coordinates as floats. Blank or non-numeric entries become NaN
# and those neighborhoods are left out of the search.
neigh_coords = df_neighbor[['Latitude', 'Longitude']].apply(pd.to_numeric, errors='coerce')
has_coords = neigh_coords.notna().all(axis=1).values
NeighCord = neigh_coords.values[has_coords].astype(float)
neigh_names = df_neighbor['Name'].values[has_coords]
if len(NeighCord) == 0:
    raise ValueError("df_neighbor has no rows with numeric Latitude/Longitude")

# Restaurant coordinates, shape (n_restaurants, 2).
rest_coords = df_mostpop_rest[['Latitude', 'Longitude']].values.astype(float)

# Squared distance in raw degrees between every restaurant and every
# neighborhood, computed in one broadcasted step:
# (n_restaurants, 1, 2) - (1, n_neighborhoods, 2).
distDiff = rest_coords[:, np.newaxis, :] - NeighCord[np.newaxis, :, :]
distMatrix = np.sum(distDiff * distDiff, axis=2)

# Index of the closest neighborhood for each restaurant (first one on ties).
closestIndex = np.argmin(distMatrix, axis=1)
df_mostpop_rest['Neighbor'] = neigh_names[closestIndex]

# Drop rows that contain a missing value in any column.
df_mostpop_rest.dropna(inplace=True)
df_mostpop_rest

#### Map Most Popular Restaurant 

In [593]:
# Draw one small green circle per popular restaurant; the fill opacity scales
# with the restaurant's own rating (rating / 5).
# All four columns come from df_mostpop_rest so that each rating belongs to the
# restaurant being drawn (zipping df2['rating'] paired the rows with the
# ratings of unrelated restaurants).
restaurant_points = zip(
    df_mostpop_rest['Latitude'],
    df_mostpop_rest['Longitude'],
    df_mostpop_rest['business_name'],
    df_mostpop_rest['rating'],
)
for lat, lng, name, rating in restaurant_points:
    # Rows whose latitude is still the '' placeholder have no coordinates.
    if lat == '':
        continue
    label = folium.Popup('{}'.format(name), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=float(rating) / 5,
    ).add_to(map_Houston)
map_Houston

In [586]:
# Number of popular restaurants assigned to each neighborhood.
df_mostpop_rest.Neighbor.value_counts()


Sunnyside                                      82
Pecan Park                                     59
East Little York / Homestead                   46
Sharpstown                                     41
Central Northwest (formerly Near Northwest)    26
Greater Heights                                25
Downtown                                       17
Medical Center                                 11
Midtown                                         8
Neartown / Montrose                             5
Braeburn                                        4
Lazybrook / Timbergrove                         3
Greater Hobby Area                              3
Spring Branch West                              3
Greenway / Upper Kirby                          2
Golfcrest / Bellfort / Reveille                 2
Willowbrook                                     2
Clear Lake                                      1
East Houston                                    1
Addicks / Park Ten                              1
