## Scraping Restaurant Data for London, Ontario (yellowpages.ca)

In [18]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from urllib.parse import urljoin

In [2]:
# First page of the yellowpages.ca search results for "Restaurants" in "London ON".
website = (
    'https://www.yellowpages.ca'
    '/search/si/1/Restaurants/London+ON'
)

In [3]:
# Download the first search-results page. requests applies no timeout by default,
# so an unresponsive server would hang this cell forever; the explicit timeout
# (seconds) makes the call fail with requests.Timeout instead.
response = requests.get(website, timeout=30)

In [4]:
# Quick check of the HTTP status of the search-page request (prints e.g. <Response [200]>).
print(response)

<Response [200]>


### Results

In [5]:
# Parse the raw bytes of the search page using the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')
# Display the parsed document for manual inspection of the markup.
soup

<!DOCTYPE HTML>

<html class="no-js critical-off" lang="en-CA" xml:lang="en-CA" xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://ogp.me/ns/fb#">
<head>
<script type="text/javascript">(window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"498849646",accountID:"824834",trustKey:"45968",xpid:"XAQDWVVXGwcJXFlSAgQPUg==",licenseKey:"d901b4dcbe",applicationID:"498805584"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var o=e[n]={exports:{}};t[n][0].call(o.exports,function(e){var o=t[n][1][e];return r(o||e)},o,o.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({1:[function(t,e,n){function r(t){try{s.console&&console.log(t)}catch(e){}}var o,i=t("ee"),a=t(31),s={};try{o=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console

In [78]:
# Every <div> carrying the CSS class "listing" is treated as one search result.
listing_filter = {'class': "listing"}
results = soup.find_all(name='div', attrs=listing_filter)
# Number of listing blocks found on this page.
len(results)

35

In [79]:
# Name of the first listing, read from the 'alt' attribute of the first <img>
# inside it. Note the saved output carries a ' - Restaurants' suffix.
results[0].img['alt']

'Jewel Of India Restaurant Inc - Restaurants'

In [82]:
# Street address of the first listing: text of the <span> with class "jsMapBubbleAddress".
results[0].find('span', {'class':"jsMapBubbleAddress"}).get_text()

'390 Richmond St'

In [85]:
# Text of the listing's "ratings count" link, with surrounding whitespace stripped.
# The saved output is '(17)', so this appears to be the NUMBER of ratings rather
# than a rating score — TODO confirm against the live page.
results[0].find('a', {'class': "listing__ratings__count listing__link"}).get_text().strip()

'(17)'

In [96]:
# Site-relative link to the first listing's own page, taken from the 'href'
# attribute of the listing-name anchor.
href = results[0].find('a', {'class': "listing__name--link listing__link jsListingName"})['href']
# Show the relative path (it starts with '/bus/...' in the saved output).
href

'/bus/Ontario/London/Jewel-Of-India-Restaurant-Inc/856752.html?what=Restaurants&where=London+ON&useContext=true'

In [4]:
# Base URL of the site; relative listing hrefs are joined onto it to form absolute links.
url = 'https://www.yellowpages.ca'

### Detail page of a single restaurant


In [101]:
# Absolute URL of the first listing's page: the relative href resolved against the base URL.
link = urljoin(url, href)
# Show the resulting absolute URL.
link

'https://www.yellowpages.ca/bus/Ontario/London/Jewel-Of-India-Restaurant-Inc/856752.html?what=Restaurants&where=London+ON&useContext=true'

In [102]:
# Download the detail page of the first listing. requests applies no timeout by
# default, so an unresponsive server would hang this cell forever; the explicit
# timeout (seconds) makes the call fail with requests.Timeout instead.
response_2 = requests.get(link, timeout=30)

In [103]:
# Quick check of the HTTP status of the detail-page request.
print(response_2)

<Response [200]>


In [122]:
# Parse the raw bytes of the detail page using the 'html.parser' backend.
soup2 = BeautifulSoup(response_2.content, 'html.parser')
# Display the parsed document for manual inspection of the markup.
soup2

<!DOCTYPE HTML>

<html class="no-js critical-off C00" lang="en-CA" xml:lang="en-CA" xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://ogp.me/ns/fb#">
<head>
<script type="text/javascript">(window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"498849646",accountID:"824834",trustKey:"45968",xpid:"XAQDWVVXGwcJXFlSAgQPUg==",licenseKey:"d901b4dcbe",applicationID:"498805584"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var o=e[n]={exports:{}};t[n][0].call(o.exports,function(e){var o=t[n][1][e];return r(o||e)},o,o.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({1:[function(t,e,n){function r(t){try{s.console&&console.log(t)}catch(e){}}var o,i=t("ee"),a=t(31),s={};try{o=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof con

In [121]:
# Descriptive blurb of the restaurant: text of the <article> with class
# "merchant__item merchant__teaser", stripped of surrounding whitespace.
# Raises AttributeError if the page has no such <article> (find() returns None).
article = soup2.find('article', {'class':"merchant__item merchant__teaser"}).get_text().strip()
# Show the extracted description.
article

'Take your taste buds on a trip around the world with Jewel of India Restaurant Inc. Located in London, we serve dishes such as biryani, dupiaza, dansak, tandoori, vindaloo and much more. Finish your meal with one of our succulent and signature Indian desserts. To reserve a table, call us today.'

### First page

In [123]:
# Extract name, address, rating count and relative link from every listing on the
# first results page. Each list receives exactly one entry per listing, so the
# four lists stay aligned; a field that cannot be found is recorded as 'n/a'.
names = []
addresses = []
ratings = []
hrefs = []

# Errors that mean "this tag/attribute is not present in the listing":
#   AttributeError - find() returned None and .get_text() was called on it
#   TypeError      - a missing tag (None) was subscripted
#   KeyError       - the tag exists but lacks the requested attribute
# Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate
# instead of being silently turned into 'n/a'.
missing_field_errors = (AttributeError, TypeError, KeyError)

for item in results:
    # Name: alt text of the first <img> inside the listing.
    try:
        names.append(item.img['alt'])
    except missing_field_errors:
        names.append('n/a')

    # Street address.
    try:
        addresses.append(item.find('span', {'class': "jsMapBubbleAddress"}).get_text())
    except missing_field_errors:
        addresses.append('n/a')

    # Text of the ratings-count link, e.g. '(17)'.
    try:
        ratings.append(item.find('a', {'class': "listing__ratings__count listing__link"}).get_text().strip())
    except missing_field_errors:
        ratings.append('n/a')

    # Site-relative link to the listing's own page.
    try:
        hrefs.append(item.find('a', {'class': "listing__name--link listing__link jsListingName"})['href'])
    except missing_field_errors:
        hrefs.append('n/a')

In [124]:
# Turn each relative href into an absolute URL.
# The 'n/a' placeholder (listing without a link) must be passed through as-is:
# urljoin() does not raise on it, it would instead produce the bogus URL
# 'https://www.yellowpages.ca/n/a', hiding the fact that the link is missing.
# The comprehension variable is named differently from `href` so the single
# href extracted earlier in the notebook is not overwritten.
links = [
    urljoin(url, relative_href) if relative_href != 'n/a' else 'n/a'
    for relative_href in hrefs
]

In [128]:
# Assemble the first-page lists into a table: one row per listing, one column per field.
first_page_columns = dict(
    Names=names,
    Addresses=addresses,
    Ratings=ratings,
    Links=links,
)
restaurants_page1 = pd.DataFrame(first_page_columns)

In [129]:
# Preview the first five rows of the first-page table.
restaurants_page1.head()

Unnamed: 0,Names,Addresses,Ratings,Links
0,Jewel Of India Restaurant Inc - Restaurants,390 Richmond St,(17),https://www.yellowpages.ca/bus/Ontario/London/...
1,Wille's Catering and Takeout - Restaurants,630 Dundas St,(8),https://www.yellowpages.ca/bus/Ontario/London/...
2,Little Panda Restaurant - Restaurants,389 Wharncliffe Rd S,(10),https://www.yellowpages.ca/bus/Ontario/London/...
3,Irene's Seafood - Restaurants,315A Wellington Rd,(8),https://www.yellowpages.ca/bus/Ontario/London/...
4,Michael's On The Thames - Restaurants,1 York St,(53),https://www.yellowpages.ca/bus/r/Ontario/Londo...


### Pagination

In [5]:
# Scrape every results page and collect name, address, rating count and relative
# link for each listing. Each list receives exactly one entry per listing, so the
# four lists stay aligned; a field that cannot be found is recorded as 'n/a'.
names_ = []
addresses_ = []
ratings_ = []
hrefs_ = []

# Errors that mean "this tag/attribute is not present in the listing":
#   AttributeError - find() returned None and .get_text() was called on it
#   TypeError      - a missing tag (None) was subscripted
#   KeyError       - the tag exists but lacks the requested attribute
# Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
missing_field_errors = (AttributeError, TypeError, KeyError)

# Pages 1..31 are requested; the page count is hard-coded here.
# NOTE(review): presumably the number of result pages when the notebook was
# written — confirm against the live site before re-running.
for i in range(1, 32):
    website_ = 'https://www.yellowpages.ca/search/si/' + str(i) + '/Restaurants/London+ON'
    # Explicit timeout (seconds): requests has none by default and would
    # otherwise block the whole loop on a single unresponsive request.
    response_ = requests.get(website_, timeout=30)
    if not response_.ok:
        # Make a failed page visible instead of silently parsing an error page.
        print('Skipping results page', i, '- HTTP status', response_.status_code)
        continue
    soup_ = BeautifulSoup(response_.content, 'html.parser')
    results_ = soup_.find_all('div', {'class': "listing"})

    for item in results_:
        # Name: alt text of the first <img> inside the listing.
        try:
            names_.append(item.img['alt'])
        except missing_field_errors:
            names_.append('n/a')

        # Street address.
        try:
            addresses_.append(item.find('span', {'class': "jsMapBubbleAddress"}).get_text())
        except missing_field_errors:
            addresses_.append('n/a')

        # Text of the ratings-count link, e.g. '(17)'.
        try:
            ratings_.append(item.find('a', {'class': "listing__ratings__count listing__link"}).get_text().strip())
        except missing_field_errors:
            ratings_.append('n/a')

        # Site-relative link to the listing's own page.
        try:
            hrefs_.append(item.find('a', {'class': "listing__name--link listing__link jsListingName"})['href'])
        except missing_field_errors:
            hrefs_.append('n/a')

In [6]:
# Turn each relative href into an absolute URL.
# The 'n/a' placeholder (listing without a link) must be passed through as-is:
# urljoin() does not raise on it, it would instead produce the bogus URL
# 'https://www.yellowpages.ca/n/a', which then looks like a valid link in the
# final table and triggers a pointless HTTP request when detail pages are fetched.
links_ = [
    urljoin(url, relative_href) if relative_href != 'n/a' else 'n/a'
    for relative_href in hrefs_
]

In [10]:
# Total number of listings collected across all scraped pages (one link entry per listing).
len(links_)

1065

In [7]:
# Fetch the detail page of every listing and extract its descriptive blurb.
# One entry is appended per link so `articles_` stays aligned with `links_`;
# 'n/a' marks listings whose page could not be fetched or has no blurb.
articles_ = []
for item in links_:
    # A placeholder link has no page to download.
    if item == 'n/a':
        articles_.append('n/a')
        continue
    try:
        # Explicit timeout (seconds): with this many sequential requests a single
        # stalled connection would otherwise block the loop indefinitely.
        response_3 = requests.get(item, timeout=30)
        soup_3 = BeautifulSoup(response_3.content, 'html.parser')
        teaser = soup_3.find('article', {'class': "merchant__item merchant__teaser"})
        # teaser is None when the page has no blurb -> AttributeError below.
        articles_.append(teaser.get_text().strip())
    except (requests.RequestException, AttributeError):
        # Only network failures and a missing blurb are treated as "no article";
        # other exceptions (notably KeyboardInterrupt) propagate so the long
        # loop can still be interrupted.
        articles_.append('n/a')

In [10]:
# Assemble the scraped lists into one table: one row per listing, one column per field.
all_pages_columns = dict(
    Names=names_,
    Addresses=addresses_,
    Ratings=ratings_,
    Articles=articles_,
    Links=links_,
)
all_restaurants_London = pd.DataFrame(all_pages_columns)

In [12]:
# Preview the first five rows of the combined table.
all_restaurants_London.head()

Unnamed: 0,Names,Addresses,Ratings,Articles,Links
0,Jewel Of India Restaurant Inc - Restaurants,390 Richmond St,(17),Take your taste buds on a trip around the worl...,https://www.yellowpages.ca/bus/Ontario/London/...
1,Wille's Catering and Takeout - Restaurants,630 Dundas St,(8),Come enjoy a delightful meal at Willie's Cafe....,https://www.yellowpages.ca/bus/Ontario/London/...
2,Little Panda Restaurant - Restaurants,389 Wharncliffe Rd S,(10),Too tired to cook? Call Little Panda Restauran...,https://www.yellowpages.ca/bus/Ontario/London/...
3,Irene's Seafood - Restaurants,315A Wellington Rd,(8),English Style Fish & Chips Specializing in Hal...,https://www.yellowpages.ca/bus/Ontario/London/...
4,Michael's On The Thames - Restaurants,1 York St,(53),Enjoy superb continental cuisine in the relaxi...,https://www.yellowpages.ca/bus/r/Ontario/Londo...


### Data Cleaning

In [16]:
# Remove the parentheses around each rating count, e.g. '(17)' -> '17'.
# Values without parentheses (such as the 'n/a' placeholder) pass through unchanged.
drop_parentheses = str.maketrans('', '', '()')
all_restaurants_London['Ratings'] = all_restaurants_London['Ratings'].map(
    lambda rating: rating.translate(drop_parentheses)
)
# Preview the cleaned column.
all_restaurants_London.head()

Unnamed: 0,Names,Addresses,Ratings,Articles,Links
0,Jewel Of India Restaurant Inc - Restaurants,390 Richmond St,17,Take your taste buds on a trip around the worl...,https://www.yellowpages.ca/bus/Ontario/London/...
1,Wille's Catering and Takeout - Restaurants,630 Dundas St,8,Come enjoy a delightful meal at Willie's Cafe....,https://www.yellowpages.ca/bus/Ontario/London/...
2,Little Panda Restaurant - Restaurants,389 Wharncliffe Rd S,10,Too tired to cook? Call Little Panda Restauran...,https://www.yellowpages.ca/bus/Ontario/London/...
3,Irene's Seafood - Restaurants,315A Wellington Rd,8,English Style Fish & Chips Specializing in Hal...,https://www.yellowpages.ca/bus/Ontario/London/...
4,Michael's On The Thames - Restaurants,1 York St,53,Enjoy superb continental cuisine in the relaxi...,https://www.yellowpages.ca/bus/r/Ontario/Londo...


In [23]:
# Column dtypes and non-null counts. Missing fields are still the string 'n/a'
# at this point, so every column reports as fully non-null.
all_restaurants_London.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1065 entries, 0 to 1064
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Names      1065 non-null   object
 1   Addresses  1065 non-null   object
 2   Ratings    1065 non-null   object
 3   Articles   1065 non-null   object
 4   Links      1065 non-null   object
dtypes: object(5)
memory usage: 41.7+ KB


In [27]:
# Turn the 'n/a' placeholders into real missing values, then keep only the rows
# that still have at least 3 non-missing fields out of the 5 columns.
# `how` is intentionally not passed: pandas (1.5+) raises TypeError when `how`
# and `thresh` are both given, and in the older versions that accepted both,
# `thresh` took precedence, so the set of surviving rows is unchanged.
my_data = all_restaurants_London.replace('n/a', np.nan)
my_data = my_data.dropna(axis=0, thresh=3)
# Preview the filtered table.
my_data.head()

Unnamed: 0,Names,Addresses,Ratings,Articles,Links
0,Jewel Of India Restaurant Inc - Restaurants,390 Richmond St,17,Take your taste buds on a trip around the worl...,https://www.yellowpages.ca/bus/Ontario/London/...
1,Wille's Catering and Takeout - Restaurants,630 Dundas St,8,Come enjoy a delightful meal at Willie's Cafe....,https://www.yellowpages.ca/bus/Ontario/London/...
2,Little Panda Restaurant - Restaurants,389 Wharncliffe Rd S,10,Too tired to cook? Call Little Panda Restauran...,https://www.yellowpages.ca/bus/Ontario/London/...
3,Irene's Seafood - Restaurants,315A Wellington Rd,8,English Style Fish & Chips Specializing in Hal...,https://www.yellowpages.ca/bus/Ontario/London/...
4,Michael's On The Thames - Restaurants,1 York St,53,Enjoy superb continental cuisine in the relaxi...,https://www.yellowpages.ca/bus/r/Ontario/Londo...


In [28]:
# Inspect the last five rows; the index has gaps where sparse rows were dropped.
my_data.tail()

Unnamed: 0,Names,Addresses,Ratings,Articles,Links
1051,Momos at the Market - Restaurants,London,,,https://www.yellowpages.ca/bus/Ontario/London/...
1053,Wok Box Fresh Asian Kitchen - Restaurants,3099 Wonderland Rd S,,,https://www.yellowpages.ca/bus/Ontario/Wok-Box...
1054,Kelsey's Restaurant - Restaurants,900 Oxford St East,,,https://www.yellowpages.ca/bus/Ontario/Kelsey-...
1056,Crabby's BBQ Shack - Restaurants,,,Come enjoy a delightful creation at Crabby's B...,https://www.yellowpages.ca/bus/Ontario/Crabby-...
1058,Shiny Panes - Restaurants,London,,,https://www.yellowpages.ca/bus/Ontario/London/...


In [29]:
# Non-null counts per column after the 'n/a' placeholders became NaN and sparse rows were dropped.
my_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1029 entries, 0 to 1058
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Names      1029 non-null   object
 1   Addresses  1022 non-null   object
 2   Ratings    308 non-null    object
 3   Articles   627 non-null    object
 4   Links      1029 non-null   object
dtypes: object(5)
memory usage: 48.2+ KB


In [30]:
# Write the cleaned table to an Excel workbook at a path relative to the current
# working directory; index=False leaves the DataFrame index out of the sheet.
my_data.to_excel('all_restaurants_London.xlsx', index = False)