In [3]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

## Preparing a list of URLs to scrape

In [5]:
# Base listing page for the crawl; the page number is appended as a query string later
root_url = "https://www.zomato.com/kolkata/dinner"

In [6]:
# Request headers passed to every requests.get call in this notebook:
# a desktop Chrome User-Agent string, so the scraper presents itself as a browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}

Raw request headers captured from a browser session, kept for reference (cookie values redacted):

```http
Host: www.zomato.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://www.zomato.com/
DNT: 1
Connection: keep-alive
Cookie: <redacted: values for fbcity, zl, fbtrack, dpr, G_ENABLED_IDPS, o2menub, PHPSESSID, session_id, csrf, ak_bmsc, bm_sv removed; never commit live session cookies>
Upgrade-Insecure-Requests: 1
Cache-Control: max-age=0
```

In [15]:
# Walk the paginated restaurant listing and collect, for every restaurant
# found, the URL of its reviews page.
rest_urls = []

# Last listing page to request (presumably the number of result pages that
# existed when this was run; adjust if the listing grows or shrinks).
last_page = 268

pbar = tqdm(range(1, last_page + 1))
for i in pbar:
    try:
        # Downloading page containing list of restaurants. The timeout keeps a
        # stalled connection from hanging the whole crawl, and
        # raise_for_status() stops error pages from being parsed as if they
        # were valid listings.
        page = requests.get(f'{root_url}?page={i}', headers=headers, timeout=30)
        page.raise_for_status()
    except requests.RequestException as exc:
        # One bad page must not discard the URLs gathered so far
        pbar.write(f'Failed to retrieve page {i}: {exc}')
        continue

    # Cooking a soup for easy digestion
    soup = BeautifulSoup(page.text, 'html.parser')

    # Extracting a list of all restaurants
    res = soup.find_all('div', attrs={'class': 'search_left_featured clearfix'})

    for entry in res:
        # Only storing the URL of the restaurant; entries without a link
        # (or with a link lacking href) are skipped instead of crashing
        link = entry.find('a')
        url = link.get('href') if link is not None else None
        if not url:
            continue

        # Replacing /info with /reviews so as to navigate to the reviews directly
        rest_urls.append(url.replace('/info', '/reviews'))
    pbar.set_description(f'Page [{i}] scraped!')

Page [268] scraped!: 100%|███████████████████████████████████████████████████████████| 268/268 [06:30<00:00,  1.40s/it]


In [19]:
# Sanity check on the crawl: report how many URLs were collected and show
# the first ten of them, one per line.
print(f'Number of urls captured : {len(rest_urls)}', 'Printing few urls :', sep='\n')
for sample_url in rest_urls[:10]:
    print(sample_url)

Number of urls captured : 4005
Printing few urls :
https://www.zomato.com/kolkata/the-saffron-tree-southern-avenue/reviews
https://www.zomato.com/kolkata/spice-kraft-ballygunge/reviews
https://www.zomato.com/kolkata/peter-cat-park-street-area/reviews
https://www.zomato.com/kolkata/barbeque-nation-sector-5-salt-lake/reviews
https://www.zomato.com/kolkata/whats-up-southern-avenue/reviews
https://www.zomato.com/kolkata/jw-kitchen-jw-marriott-hotel-kolkata-science-city-area/reviews
https://www.zomato.com/kolkata/barbq-park-street-area/reviews
https://www.zomato.com/kolkata/ozora-kasba/reviews
https://www.zomato.com/kolkata/carpe-diem-2-park-street-area/reviews
https://www.zomato.com/kolkata/capella-altair-boutique-hotel-sector-5-salt-lake/reviews


## Scraping reviews and corresponding ratings

In [20]:
# Visit every restaurant's reviews page and collect (review text, rating)
# pairs. `texts` and `labels` are filled in lockstep, so index k of one list
# always belongs to index k of the other.
texts = []
labels = []
pbar = tqdm(rest_urls)
for url in pbar:
    try:
        # A little verbosity doesn't hurt
        pbar.set_description(f'Fetching {url}')

        # Downloading the restaurant page. The timeout prevents one stalled
        # connection from freezing the crawl; raise_for_status() turns HTTP
        # error responses into reported failures instead of silently
        # yielding zero reviews.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Cooking a soup
        page = BeautifulSoup(response.text, 'html.parser')

        # Looking for reviews
        reviews = page.find_all('div', attrs={'class': 'rev-text mbot0 '})

        for review in reviews:
            # The first nested div carries the rating in its aria-label;
            # the last three characters are parsed as the numeric score
            rating_div = review.find('div')
            label = float(rating_div['aria-label'][-3:])

            # Clearing out some junk: drop the rating div so that only the
            # review body remains in the text
            rating_div.decompose()

            # Extracting the text
            text = review.text.strip()

            # Storing our treasures (both appended only after both were
            # extracted successfully, keeping the lists aligned)
            labels.append(label)
            texts.append(text)

    except KeyboardInterrupt:
        # Manually breaking the loop
        break
    except Exception as exc:
        # Something went wrong, so skip the current url but say why.
        # Catching Exception (not a bare except) lets SystemExit and other
        # interpreter-level signals propagate.
        pbar.write(f'Failed to retrieve url: {url} ({exc!r})')

# Making a pandas DataFrame from our data in a single construction
data = pd.DataFrame({'text': texts, 'label': labels})

                                                                                                                                                               

Failed to retrieve url: https://www.zomato.com/kolkata/dakshinayan-restaurant-joka/reviews


Fetching https://www.zomato.com/kolkata/taste-of-south-indian-ballygunge/reviews: 100%|█| 4005/4005 [1:04:18<00:00,  1.30it/s]                         



## Glimpse of the data

In [23]:
# Preview the first five scraped reviews alongside their ratings
data.head(n=5)

Unnamed: 0,text,label
0,We visited this restaurant on my birthday for ...,5.0
1,Finally got the chance to visit this place. He...,5.0
2,"So, this is another famous place in Southern A...",5.0
3,A delight for all mughlai lovers. Went there f...,5.0
4,"Well, The Saffron Tree has now become my one o...",5.0


In [24]:
# (number of reviews, number of columns) in the scraped dataset
data.shape

(16910, 2)

## Saving the data to disk

In [25]:
# Persist the scraped reviews. The target folder is created first so that a
# missing `data/` directory cannot make the save fail after the long scrape.
output_path = Path('data/reviews_2.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)