In [None]:
import re
from io import StringIO
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Root URL of the Tabelog area listing to crawl. Listing URLs are later built
# from the slice area_url[-6:-1] (here 'A1308'), so the value must keep its
# trailing slash directly after that 5-character code.
area_url = 'https://tabelog.com/tokyo/A1308/'

In [None]:
def _search_slice(pattern, text, start, stop):
    """Return ``match[0][start:stop]`` for the first match of `pattern` in `text`.

    Returns None when the pattern does not match. The fixed offsets cut the
    key name and the surrounding quote/comma characters off the matched
    fragment.
    """
    match = re.search(pattern, text)
    if match is None:
        return None
    return match[0][start:stop]


def _to_float(text, default=None):
    """Convert `text` to float; return `default` if it is None or not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return default


def _table_value(table, label, default=None):
    """Return the first 'detail' cell of the rows whose 'point' cell equals `label`."""
    try:
        cells = list(table.loc[table['point'] == label, 'detail'])
    except KeyError:
        # The table does not have the expected 'point'/'detail' columns.
        return default
    return cells[0] if cells else default


def get_shop_info(url):
    """
    Crawl one Tabelog restaurant page and scrape its details.

    input: Tabelog restaurant url(url)
    output: List of restaurant information(res), in this order:
            [name, genre, price_range, score, work_hour, holiday, latitude,
             longitude, seats, smoke, tel, address, transition, url].
            Fields that cannot be extracted are None, except score (0.0),
            smoke ('-') and address ('-').
            Returns None when the response status is not 200 or the page
            has no <script type="application/ld+json"> element.
    """
    html = requests.get(url)
    sleep(1)  # pause after every request before doing anything else

    # status code check
    if html.status_code != 200:
        return None

    soup = BeautifulSoup(html.text, 'lxml')

    scripts = soup.find_all('script', {'type': 'application/ld+json'})
    if not scripts:
        return None

    # The JSON-LD text is not parsed as JSON; the leading 40 and trailing 14
    # characters of the tag markup are cut off and the rest is searched with
    # regular expressions.
    pre_json = str(scripts[0])[40:-14]

    # Extract information from the json-like text.
    # The page's own URL from the metadata is kept in a separate name so the
    # `url` argument is not clobbered (it used to be, and a failed match then
    # crashed the table parsing below).
    shop_url = _search_slice('@id(.*?),', pre_json, 6, -3)
    name = _search_slice('name(.*?),', pre_json, 7, -2)
    latitude = _to_float(_search_slice('latitude(.*?),', pre_json, 10, -2))
    longitude = _to_float(_search_slice('longitude(.*?),', pre_json, 11, -2))
    tel = _search_slice('telephone(.*?),', pre_json, 12, -2)
    price_range = _search_slice('priceRange(.*?)s', pre_json, 13, -4)
    genre = _search_slice('servesCuisine(.*?),', pre_json, 16, -2)
    score = _to_float(_search_slice('ratingValue(.*?)}', pre_json, 14, -2), default=0.0)

    prefecture = _search_slice('addressRegion(.*?),', pre_json, 16, -2)
    city = _search_slice('addressLocality(.*?),', pre_json, 18, -2)
    street = _search_slice('streetAddress(.*?),', pre_json, 16, -2)
    if None in (prefecture, city, street):
        address = '-'
    else:
        address = prefecture + ' ' + city + ' ' + street.replace('　', ' ')

    # from html table
    # Parse the HTML that was already downloaded instead of requesting the
    # page a second time.
    try:
        tables = pd.read_html(StringIO(html.text))
    except ValueError:
        # read_html raises ValueError when the document contains no table.
        tables = []

    # Tables 3 and 4 hold the shop details; `rename` (not `rename_axis`) is
    # the supported way to relabel columns with a mapping.
    labelled = [table.rename(columns={0: 'point', 1: 'detail'}) for table in tables[3:5]]
    no_rows = pd.DataFrame(columns=['point', 'detail'])
    info_table = labelled[0] if len(labelled) > 0 else no_rows
    smoking = labelled[1] if len(labelled) > 1 else no_rows

    holiday = _table_value(info_table, '定休日')
    work_hour = _table_value(info_table, '営業時間')
    transition = _table_value(info_table, '交通手段')

    try:
        # Take the first whitespace-separated token and drop its last
        # character before converting to int.
        seats = int(_table_value(smoking, '席数').split()[0][:-1])
    except (AttributeError, IndexError, ValueError):
        seats = None

    smoke = _table_value(smoking, '禁煙・喫煙', default='-')

    res = [name,
           genre,
           price_range,
           score,
           work_hour,
           holiday,
           latitude,
           longitude,
           seats,
           smoke,
           tel,
           address,
           transition,
           shop_url]

    return res

In [None]:
# Collect the restaurant page links from the area listing.
# Tabelog serves up to 60 result pages of 20 restaurants each; every page
# number is requested for the three listing codes formed by appending
# '01', '02' and '03' to the code taken from the end of area_url.
shop_links = []
area_code = area_url[-6:-1]
for page in range(1, 61):
    for suffix in range(1, 4):
        template = area_url + area_code + '0' + str(suffix) + '/rstLst/{}/?Srt=D&SrtT=rt&sort_mode=1'
        listing_url = template.format(page)

        response = requests.get(listing_url)
        sleep(1)

        # Listing pages that do not answer with 200 contribute no links.
        if response.status_code != 200:
            continue

        listing = BeautifulSoup(response.text, 'lxml')
        for anchor in listing.find_all('a', {'class':'list-rst__rst-name-target'}):
            shop_links.append(anchor.get('href'))

In [None]:
# Scrape every collected shop page. get_shop_info returns None for pages it
# cannot read (non-200 response or no JSON-LD script); those entries are
# dropped so that only complete rows reach the DataFrame constructor.
tabelog = [info for info in map(get_shop_info, shop_links) if info is not None]

In [None]:
# Build the result table. The column order mirrors the list returned by
# get_shop_info. The names are passed to the constructor because
# `rename_axis` rejects a label mapping on pandas >= 1.0 (ValueError).
res = pd.DataFrame(tabelog, columns=['shop_name',
                                     'genre',
                                     'price_zone',
                                     'score',
                                     'available_time',
                                     'closed',
                                     'latitude',
                                     'longitude',
                                     'seats',
                                     'smoking',
                                     'tel',
                                     'address',
                                     'transition',
                                     'url'])

In [None]:
# Write the table to Tabelog_akasaka.csv without the row index
# (`index` is documented as a bool, so pass False rather than None).
res.to_csv('Tabelog_akasaka.csv', index=False)