# Google検索を流用
- GoogleAPI利用に上限があるため、手組で検索部分のスクレイピングを実施

# 観光地の検索用

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import json
import pandas as pd
import urllib.parse
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [2]:
# 任意の1ページ分を対象にGoogle検索実施
def _get_pages(query, page=0):
    # https://www.google.com/search?q=justin+bieber&start=20
    base_url = "https://www.google.com/search"
    # ↓変更可能 urllib.parse.quote_plus()
    query = query.replace(" ","+")
    query = query.replace("　","+")
    search_url = base_url + "?q="+ query + "&start=" + str(page*10)
    
    if (page==0):
        print(search_url) 

    response = requests.get(search_url)
    return response

In [3]:
# responseからヒット件数を抽出
def _get_num_of_hits(response):
    soup = bs(response.text, "html.parser")
    result_stats = soup.find("div", id="resultStats")
    num_of_hits = int(result_stats.text[1:-1].replace(",",""))
    
    return num_of_hits

In [4]:
# 任意の1ページ分を対象に見出しとURLを取得
def _extract_link_info(response):
    soup = bs(response.text, "html.parser")
    headings = soup.find_all("div", class_="g")

    result = []
    for i in range(len(headings)):
        if(headings[i].a== None or headings[i].h3.a == None):
            break
        title = headings[i].a.text
        tmp_url = headings[i].h3.a['href']
        url = re.sub(r'/url\?q=|&sa.*','', tmp_url)
        
        if(headings[i].span!=None):
            description = headings[i].find("span", class_="st").text
        else:
            description = "No description"

        dic = {"title": title, "url": url, "description": description}
        result.append(dic)
    
    return result

In [5]:
# 検索対象数指定(default page:0, ヒット数を上限とする)
def get_any_pages(query, page=0):

    # ヒット件数
    response = _get_pages(query, page=0)        
    num_of_hits = _get_num_of_hits(response)
    
    # 検索結果
    result = _extract_link_info(response)
    for i in range(1,page):
        if(i*10 > num_of_hits):
            break
        response = _get_pages(query, page=i)    
        next_page_result = _extract_link_info(response)
        result.extend(next_page_result)
    
    print("Search(Try) {} / Hit {}".format(i*10, num_of_hits))
    result_json = json.dumps(result, ensure_ascii=False)

    return result_json

In [6]:
def _get_imgs(query, page=0, purpose="tour"):
    base_url = "https://www.google.com/search"
    query = query.replace(" ","+")
    query = query.replace("　","+")

    if(purpose=="tour"):
    # defaultで +観光 を付与して画像検索
        search_url = base_url + "?q="+ query + "+観光" + "&tbm=isch" + "&start=" + str(page*20)
    else:
        search_url = base_url + "?q="+ query + "&tbm=isch" + "&start=" + str(page*20)

    if (page==0):
        print(search_url)

    response = requests.get(search_url)
    return response

In [7]:
# 任意の1ページ分を対象に見出しとURLを取得
def _extract_imgs_info(response):

    result = []

    soup = bs(response.text, "html.parser")
    for img in soup.find_all("img"):
        if(img == None or img.find_parent("a") == None):
            break
        #title = 
        thumbnail_url = img["src"]
        tmp_url = img.find_parent("a")["href"]
        tmp_url = re.sub(r'/url\?q=|&sa.*','', tmp_url)
        org_url = urllib.parse.unquote(tmp_url)

        if(img.find_parent().find_parent()!=None):
            description = img.find_parent().find_parent().text 
        else:
            description = "No description"

        dic = {"title": description, "thumbnail": thumbnail_url, "url": org_url, "description": description}
        #dic = {"title": title, "url": url, "description": description}
        result.append(dic)
    
    return result

In [8]:
# 検索対象数指定(default page:0, ヒット数を上限とする)
def get_any_imgs(query, page=0, purpose="tour"):

    response = _get_imgs(query, page=0, purpose=purpose)

    # 検索結果
    result = _extract_imgs_info(response)
    for i in range(1,page):
        response = _get_imgs(query, page=i, purpose=purpose)
        next_page_result = _extract_imgs_info(response)
        result.extend(next_page_result)
    
    result_json = json.dumps(result, ensure_ascii=False)

    return result_json

In [9]:
def save_json_to_csv(result_json, query, cols, opt=""):
    df = pd.read_json(result_json)
    df = df[cols]
    query = query.replace(" ","+")
    query = query.replace("　","+")
    
    if(opt!=""):
        df.to_csv(query + "_" + opt + ".csv")        
    else:
        df.to_csv(query + ".csv")

In [10]:
def show_titles(result_json, target="title"):
    # target = title, description or url
    title_list = [j[target] for j in json.loads(result_json)]
    return title_list

# 一般検索
- 簡単な実装のためには特定のサービスから検索だが、汎用性のある検索への拡張のためGoogle検索から調査
    - https://www.jalan.net/
    - https://gurutabi.gnavi.co.jp/
    - https://iko-yo.net/

In [11]:
query = "東京　観光地"
result_json = get_any_pages(query, page=5)
save_json_to_csv(result_json, query, ["title", "description", "url"])

https://www.google.com/search?q=東京+観光地&start=0
Search(Try) 40 / Hit 176000000


In [12]:
show_titles(result_json, "title")[:5]

['東京観光おすすめスポット65選！名所 穴場 最新スポット完全網羅 | 楽天 ...',
 '東京の観光スポットランキングTOP10 - じゃらんnet',
 '東京の観光地・穴場スポット40選 | Holiday [ホリデー]',
 '東京観光はコレ! 専門家おすすめ 外国人にも人気なスポット30選 | LINE ...',
 '東京観光おすすめレジャー50選！絶対行きたい観光地の決定版はコレ']

# 画像検索
- しおりに貼り付ける挿絵用

In [13]:
query = "東京　観光地"
result_json = get_any_imgs(query, page=3, purpose=" ")
save_json_to_csv(result_json, query, ["title", "description", "thumbnail", "url"], opt="img")

https://www.google.com/search?q=東京+観光地&tbm=isch&start=0


In [14]:
show_titles(result_json, "title")[:5]

['travelbook.co.jp東京観光のおすすめスポット６５：名所ランキング上位の観光地一覧 ...700×467 - 204 k\xa0-\xa0jpeg',
 'tabichannel.com東京の人気観光名所72選！デートから旅行まで楽しめる東京観光地を ...800×600 - 172 k\xa0-\xa0jpg',
 'travelbook.co.jp東京観光のおすすめスポット６５：名所ランキング上位の観光地一覧 ...700×487 - 210 k\xa0-\xa0jpeg',
 'travel.rakuten.co.jp東京観光おすすめスポット65選！名所 穴場 最新スポット完全網羅 ...473×315 - 47 k\xa0-\xa0jpg',
 'rurubu.com東京観光で行くべき超定番！観光スポット20選：るるぶ.com1030×688 - 173 k\xa0-\xa0jpg']

# Google Maps
- 指定した観光地間の移動時間をリストアップ
- 可能であれば経路を取得したい
- 例： 自宅 ～ 特定の観光地 ～ 次の観光地 ～ 自宅

- ref: https://developers.google.com/maps/documentation/urls/guide#directions-action

In [15]:
def get_route_info(key_path, origin, destination, travelmode="", output_format="json"):
# https://maps.googleapis.com/maps/api/directions/json?origin=東京駅&destination=スカイツリー&key=xxxxx
    route_info_base_url = "https://maps.googleapis.com/maps/api/directions/" 

    with open(key_path, "r") as f:
        key = f.read()

    if travelmode in ["driving", "transit", "walking", "bicycling"]:
        route_info_url = route_info_base_url + output_format + "?origin=" + origin + "&destination=" + destination + "&travelmode=" + travelmode + "&key=" + key
    else:
        route_info_url = route_info_base_url + output_format + "?origin=" + origin + "&destination=" + destination + "&key=" + key
    
    print(route_info_url.split("key=")[0]+"key=XXX")
    response = requests.get(route_info_url)
    return response

In [16]:
def extract_travel_time_and_transportations(route_info):    
    travel_time = route_info.json()["routes"][0]["legs"][0]["duration"]["text"]
    transportations = list(set([s["travel_mode"] for s in route_info.json()["routes"][0]["legs"][0]["steps"]]))
    return travel_time, transportations

In [17]:
def get_map_url(origin, destination, travelmode=""):
    # GUIでresponse
    # マップのスクショ候補： https://kiito.hatenablog.com/entry/2018/12/05/081150
    map_base_url = "https://www.google.com/maps/dir/?api=1"
    map_url = map_base_url + "&origin=" + origin +"&destination=" + destination
    if travelmode in ["driving", "transit", "walking", "bicycling"]:
        map_url += "&travelmode=" + travelmode

    print(map_url)
    response = requests.get(map_url)
    return map_url

In [30]:
def download_screenshot(url, save_filename):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.get(url)

    # width = driver.execute_script("return document.body.scrollWidth;")
    # height = driver.execute_script("return document.body.scrollHeight;")
    # driver.set_window_size(width, height)
    driver.set_window_size(1920, 1080)
    time.sleep(2)
    driver.save_screenshot(save_filename)

# Googlemap用のユーザ変数を定義

In [22]:
key_path = "../cert/googlemaps.key"
origin = "東京駅"
destination =  "スカイツリー"
travelmode = "walking"  # walking, driving, transit (未指定の場合自動判定される)

# travelmodeが反映されない問題

In [24]:
route_info = get_route_info(key_path, origin, destination, travelmode=travelmode)
travel_time, transportations = extract_travel_time_and_transportations(route_info)
print("{} -> {} ({} / {})".format(origin, destination, travel_time , transportations))

https://maps.googleapis.com/maps/api/directions/json?origin=東京駅&destination=スカイツリー&travelmode=walking&key=XXX
東京駅 -> スカイツリー (18 mins / ['DRIVING'])


In [31]:
map_url = get_map_url(origin, destination, travelmode=travelmode)
download_screenshot(map_url, origin+"-"+destination+".png")

https://www.google.com/maps/dir/?api=1&origin=東京駅&destination=スカイツリー&travelmode=walking


# テストエリア

# 既存ライブラリでできる疑惑

In [340]:
from googlesearch import search

In [344]:
def google_search(query, limit=10):
    for url in search(query, limit):
        print(url)

In [None]:
google_search("search")