In [68]:
"""
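generateGtfsFile.ipynb
Builds the .gtfs data based on:
preparation.json
preparation.pickle
NOTE(review): the cells below actually load material.json / material.pickle from the working directory -- confirm which file names are current.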
"""

import json
import os
import pickle
import re
import time
import uuid
from pathlib import Path
from urllib.parse import urljoin

import geopandas as gpd
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import process
from tqdm import tqdm

"""
FUNCTION
"""
def json_to_dict(dir, encoding_="utf-8"):
    """
    Read a JSON file and return its parsed content.

    @dir        : path of the file to read
    @encoding_  : text encoding of the file; change it to match the file being read
    """
    with open(dir, mode="rt", encoding=encoding_) as json_file:
        # json.load turns the JSON document into the matching Python object
        # (a dict for a JSON object).
        return json.load(json_file)
      
def add_set(added_element, set_: set):
    """
    Add an element to a set and report whether the set grew.

    @added_element : element to add
    @set_          : set that receives the element (modified in place)

    Returns True when the set is larger after the add (the element was new),
    False when its size is unchanged (the element was already present).
    """
    size_before = len(set_)
    set_.add(added_element)
    return len(set_) > size_before

In [21]:
"""
INPUT
"""
# Working directory: the input material is read from here
DIR_WRITING = "C:/lab/gtfses/Ishigaki_pj"
# Load the basic-information JSON
BASIC_INFO = json_to_dict(f"{DIR_WRITING}/material.json")
# Agency id of the bus company: the second-to-last "/"-separated part of the
# company page URL
company_page_url = BASIC_INFO["URL_BUS_COMPANY_PAGE"]
agencyId = str(company_page_url.split("/")[-2])

In [24]:
"""
gtfsフォルダの作成
"""
# Output folder "<working dir>/<agency id>.gtfs"; an already existing folder is reused
PathToGdfs = f"{DIR_WRITING}/{agencyId}.gtfs"
Path(PathToGdfs).mkdir(exist_ok=True)

In [23]:
"""
agency.txtの作成
"""

"""agency_id"""
agency_id = agencyId

# agency_url: the bus company page; it is also the page the agency name is
# scraped from.
agency_url = BASIC_INFO["URL_BUS_COMPANY_PAGE"]

# agency_name: text of <h1 class="main-title"> up to the first non-breaking space.
response = requests.get(agency_url)
# Stop with a clear HTTP error instead of trying to parse an error page.
response.raise_for_status()
title_tag = BeautifulSoup(response.text, "html.parser").find("h1", class_="main-title")
if title_tag is None:
    # Without this check the failure would be an AttributeError on None.
    raise ValueError('no <h1 class="main-title"> element found on ' + agency_url)
agency_name = title_tag.text.split(u"\xa0")[0]

# agency_timezone
agency_timezone = "Asia/Tokyo"

# agency_lang
agency_lang = "ja"

# Write the single-row agency.txt into the .gtfs folder.
agency_table = pd.DataFrame(
    {
        "agency_id": [agency_id],
        "agency_name": [agency_name],
        "agency_url": [agency_url],
        "agency_timezone": [agency_timezone],
        "agency_lang": [agency_lang],
    }
)
agency_table.to_csv(PathToGdfs + "/agency.txt", index=False, sep=",")


In [26]:
"""
stops.txtの作成
"""
# Source column name -> GTFS stops.txt column name
column_dict = {
    "nameBS_navitime": "stop_name",
    "id": "stop_id",
    "lon": "stop_lon",
    "lat": "stop_lat",
}
# Load material.pickle, switch to the GTFS column names and keep only the
# stops.txt columns, in this order.
tableBS = (
    pd.read_pickle(DIR_WRITING + "/material.pickle")
    .rename(columns=column_dict)
    .filter(items=["stop_id", "stop_name", "stop_lat", "stop_lon"])
)
tableBS.to_csv(DIR_WRITING + "/" + agencyId + ".gtfs/stops.txt", index=False, sep=",")


In [71]:
# Load material.pickle again, this time with its original column names.
infoBS = pd.read_pickle(f"{DIR_WRITING}/material.pickle")
# Take the first item of the "url" entry at row label 7 (hard-coded) and put the
# scheme in front of it.
# NOTE(review): presumably the entry is a collection of protocol-relative links -- confirm.
url_ = "https:" + list(infoBS.at[7, "url"])[0]


In [74]:
# NOTE(review): starts as an empty tuple despite the "Sets" name; not used in this cell.
routeNameSets = tuple()
# Download the page behind url_ and collect every <dl class="line-list"> element.
html_text = requests.get(url_).text
soup = BeautifulSoup(html_text, "html.parser")
lineLists = soup.find_all("dl", class_="line-list")


In [87]:
# Route name -> list of single-entry {link text: href} dicts, one per link.
line_dest_dict = {}
for lineList in lineLists:
    # Re-parse the element on its own so the searches below stay inside it.
    soup = BeautifulSoup(str(lineList), "html.parser")
    routeName = str(soup.find("span", class_="line-name").get_text())
    # Only <a> tags that carry no class attribute are collected.
    dests = [
        {dest.get_text(): dest["href"]}
        for dest in soup.find_all("a", class_=False)
    ]
    line_dest_dict[routeName] = dests

line_dest_dict

{'[9]川平リゾート線[東運輸]': [{'クラブメッド方面': '//www.navitime.co.jp/diagram/bus/00428185/00070166/1/'},
  {'バスターミナル（石垣市）方面': '//www.navitime.co.jp/diagram/bus/00428185/00070166/0/'}],
 '[10]ANAコンチネンタル経由空港線[東運輸]': [{'ＡＮＡインターコンチネンタル（石垣市）/石垣空港（バス）方面': '//www.navitime.co.jp/diagram/bus/00428185/00070167/1/'},
  {'バスターミナル（石垣市）方面': '//www.navitime.co.jp/diagram/bus/00428185/00070167/0/'}],
 '[11]米原キャンプ場線[東運輸]': [{'クラブメッド/石垣空港（バス）方面': '//www.navitime.co.jp/diagram/bus/00428185/00070170/1/'},
  {'バスターミナル（石垣市）方面': '//www.navitime.co.jp/diagram/bus/00428185/00070170/0/'}]}

In [83]:
# Show the entries recorded for one route.
# NOTE(review): the output saved under this cell is a list of '<span>...' strings and
# its execution count (83) is lower than that of the cell building line_dest_dict (87),
# so it appears to predate the current dict-based version -- re-run to refresh.
line_dest_dict['[9]川平リゾート線[東運輸]']

['<span><a href="//www.navitime.co.jp/diagram/bus/00428185/00070166/1/">クラブメッド方面</a></span>',
 '<span><a href="//www.navitime.co.jp/diagram/bus/00428185/00070166/0/">バスターミナル（石垣市）方面</a></span>']

In [84]:
# Display the page URL currently held in url_.
url_

'https://www.navitime.co.jp/bus/diagram/direction/00428185/'

In [46]:
# Text of the first entry of busRoutes.
# NOTE(review): busRoutes is not defined in any cell shown here -- confirm where it comes from.
text = busRoutes[0].text

# Remove every part enclosed in "[" and "]"
bracketed_part = re.compile(r'\[.*?\]')
cleaned_text = bracketed_part.sub('', text)

print(cleaned_text)

平野線/平野経由伊原間線


In [None]:
"""
ダイヤのページから,timeTableへのリンクを取得
"""
# Site root against which the hrefs found on each page are resolved.
# NOTE: this name shadows the builtin dir(); it is kept unchanged in case other
# cells rely on it.
dir = "https://www.navitime.co.jp"
diagram_link_list = []
for l in tqdm(targetList):
    response = requests.get(l)
    # Short pause after every request.
    time.sleep(0.01)
    if not response.ok:
        # A failed page yields no links; report it instead of skipping silently.
        tqdm.write(f"skipped {l}: HTTP {response.status_code}")
        continue
    html_text = response.text
    soup = BeautifulSoup(html_text, 'html.parser')
    for c_dir in soup.find_all("ul", class_="timeTable"):
        anchor = c_dir.find("a")
        if anchor is None or not anchor.get("href"):
            # A list without a usable link has nothing to collect.
            continue
        # urljoin gives the same result as dir + href for root-relative hrefs
        # ("/...") and also resolves protocol-relative ("//host/...") and
        # absolute hrefs correctly.
        diagram_link_list.append(urljoin(dir, anchor.get("href")))

In [None]:
# Save the collected links as a pickle file
link_pickle_path = "C:/research/バス路線データ/scraping/時刻表_単一運用_全リンク.pickle"
with open(link_pickle_path, "wb") as pickle_out:
    pickle.dump(diagram_link_list, pickle_out)
# Read the same pickle file back
with open(link_pickle_path, "rb") as pickle_in:
    timeTableLinkList = pickle.load(pickle_in)

In [None]:
"""ダイヤの取得"""
# Root folder of the per-operation CSV files. The "already scraped" check and the
# write further down use this one path, so they cannot disagree.
timetable_root = "C:/research/バス路線データ/scraping/timeTable"
for l in tqdm(timeTableLinkList):
    response = requests.get(l)
    if not response.ok:
        # Report the failed page and move on; a later re-run picks it up because
        # no CSV is written for it.
        tqdm.write(f"skipped {l}: HTTP {response.status_code}")
        continue
    html_text = response.text
    soup = BeautifulSoup(html_text, 'html.parser')

    # "/" and ":" are replaced because both names become part of the file path.
    route_name = soup.find("h2", class_="head-txt").get_text()
    route_name = route_name.replace("/", "_").replace(":", "-")
    operation_name = soup.find("div", class_="head-txt-sub").get_text()
    operation_name = operation_name.replace("/", "_").replace(":", "-")

    route_dir = timetable_root + "/" + route_name
    csv_path = route_dir + "/" + operation_name + ".csv"
    if os.path.isfile(csv_path):
        # The same data already exists: skip the rest of the processing.
        continue

    # One pass over the stops: stop name and the first five characters of the
    # time text.
    station_name_list = []
    d_time_list = []
    for element in soup.find_all("dl", class_="stops"):
        station_name_list.append(element.find("a", class_="station-name-link").text)
        d_time_list.append(element.find("dd", class_="time").text[:5])

    os.makedirs(route_dir, exist_ok=True)
    # Two columns, headed by the route name and the operation name.
    timetable = pd.DataFrame([station_name_list, d_time_list]).T.set_axis([route_name, operation_name], axis=1)
    timetable.to_csv(csv_path)