In [207]:
"""
generateGtfsFile.ipynb
preparation.json
preparation.pickle
をもとに.gtfsデータを構築する
"""

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

from tqdm import tqdm
import os
import pickle
import uuid
import time
import json
import re
import zipfile
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from pathlib import Path


import geopandas as gpd

from fuzzywuzzy import process

from datetime import datetime, timedelta



"""
FUNCTION
"""
def json_to_dict(dir, encoding_="utf-8"):
    """
    @dir        : 読み込み先ファイルパス
    @encoding_  : 読み込むファイルのエンコードによって適宜変える。
    """
    with open(dir, mode="rt", encoding=encoding_) as f:
        data = json.load(f)		# JSONのファイル内容をdictに変換する。
        return data
      
def add_set(added_element,set_:set):
    """
    @added_element:追加される要素
    @added_set:追加されるセット
    """
    before_size = len(set_)
    set_.add(added_element)
    after_size = len(set_)
    return before_size != after_size

def deleteNone(dict_:list):
    return_list = []
    for diagram_link in diagram_links:
        if diagram_link["text"] != "" and diagram_link["url"] != None:
            if "始" in diagram_link["text"]:
                return_list.append(diagram_link)

    return return_list

In [180]:
"""
INPUT
"""
# 作業ディレクトリの定義
DIR_WRITING = "C:/lab/gtfses/Ishigaki_pj"
# 基本情報のjsonを取得
BASIC_INFO = json_to_dict(DIR_WRITING + "/material.json")
BASIC_INFO
# バス会社のagency idの取得
agencyId = str(BASIC_INFO["URL_BUS_COMPANY_PAGE"].split("/")[-2])
# バス停名とstop_idの辞書の作成
NAMEBS_ID_TABLE = pd.read_pickle(DIR_WRITING+"/material.pickle").filter(items=["nameBS_navitime","id"])

In [183]:
"""
gtfsフォルダの作成
"""
PathToGdfs = DIR_WRITING+"/" + agencyId +".gtfs"
Path(PathToGdfs).mkdir(exist_ok=True)

In [184]:
"""
agency.txtの作成
"""

"""agency_id"""
agency_id = agencyId

"""agency_name"""
response = requests.get(BASIC_INFO["URL_BUS_COMPANY_PAGE"])
agency_name = BeautifulSoup(response.text,"html.parser").find("h1",class_="main-title").text.split(u"\xa0")[0]

"""agency_url"""
agency_url = BASIC_INFO["URL_BUS_COMPANY_PAGE"]

"""agency_timezone"""
agency_timezone = "Asia/Tokyo"

"""agency_lang"""
agency_lang = "ja"

""".txtを作成"""
pd.DataFrame({"agency_id":[agency_id],"agency_name":[agency_name],"agency_url":[agency_url],"agency_timezone":[agency_timezone],"agency_lang":[agency_lang]}).to_csv(PathToGdfs+"/agency.txt",index=False,sep=",")


In [237]:
"""
calendar.txtの作成
"""
calendar_txt = {
    "service_id":["weekday","saturday","holiday"],
    "monday":[1,0,0],
    "tuesday":[1,0,0],
    "wednesday":[1,0,0],
    "thursday":[1,0,0],
    "friday":[1,0,0],
    "saturday":[0,1,0],
    "sunday":[0,0,1],
    "start_date":[20240401,20240401,20240401],
    "end_date":[20250401,20250401,20250401]
}

pd.DataFrame(calendar_txt).to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/calendar.txt",index=False,sep=",")

In [186]:
"""
stops.txtの作成
"""
column_dict = {
    "nameBS_navitime": "stop_name",
    "id": "stop_id",
    "lon": "stop_lon",
    "lat": "stop_lat"
}
tableBS = pd.read_pickle(DIR_WRITING+"/material.pickle").rename(column_dict,axis=1).filter(items=["stop_id","stop_name","stop_lat","stop_lon"])
tableBS["zone_id"] = tableBS["stop_id"] # 運賃ルール既定のための記載
tableBS.to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/stops.txt",index=False,sep=",")

In [188]:
"""
stop_times.txt,routes.txt,trips.txt作成のための共通準備
"""
infoBS = pd.read_pickle(DIR_WRITING+"/material.pickle")
urls = ["https:" + url for url in infoBS["url"].to_list()]

to_timetable_urls = []
for url_ in tqdm(urls):
    # バスの系統選択
    html_text = requests.get(url_).text
    soup = BeautifulSoup(html_text,"html.parser")
    lineLists = soup.find_all("dl",class_="line-list")

    # 時刻表画面へのurl取得
    for lineList in lineLists:
        soup = BeautifulSoup(str(lineList),"html.parser")
        routeName = str(soup.find("span",class_="line-name").text)
        to_timetable_urls += [{routeName + "%%" + dest.text:dest["href"]} for dest in soup.find_all("a", class_=False)]
    
    time.sleep(0.5)


100%|██████████| 134/134 [02:13<00:00,  1.00it/s]


In [208]:
"""ダイヤの取得"""
route_direction_diagram_links = {}

for to_timetable_url in tqdm(to_timetable_urls):

    route_direction_name, url_ = list(to_timetable_url.keys())[0],"https:" + list(to_timetable_url.values())[0]

    session = requests.Session()
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    response = session.get(url_).text
    soup = BeautifulSoup(response,"html.parser")
    timeTables = soup.find_all("ul", class_="timeTable")

    for timeTable in timeTables:
        timeRecord = BeautifulSoup(str(timeTable),"html.parser")
        abbreviated_text = timeRecord.find("div", class_="abbreviated-name").text
        if "始" in abbreviated_text: # 始発のリンクのみ保持する。
            if route_direction_name not in route_direction_diagram_links:
                route_direction_diagram_links[route_direction_name] = []

            route_direction_diagram_links[route_direction_name] += [timeRecord.find("a")["href"]]


100%|██████████| 609/609 [09:25<00:00,  1.08it/s]


In [209]:
def judge_day(url:str):
    """
    abstract:曜日判定関数
    """
    if "year=2024&month=04&day=11" in url:
        return "weekday"
    elif "year=2024&month=04&day=13" in url:
        return "saturday"
    else:
        return "holiday"

In [229]:
all_trips = {}
miss_urls = []
for route_direction_name, urls in tqdm(list(route_direction_diagram_links.items())):
    for url_ in urls:
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        session.mount('http://', HTTPAdapter(max_retries=retries))
        response = session.get("https://www.navitime.co.jp"+url_)
        soup = BeautifulSoup(response.text,"html.parser")
        stops = soup.find_all("dl",class_="stops")
        service_id = judge_day(url_)
        trip_id = route_direction_name + "%%" + service_id + "%%" + soup.find("div",class_="head-txt-sub").text
        dia = []

        for stop in stops:
            stop_name = BeautifulSoup(str(stop),"html.parser").find("dt",class_="station-name").text.replace("\n","")
            stop_id = BeautifulSoup(str(stop),"html.parser").find("a",class_="station-name-link")["href"].split("=")[1]
            try:
                stop_time = BeautifulSoup(str(stop),"html.parser").find("dd",class_="time").text.replace("発","").replace("着","")
            except:
                stop_time = BeautifulSoup(str(stop),"html.parser").find("dd",class_="from-to-time").text.replace("発","").replace("着","")

            dia.append({"stop_name":stop_name,"stop_id":stop_id,"stop_time":stop_time})

        if route_direction_name not in all_trips:
            all_trips[route_direction_name] = []

        all_trips[route_direction_name] += [{"trip_id":trip_id,"service_id":service_id,"diagram":dia}]

        time.sleep(0.3)

100%|██████████| 18/18 [04:15<00:00, 14.17s/it]


In [230]:
"""
route.txtの作成
"""
route_direction_names =list(set(all_trips.keys()))
route_txt = pd.DataFrame(
    {"route_id":route_direction_names,"route_short_name":route_direction_names}
)
route_txt["route_type"] = 3
route_txt.to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/routes.txt",index=False,sep=",")

In [231]:
"""
trips.txtの作成
"""
trips_txt = {"route_id":[],"service_id":[],"trip_id":[]}
for route_direction_name in all_trips.keys():
    for trip in all_trips[route_direction_name]:
        trips_txt["route_id"] += [route_direction_name]
        trips_txt["trip_id"] += [trip["trip_id"]]
        trips_txt["service_id"] += [trip["service_id"]]
pd.DataFrame(trips_txt).to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/trips.txt",index=False,sep=",")


In [232]:
"""
stop_times.txtの作成
"""
stop_times_txt = {"trip_id":[],"arrival_time":[],"departure_time":[],"stop_id":[],"stop_sequence":[]}
for trips in tqdm(all_trips.values()):
    for trip in trips:
        trip_id = trip["trip_id"]
        for seq,stop in enumerate(trip["diagram"]):# 各stop=record毎の処理
            stop_times_txt["trip_id"] += [trip_id]
            stop_times_txt["stop_id"] += [stop["stop_id"]]
            stop_times_txt["stop_sequence"] += [seq]

            stop_time = stop["stop_time"].replace("\n","").replace(" ","")
            arrival_time = ""
            departure_time = ""
            if len(stop_time) == 5: # 到着時刻=出発時刻
                arrival_time = stop_time
                departure_time = stop_time
            elif len(stop_time) == 10: #到着時刻not=出発時刻
                arrival_time = stop_time[:5]
                departure_time = stop_time[5:]
            else:
                print("error:時刻の記載が不正です")
                   
            stop_times_txt["arrival_time"] += [arrival_time+":00"]
            stop_times_txt["departure_time"] += [departure_time+":00"]
pd.DataFrame(stop_times_txt).astype(str).to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/stop_times.txt",index=False,sep=",")

100%|██████████| 18/18 [00:00<00:00, 1373.03it/s]


In [238]:
""" 作成したファイル群のzip化 """
def zip_text_files(directory_path, output_zip_file):
    # 新しいZIPファイルを作成
    with zipfile.ZipFile(output_zip_file, 'w') as zipf:
        # 指定ディレクトリのファイルをループ
        for foldername, subfolders, filenames in os.walk(directory_path):
            for filename in filenames:
                if filename.endswith('.txt'):
                    # ファイルパスを取得
                    file_path = os.path.join(foldername, filename)
                    # ZIPファイルにファイルを追加
                    zipf.write(file_path, arcname=filename)
                    print(f'Added {filename} to {output_zip_file}')

# ディレクトリパスと出力するZIPファイルの名前
directory_path = DIR_WRITING+"/"+agencyId+".gtfs"
output_zip_file = DIR_WRITING+"/"+agencyId+".gtfs.zip"

# 関数を呼び出してZIP圧縮を実行
zip_text_files(directory_path, output_zip_file)

Added agency.txt to C:/lab/gtfses/Ishigaki_pj/00001313.gtfs.zip
Added calendar.txt to C:/lab/gtfses/Ishigaki_pj/00001313.gtfs.zip
Added routes.txt to C:/lab/gtfses/Ishigaki_pj/00001313.gtfs.zip
Added stops.txt to C:/lab/gtfses/Ishigaki_pj/00001313.gtfs.zip
Added stop_times.txt to C:/lab/gtfses/Ishigaki_pj/00001313.gtfs.zip
Added trips.txt to C:/lab/gtfses/Ishigaki_pj/00001313.gtfs.zip


In [21]:
### pickleで保存（書き出し）
with open(DIR_WRITING+'/all_trips.pickle', mode='wb') as fo:
  pickle.dump(all_trips, fo)

In [191]:
with open(DIR_WRITING+'/all_trips.pickle', mode='br') as fi:
    all_trips = pickle.load(fi)

In [71]:
route_id_pairs = {}
for route_direction_name, trips in all_trips.items():
    pairs = {}
    for trip in trips:
        stops = trip["diagram"]
        for o in range(len(stops)):
            for d in range(o+1, len(stops)):
                key_of_pair = stops[o]["stop_id"] + "-" + stops[d]["stop_id"]
                if key_of_pair not in pairs:
                    pairs[key_of_pair] = {"o":stops[o],"d":stops[d]}
    route_id_pairs[route_direction_name] = pairs


In [100]:
# Define the time format
TIME_FORMAT = "%H:%M"
# Define date
YEAR = "2024"
MONTH = "04"
DAY = "10"

# 結果格納変数
fare_attribute_txt = {"fare_id":[],"price":[],"currency_type":[],"payment_method":[],"transfers":[]}
fare_rules_txt = {"fare_id":[],"route_id":[],"origin_id":[],"destination_id":[]}
fare_accuracy_check = {"fare_id":[],"price":[],"o_stop_time":[],"rtn_start_time":[],"diff_o":[],"d_stop_time":[],"rtn_goal_time":[],"diff_d":[]}

for route_direction_name, pairs in route_id_pairs.items():
        
    for pair_id,pair in tqdm(pairs.items()):    

        # Generate Url
        o_stop_time = datetime.strptime(pair["o"]["stop_time"], TIME_FORMAT)
        d_stop_time = datetime.strptime(pair["d"]["stop_time"], TIME_FORMAT)

        url_to_get = "https://www.navitime.co.jp/bustransit/search?orvStationCode=" + pair["o"]["stop_id"] +"&dnvStationCode=" + pair["d"]["stop_id"] + "&year="+YEAR+"&month="+MONTH+"&day="+DAY+"&hour="+str(o_stop_time.hour)+"&minute="+str(o_stop_time.minute)+"&basis=1&sort=0&wspeed=standard&unuse="

        # スクレイピング
        def get_fare(url_to_get_, TIME_FORMAT_):
            response = requests.get(url_to_get_)
            if response.status_code != 200:
                print("Status Code is :" +str(response.status_code))
            soup = BeautifulSoup(response.text,"html.parser")
            infos = soup.find("li",class_="route-summary") # 路線検索結果の一番初めの項目を取得
            start_goal_time = BeautifulSoup(str(infos),"html.parser").find("dt",class_="start-goal-time")
            rtn_start_goal_times = [datetime.strptime(time, TIME_FORMAT_) for time in start_goal_time.text.replace("\n","").replace(" ","").split("\xa0⇒\xa0")] # 路線検索結果として返された出発時刻到着時刻
            fare = BeautifulSoup(str(infos),"html.parser").find("span",class_="normal-fare").text.replace("円","") # 料金の取得
            return {"fare":fare,"rtn_start_goal_times":rtn_start_goal_times}
        
        fare_info = ""
        try:
            fare_info = get_fare(url_to_get, TIME_FORMAT)
        except:
            print("Waiting 10 seconds before retrying...")
            time.sleep(10)
            try:
                fare_info = get_fare(url_to_get, TIME_FORMAT)
            except:
                print("Waiting 60 seconds before retrying...")
                time.sleep(60)
                try:
                    fare_info = get_fare(url_to_get, TIME_FORMAT)
                except:
                    fare_info = {"fare":-1,"rtn_start_goal_times":[datetime(1900, 1, 1, 1, 1), datetime(1900, 1, 1, 1, 1)]}




        """ fare_attirbute.txt向けの情報を格納 """
        fare_attribute_txt["fare_id"]           += [route_direction_name + "%%" + pair_id]
        fare_attribute_txt["price"]             += [fare_info["fare"]]
        fare_attribute_txt["currency_type"]     += ["JPY"]
        fare_attribute_txt["payment_method"]    += [0] # 0:乗車後に払う 1:乗車前に払う
        fare_attribute_txt["transfers"]         += [0] # 0:同料金での乗り継ぎを禁止
        """ fate_rules.txt向けの情報を格納 """
        fare_rules_txt["fare_id"]               += [route_direction_name + "%%" + pair_id]
        fare_rules_txt["route_id"]              += [route_direction_name]
        fare_rules_txt["origin_id"]             += [pair_id.split("-")[0]]
        fare_rules_txt["destination_id"]        += [pair_id.split("-")[1]]
        """ 算出された運賃計算の正確性をチェックするための辞書に格納 """
        fare_accuracy_check["fare_id"]          += [route_direction_name + "%%" + pair_id]
        fare_accuracy_check["price"]            += [fare_info["fare"]]
        fare_accuracy_check["o_stop_time"]      += [o_stop_time]
        fare_accuracy_check["rtn_start_time"]   += [fare_info["rtn_start_goal_times"][0]]
        fare_accuracy_check["diff_o"]           += [(fare_info["rtn_start_goal_times"][0] - o_stop_time).total_seconds() / 60]
        fare_accuracy_check["d_stop_time"]      += [d_stop_time]
        fare_accuracy_check["rtn_goal_time"]    += [fare_info["rtn_start_goal_times"][1]]
        fare_accuracy_check["diff_d"]           += [(fare_info["rtn_start_goal_times"][1] - d_stop_time).total_seconds() / 60]


        time.sleep(1)



100%|██████████| 351/351 [14:28<00:00,  2.47s/it]
100%|██████████| 423/423 [21:07<00:00,  3.00s/it]
 15%|█▌        | 123/803 [05:56<21:43,  1.92s/it]  

Waiting 10 seconds before retrying...
Waiting 60 seconds before retrying...


 15%|█▌        | 123/803 [07:08<39:28,  3.48s/it]


AttributeError: type object 'datetime.datetime' has no attribute 'datetime'

In [102]:
url_to_get

'https://www.navitime.co.jp/bustransit/search?orvStationCode=00428152&dnvStationCode=00428152&year=2024&month=04&day=10&hour=12&minute=16&basis=1&sort=0&wspeed=standard&unuse='

In [104]:
pd.DataFrame(fare_rules_txt).to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/fare_rules.txt",index=False,sep=",")
pd.DataFrame(fare_attribute_txt).to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/fare_attribute.txt",index=False,sep=",")

In [235]:
"""gtfsデータのバグとり"""
df = pd.read_csv(DIR_WRITING+"/"+ agencyId +".gtfs/trips.txt")
# df[df.duplicated("trip_id",keep=False)]
df.drop_duplicates("trip_id",keep="first").to_csv(DIR_WRITING+"/"+ agencyId +".gtfs/trips.txt")

In [228]:
df2 = pd.read_csv(DIR_WRITING+"/"+ agencyId +".gtfs/stop_times.txt")
df2.query("trip_id == '[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発'")

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence
3603,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:00:00,08:00:00,428137,0
3604,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:08:00,08:08:00,428229,1
3605,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:15:00,08:15:00,428230,2
3606,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:18:00,08:18:00,428231,3
3607,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:21:00,08:21:00,428138,4
3608,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:26:00,08:26:00,428139,5
3609,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:00:00,08:00:00,428137,0
3610,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:08:00,08:08:00,428229,1
3611,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:15:00,08:15:00,428230,2
3612,[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発,08:18:00,08:18:00,428231,3


In [None]:
df2.query("trip_id == '[5/6]平野線/平野経由伊原間線[東運輸]%%weekday%%伊原間駅08:00分発'")