In [246]:
import json
from pathlib import Path

import pandas as pd
import polars as pl
import requests
from bs4 import BeautifulSoup
from currency_converter import CurrencyConverter
# from pathlib import Path
# from itertools import permutations

In [247]:
# this function scrapes the route data embedded in the page at the given url
def get_url(url, english=True, timeout=30):
    """Download ``url`` and return the route data embedded in the page.

    The page is expected to contain a ``<meta id="deeplinkTrip">`` tag whose
    ``content`` attribute is a JSON document; the element at ``[2][1]`` of the
    decoded document is returned.

    Args:
        url: Address of the page to download.
        english: When true, an ``Accept-Language: en-US`` header is sent with
            the request.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        The decoded route data, or an empty list when the request fails, the
        status code is not 200, the meta tag is missing, or its content does
        not have the expected layout.
    """
    headers = {'Accept-Language': 'en-US,en;q=0.5'} if english else None

    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        print(f"Request failed: {err}")
        return []

    if r.status_code != 200:
        print("Invalid page!")
        return []

    soup = BeautifulSoup(r.content, 'html.parser')
    dis = soup.find('meta', {'id': 'deeplinkTrip'})
    if dis is None:
        # find() returns None when the tag is absent; subscripting it would raise
        print("Route data not found on the page!")
        return []

    try:
        return json.loads(dis["content"])[2][1]
    except (KeyError, IndexError, TypeError, ValueError) as err:
        # missing attribute, malformed JSON, or a different nesting than expected
        print(f"Unexpected route data layout: {err}")
        return []

In [248]:
class Cities:
    """Collects scraped route data for pairs of cities.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # prefix every city pair is appended to when building the page address
        self.base_url = 'https://www.rome2rio.com/map/'
        # maps (city1, city2) -> whatever get_url returned for that pair
        self.routes = {}

    def scrap_routes(self, city1, city2):
        """Scrape the page for ``city1`` -> ``city2`` and store the result.

        The result of ``get_url`` is saved in ``self.routes`` under the key
        ``(city1, city2)``; nothing is returned.
        """
        tmp_url = f'{self.base_url}{city1}/{city2}'
        self.routes[(city1, city2)] = get_url(tmp_url)

In [249]:
class BoundedBoxes:
    """Lookup table of bounding boxes read from a header-less CSV file.

    The CSV columns are named, in order, ``id``, ``lat_1``, ``lat_2``,
    ``lon_1`` and ``lon_2``.
    """

    def __init__(self, path: str) -> None:
        self.df = pl.read_csv(path, has_header=False, new_columns=['id', 'lat_1', 'lat_2', 'lon_1', 'lon_2'])

    def get_bb_id(self, coords: list) -> int:
        """Return the id of the first box that contains ``coords``.

        Args:
            coords: Sequence whose first item is compared against the
                ``lat_1``/``lat_2`` columns and whose second item is compared
                against ``lon_1``/``lon_2`` (bounds are inclusive).

        Returns:
            The ``id`` of the first matching row, or -1 when no row matches or
            the lookup fails (for example because ``coords`` is too short).
        """
        try:
            lat, lon = coords[0], coords[1]
            inside_lat = (lat >= self.df['lat_1']) & (lat <= self.df['lat_2'])
            inside_lon = (lon >= self.df['lon_1']) & (lon <= self.df['lon_2'])

            ids = self.df.filter(inside_lat & inside_lon)['id']
            if len(ids) == 0:
                return -1
            return ids[0]
        except Exception:
            # best-effort lookup: malformed input maps to "unknown" (-1), but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            return -1

In [250]:
class Airports:
    def __init__(self, path: str) -> None:
        self.df = pl.read_csv(path, has_header=False, new_columns=['code', 'id'])

    def get_airport_id(self, code: str):
        try:
            filter_df = self.df.filter(code.lower() == self.df['code']) 
            return filter_df['id'][0]
        except:
            return -1       

In [251]:
# city pair whose routes are scraped below
from_city = 'Rome'
to_city = 'Oslo'

In [252]:
# scrape the route data for the city pair and keep it in `pathes`
# (an empty list when get_url could not load the page)
c = Cities()
c.scrap_routes(from_city, to_city)
pathes = c.routes[(from_city, to_city)]

In [253]:
# Unpacking nested lists from the scraped JSON ***DON'T TOUCH***
#for i, item in enumerate(pathes[5][8][0]):
    #print(i, '\n\t', item)

In [254]:
# main data extraction code cell
#
# necessary paths setting up
path_bboxes = "../files/csv/bbox short.csv"
path_airports = "../files/csv/airport codes short.csv"

output_path = "../files/output/csv_output/"
output_csv_file = output_path + f'{from_city}-{to_city}' + '.csv'

# create class instances
bb = BoundedBoxes(path_bboxes)
ap = Airports(path_airports)
cc = CurrencyConverter()

# default currency setting
DEFAULT_CUR = 'EUR'

# main data dictionary structure set up: one list per output column
data = {'path_id': [],
        'path_name': [],
        'from_node': [],
        'to_node': [],
        'from_id': [],
        'to_id': [],
        'transport': [],
        'transport_id': [],
        'from_airport': [],
        'to_airport': [],
        'from_airport_id': [],
        'to_airport_id': [],
        'price_EUR': [],
        #'currency':[], # this key may be unnecessary
        'price_local': [], 'currency_local': [],
        'distance_km': [],
        'duration_min': []}

# transport codes manually set up
transport_id = {'flight': 0, 'train': 1, 'bus': 2, 'car': 3, 'ferry': 4}

# vehicle types handled by the non-flight branch
GROUND_TRANSPORT = ('train', 'bus', 'car', 'ferry')


def _price_in_default_currency(amount, currency):
    """Return ``amount`` expressed in ``DEFAULT_CUR``.

    An amount whose currency is already ``DEFAULT_CUR`` or an empty string is
    returned unchanged; any other amount is converted with ``cc`` and
    truncated to an int.
    """
    if currency in (DEFAULT_CUR, ''):
        return amount
    return int(cc.convert(amount, currency, DEFAULT_CUR))


def _flight_row(path_id, path, route):
    """Build one output row for a route whose first item is 'flight'."""
    origin, destination = route[2], route[3]
    amount, currency = route[11][0][0], route[11][0][1]
    return {
        'path_id': path_id,
        'path_name': path[4],
        'from_node': origin[1],
        'to_node': destination[1],
        'from_id': bb.get_bb_id(origin[2:4]),
        'to_id': bb.get_bb_id(destination[2:4]),
        'transport': route[0],
        'transport_id': transport_id['flight'],
        'from_airport': origin[0],
        'to_airport': destination[0],
        'from_airport_id': ap.get_airport_id(origin[0]),
        'to_airport_id': ap.get_airport_id(destination[0]),
        'price_EUR': _price_in_default_currency(amount, currency),
        # flights carry no local price or distance in this table
        'price_local': '',
        'currency_local': '',
        'distance_km': '',
        'duration_min': int(route[4] / 60),  # sec to min
    }


def _ground_row(path_id, path, route):
    """Build one output row for a train, bus, car or ferry route."""
    origin, destination = route[6], route[7]
    amount, currency = route[13][0][0], route[13][0][1]
    return {
        'path_id': path_id,
        'path_name': path[4],
        'from_node': origin[1],
        'to_node': destination[1],
        'from_id': bb.get_bb_id(origin[2:4]),
        'to_id': bb.get_bb_id(destination[2:4]),
        'transport': route[1],
        'transport_id': transport_id[route[1]],
        # airport columns only apply to flights
        'from_airport': '',
        'to_airport': '',
        'from_airport_id': '',
        'to_airport_id': '',
        'price_EUR': _price_in_default_currency(amount, currency),
        'price_local': route[14][0][0],
        'currency_local': route[14][0][1],
        'distance_km': int(route[5]),
        'duration_min': int(route[3] / 60),  # sec to min
    }


# extract all direct routes from all paths and fill the main data dictionary
for path_id, path in enumerate(pathes):
    for route in path[8][:-1]:
        if route[0] == 'flight':  # for flights only
            row = _flight_row(path_id, path, route)
        elif route[1] in GROUND_TRANSPORT:  # for main types of vehicles
            row = _ground_row(path_id, path, route)
        else:
            continue

        # append the finished row in one go so all columns keep the same length
        for column, values in data.items():
            values.append(row[column])

tmp_df = pd.DataFrame(data)

# write the table to "../files/output/csv_output/" (the folder is created if missing)
Path(output_path).mkdir(parents=True, exist_ok=True)
tmp_df.to_csv(output_csv_file)

print(tmp_df)

    path_id                            path_name                 from_node  \
0         0              Fly Rome to Oslo, train              Roma Termini   
1         0              Fly Rome to Oslo, train                      Rome   
2         0              Fly Rome to Oslo, train     Oslo lufthavn stasjon   
3         1   Fly Rome to Oslo Sandefjord, train              Roma Termini   
4         1   Fly Rome to Oslo Sandefjord, train                      Rome   
5         1   Fly Rome to Oslo Sandefjord, train  Sandefjord lufthavn Torp   
6         1   Fly Rome to Oslo Sandefjord, train              Torp stasjon   
7         2  Train to Naples, fly to Oslo, train              Roma Termini   
8         2  Train to Naples, fly to Oslo, train             Bus Metropark   
9         2  Train to Naples, fly to Oslo, train                    Naples   
10        2  Train to Naples, fly to Oslo, train     Oslo lufthavn stasjon   
11        3                                Train              Ro