In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import polars as pl
from currency_converter import CurrencyConverter, ECB_URL, SINGLE_DAY_ECB_URL
import compress_json

# from pathlib import Path
# from itertools import permutations

In [4]:
# Paths used throughout the notebook.

# Input lookup tables (CSV files read by the id-lookup helpers).
BBOXES_CSV = "../files/csv/bbox short.csv"
AIRPORTS_CSV = "../files/csv/airport codes short.csv"
LOC_AIRPORTS_CSV = "../files/csv/locations with airports.csv"

# Output directories; both values end with a trailing slash.
PATH_CSV_OUTPUT = "../files/output/csv_output/"
PATH_JSON_OUTPUT = "../files/output/json_output/"

In [2]:
def get_url(url, english=True, timeout=30):
    """Download a page and return the route data embedded in it.

    The page is expected to contain a ``<meta id="deeplinkTrip">`` tag whose
    ``content`` attribute is a JSON document; element ``[2][1]`` of that
    document is returned.

    Args:
        url: Address of the page to scrape.
        english: When true, send an ``Accept-Language`` header asking for
            English content; otherwise send no extra headers.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The parsed route data, or an empty list when the response status is
        not 200 or the page carries no ``deeplinkTrip`` meta tag.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
    """
    headers = {'Accept-Language': 'en-US,en;q=0.5'} if english else None
    r = requests.get(url, headers=headers, timeout=timeout)

    if r.status_code != 200:
        print("Invalid page!")
        return []

    soup = BeautifulSoup(r.content, 'html.parser')
    dis = soup.find('meta', {'id': 'deeplinkTrip'})
    # soup.find returns None when the tag is absent; treat that like an
    # invalid page instead of failing with a TypeError on dis["content"].
    if dis is None or not dis.get("content"):
        print("Route data not found on page!")
        return []

    return json.loads(dis["content"])[2][1]

In [3]:
class Cities:
    """Collects scraped routes between pairs of cities.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Prefix to which "<city1>/<city2>" is appended to build a page URL.
        self.base_url = 'https://www.rome2rio.com/map/'
        # Maps (city1, city2) -> whatever get_url returned for that pair.
        self.routes = {}

    def scrap_routes(self, city1, city2):
        """Scrape the routes from ``city1`` to ``city2`` and store them.

        The result of ``get_url`` is saved in ``self.routes`` under the key
        ``(city1, city2)``, replacing any earlier entry for the same pair.
        """
        tmp_url = self.base_url + city1 + '/' + city2
        self.routes[(city1, city2)] = get_url(tmp_url)

In [27]:
def get_bb_id(coords) -> int:
    """Return the id of the first bounding box that contains a point.

    The boxes are read from ``BBOXES_CSV`` (no header row; columns are
    id, lat_1, lat_2, lon_1, lon_2). A point matches a box when
    ``lat_1 <= coords[0] <= lat_2`` and ``lon_1 <= coords[1] <= lon_2``.

    Args:
        coords: Indexable pair ``(latitude, longitude)``.

    Returns:
        The ``id`` of the first matching row, or ``-1`` when no box matches
        or the lookup fails for any reason (missing file, bad input, ...).
    """
    try:
        df = pl.read_csv(BBOXES_CSV, has_header=False,
                         new_columns=['id', 'lat_1', 'lat_2', 'lon_1', 'lon_2'])

        lat, lon = coords[0], coords[1]
        cond_1 = (lat >= df['lat_1']) & (lat <= df['lat_2'])
        cond_2 = (lon >= df['lon_1']) & (lon <= df['lon_2'])

        filter_df = df.filter(cond_1 & cond_2)
        if filter_df.height == 0:
            return -1

        return filter_df['id'][0]

    # Best-effort lookup: any ordinary error maps to -1, but unlike a bare
    # ``except:`` this still lets KeyboardInterrupt / SystemExit through.
    except Exception:
        return -1

In [28]:
def get_airport_id(code: str) -> int:
    """Return the numeric id stored for an airport code.

    The table is read from ``AIRPORTS_CSV`` (no header row; columns are
    code, id). ``code`` is lower-cased before it is compared with the
    ``code`` column.

    Returns:
        The ``id`` of the first matching row, or ``-1`` when the code is not
        found or the lookup fails for any reason.
    """
    try:
        df = pl.read_csv(AIRPORTS_CSV, has_header=False, new_columns=['code', 'id'])
        filter_df = df.filter(df['code'] == code.lower())
        if filter_df.height == 0:
            return -1

        return filter_df['id'][0]

    # Best-effort lookup: ordinary errors map to -1, while KeyboardInterrupt
    # and SystemExit are no longer swallowed as with a bare ``except:``.
    except Exception:
        return -1

In [29]:
def get_airport_id_for_loc(location: str) -> int:
    """Return the airport id stored for a location name.

    The table is read from ``LOC_AIRPORTS_CSV`` (with a header row); rows
    are matched on an exact comparison of the ``city`` column with
    ``location``.

    Returns:
        The ``id`` of the first matching row, or ``-1`` when the location is
        not found or the lookup fails for any reason.
    """
    try:
        df = pl.read_csv(LOC_AIRPORTS_CSV, has_header=True)
        filter_df = df.filter(df['city'] == location)
        if filter_df.height == 0:
            return -1

        return filter_df['id'][0]

    # Best-effort lookup: ordinary errors map to -1, while KeyboardInterrupt
    # and SystemExit are no longer swallowed as with a bare ``except:``.
    except Exception:
        return -1
            

In [5]:
# City pair whose routes are scraped below.
# Alternative pair: from_city, to_city = 'Moscow', 'Saint Petersburg'
from_city = 'Tehran'
to_city = 'Lhasa'

In [18]:
# Load a previously saved route file for a pair of city ids and show it.
from_city_id = 8
to_city_id = 9
filepath = f'{PATH_JSON_OUTPUT}/{from_city_id} - {to_city_id}.json.gz'
D1 = compress_json.load(filepath)  # reads the gzip-compressed JSON file
print(D1)

[[0, ['node', 'Tehrān', 35.69439, 51.42151, 'IR', 'Asia/Tehran', 2.5], ['node', 'Tashkent', 41.31162, 69.27824, 'UZ', 'Asia/Tashkent', 2.5], 'Fly-from-Tehran-Imam-Khomeini', 'Fly from Tehran Imam Khomeini', 33888.103, 26238.103, 7650, [['car', 'taxi', 'yellow', 1218.103, 0, 14.06933, ['node', 'Tehrān', 35.69439, 51.42151, 'IR', 'Asia/Tehran', 2.5], ['station', 'Azadi Square', 35.68333, 51.33333, 'IR'], '', 'False', '', '', 'False', [[1, 'USD', 0.96, 'False', ''], [1, 'USD', 1.13, 'False', ''], [2, 'USD', 1.13, 'False', '']], [[17, 'TRY', 17.93, 'False', ''], [21, 'TRY', 21.19, 'False', ''], [22, 'TRY', 21.19, 'False', '']], 53, 53, [], [], [], [], '', '', []], ['transit', 'bus', 'orange', 3600, 1800, 49.21783, ['station', 'Azadi Square', 35.68333, 51.33333, 'IR', 'Asia/Tehran'], ['station', 'Tehran Imam Khomeini', 35.40898, 51.15497, 'IR', 'Asia/Tehran'], '', 'False', [3600, 49.21783, ['station', 'Azadi Square', 35.68333, 51.33333, 'IR', 'Asia/Tehran'], ['station', 'Tehran Imam Khomein

In [13]:
# Quick check of the single-day rate table: convert a sample CNY amount and
# show the date range for which CNY rates are available.
cc = CurrencyConverter(SINGLE_DAY_ECB_URL)
print(cc.convert(7500, 'CNY'))
print(cc.bounds['CNY'])

1041.0160316468873
Bounds(first_date=datetime.date(2023, 1, 6), last_date=datetime.date(2023, 1, 6))


In [6]:
# Scrape all available paths between the selected cities.
c = Cities()
city_pair = (from_city, to_city)
c.scrap_routes(*city_pair)
pathes = c.routes[city_pair]

In [8]:
# Unpacking nested lists from the scraped JSON ***DON`T TOUCH***
# Prints every element of one route segment (pathes[0][8][3]) together with
# its index, so the positions used by the extraction code can be looked up.
for i, item in enumerate(pathes[0][8][3]):
    print(i, '\n\t', item)

0 
	 flight
1 
	 turquoise
2 
	 ['IKA', 'Tehran Imam Khomeini', '', 35.40898, 51.15497, 'Tehrān', 'Iran', 'AS', '38 to 43 flights daily', 285]
3 
	 ['LXA', 'Lhasa', '', 29.29014, 90.90083, 'Tibet Autonomous Region', 'China', 'AS', '63 to 64 flights daily', 447]
4 
	 45000
5 
	 66600
6 
	 7200
7 
	 58
8 
	 2 to 15 flights daily
9 
	 [15, 5, 15, 2, 3, 14, 4]
10 
	 True
11 
	 [[2600, 'CNY', 2622.44, 'False', ''], [4700, 'CNY', 4702.87, 'False', ''], [7500, 'CNY', 7131.18, 'False', '']]
12 
	 [['PEK', 'Beijing Capital', '', 40.07962, 116.5924, 'Beijing Municipality', 'China', 'AS', '441 to 457 flights daily', 3148]]
13 
	 []
14 
	 ['/tickets/Tehran-Imam-Khomeini-(IKA)(35.40898,51.15497)/Lhasa-(LXA)(29.29014,90.90083)?oDateTime={oDateTime}&iDateTime={iDateTime}&adults={adults}&seniors={seniors}&youths={youths}&ages={ages}&checkoutExitLabel={checkoutExitLabel}&mode=plane&requestId=181-20230108-034657-9491953', 'Tickets', 'Tickets:plane', 'Tehran-Imam-Khomeini-(IKA)(35.40898,51.15497)', 'Lhas

In [33]:
# Create one converter per rate source (single-day table and full history),
# convert the same USD amount with each, then show the date range each
# converter holds for USD.
cc = CurrencyConverter(SINGLE_DAY_ECB_URL)
print(cc.convert(100, 'USD'))
dd = CurrencyConverter(ECB_URL)
print(dd.convert(100, 'USD'))
for converter in (cc, dd):
    print(converter.bounds['USD'])

94.8316737790422
94.8316737790422
Bounds(first_date=datetime.date(2023, 1, 3), last_date=datetime.date(2023, 1, 3))
Bounds(first_date=datetime.date(1999, 1, 4), last_date=datetime.date(2023, 1, 3))


In [34]:

# Main data-extraction cell.
#
# Flattens every scraped path in `pathes` into one row per route segment and
# saves the resulting table as CSV and JSON.
# Uses names created by earlier cells: pathes, from_city, to_city, cc,
# PATH_CSV_OUTPUT, PATH_JSON_OUTPUT and the get_*_id lookup helpers.
csv_output_file = PATH_CSV_OUTPUT + f'{from_city}-{to_city}' + '.csv'
json_output_file = PATH_JSON_OUTPUT + f'{from_city}-{to_city}' + '.json'

# default currency setting
DEFAULT_CUR = 'EUR'

# TODO: set up the set of rare transport types (placeholder, nothing defined yet)

# main data dictionary structure: one list per output column
data = {'path_id':[],
        'path_name':[],
        'from_node':[],
        'to_node':[],
        'from_id':[],
        'to_id':[],
        'transport':[],
        'transport_id':[],
        'from_airport':[],
        'to_airport':[],
        'from_airport_id':[],
        'to_airport_id':[],
        'price_EUR':[],
        #'currency':[], # this key may be unnecessary
        'price_local':[], 'currency_local':[],
        'distance_km':[],
        'duration_min':[]}

# transport codes, set up manually
transport_types = ['fly', 'flight', 'bus', 'train', 'nighttrain', 'drive', 'car', 'taxi', 'walk', 'towncar',
                'rideshare', 'shuttle', 'carferry']
transport_id = {'fly': 1, 'flight': 1, 'bus': 2, 'train': 3, 'nighttrain': 3, 'drive': 4, 'car': 4, 'taxi': 5, 'walk': 6, 'towncar': 7,
                'rideshare': 8, 'shuttle': 9, 'carferry': 10}

# The first two entries are the air-travel labels; the rest are ground vehicles.
flight_types = transport_types[:2]
ground_types = transport_types[2:]


def _price_in_default_cur(amount, currency, floor=None):
    """Return `amount` expressed in DEFAULT_CUR.

    Amounts already in DEFAULT_CUR, or with an empty currency code, are
    returned unchanged. Anything else is converted with `cc` and rounded to
    a whole number; when `floor` is given the converted value is raised to
    at least `floor` before rounding.
    """
    if currency in (DEFAULT_CUR, ''):
        return amount
    converted = cc.convert(amount, currency, DEFAULT_CUR)
    if floor is not None:
        converted = max(converted, floor)
    return round(converted)


# extract all direct routes from all paths and fill the main data dictionary
for path_id, path in enumerate(pathes):
    for route in path[8][:-1]:
        # TODO: decide how to handle transport types missing from transport_types

        if route[0] in flight_types: # for fly and flights only
            # Airport entries are laid out as [code, name, '', lat, lon, ...]
            # (see the printed segment above), so the coordinates are at [3:5].
            data['path_id'].append(path_id)
            data['path_name'].append(path[4])
            data['from_node'].append(route[2][1])
            data['to_node'].append(route[3][1])
            data['from_id'].append(get_bb_id(route[2][3:5]))
            data['to_id'].append(get_bb_id(route[3][3:5]))
            data['transport'].append(route[0])
            data['transport_id'].append(transport_id[route[0]])
            data['from_airport'].append(route[2][0])
            data['to_airport'].append(route[3][0])
            data['from_airport_id'].append(get_airport_id(route[2][0]))
            data['to_airport_id'].append(get_airport_id(route[3][0]))
            data['price_EUR'].append(_price_in_default_cur(route[11][0][0], route[11][0][1]))
            #data['currency'].append(DEFAULT_CUR)
            data['price_local'].append('')
            data['currency_local'].append('')
            data['distance_km'].append('')
            data['duration_min'].append(int(route[4] / 60)) # sec to min

        elif route[1] in ground_types: # for other types of vehicles
            data['path_id'].append(path_id)
            data['path_name'].append(path[4])
            data['from_node'].append(route[6][1])
            data['to_node'].append(route[7][1])
            data['from_id'].append(get_bb_id(route[6][2:4]))
            data['to_id'].append(get_bb_id(route[7][2:4]))
            data['transport'].append(route[1])
            data['transport_id'].append(transport_id[route[1]])
            data['from_airport'].append('')
            data['to_airport'].append('')
            data['from_airport_id'].append(get_airport_id_for_loc(route[6][1]))
            data['to_airport_id'].append(get_airport_id_for_loc(route[7][1]))
            # Converted ground prices are floored at 1 so that small fares do
            # not round down to 0 (the original used `==` here, a no-op).
            data['price_EUR'].append(_price_in_default_cur(route[13][0][0], route[13][0][1], floor=1.0))
            #data['currency'].append(DEFAULT_CUR)           # may be unnecessary
            data['price_local'].append(route[14][0][0])
            data['currency_local'].append(route[14][0][1])
            data['distance_km'].append(round(route[5]))
            data['duration_min'].append(round(route[3] / 60)) # sec to min

tmp_df = pd.DataFrame(data)

# write the table to "../files/output/csv_output/" and "../files/output/json_output/"
tmp_df.to_csv(csv_output_file)
tmp_df.to_json(json_output_file)

print(tmp_df)

    path_id                                       path_name  \
0         0                      Train to Oslo, fly to Rome   
1         0                      Train to Oslo, fly to Rome   
2         0                      Train to Oslo, fly to Rome   
3         1           Train to Oslo Sandefjord, fly to Rome   
4         1           Train to Oslo Sandefjord, fly to Rome   
5         1           Train to Oslo Sandefjord, fly to Rome   
6         1           Train to Oslo Sandefjord, fly to Rome   
7         2  Train to Oslo Sandefjord, fly to Rome Ciampino   
8         2  Train to Oslo Sandefjord, fly to Rome Ciampino   
9         2  Train to Oslo Sandefjord, fly to Rome Ciampino   
10        2  Train to Oslo Sandefjord, fly to Rome Ciampino   
11        2  Train to Oslo Sandefjord, fly to Rome Ciampino   
12        3             Train to Oslo, fly to Naples, train   
13        3             Train to Oslo, fly to Naples, train   
14        3             Train to Oslo, fly to Naples, t