In [135]:
import re
import os
from bs4 import BeautifulSoup
import csv

In [136]:
# Folder containing the saved HTML pages that the parsing loop further down
# lists and reads; each page's file name is turned into a country name there.
DIRECTORY_PATH = './pages'

In [137]:
class Node:
    """One airport in the route graph.

    Attributes:
        uid: integer identifier; edges refer to nodes through this value.
        name: airport name as it appears in the flight link text.
        code: airport code taken from the flight link (e.g. 'LTN').
        country: country the airport is attributed to, or None when it has
            not been attributed to any country.
    """

    def __init__(self, code: str, uid: int, name: str, country: str = None):
        # Single tuple assignment; attributes are created in the order
        # uid, name, code, country.
        self.uid, self.name, self.code, self.country = uid, name, code, country

# class Edge:
#     def __init__(self, start: Node, end: Node):
#         self.start = start
#         self.end = end

In [138]:
def get_codes(link):
    """Return the last two path segments of a timetable link as a tuple.

    For 'https://wizzair.com/en-gb/flights/timetable/LTN/TIA' this gives
    ('LTN', 'TIA'). The segments are located by splitting on '/', not by a
    fixed-width slice, so a trailing slash, a query string or a fragment
    does not shift the result.

    Args:
        link: URL or path whose last two segments are the airport codes.

    Returns:
        A 2-tuple of strings: (second-to-last segment, last segment).

    Raises:
        ValueError: if the link has fewer than two non-empty path segments.
    """
    # Drop any '#fragment' and '?query' part before looking at the path.
    path = link.strip().split('#', 1)[0].split('?', 1)[0]
    # Filtering out empty pieces ignores leading, trailing and doubled slashes.
    segments = [segment for segment in path.split('/') if segment]
    if len(segments) < 2:
        raise ValueError(f"Cannot extract two airport codes from link: {link!r}")
    return tuple(segments[-2:])

def get_airport_names(text):
    """Extract the two airport names from a 'Flights from X to Y' label.

    The first group of the pattern is greedy, so when ' to ' occurs more
    than once the split happens at the last occurrence.

    Args:
        text: label text such as 'Flights from London Luton to Tirana'.

    Returns:
        A 2-tuple (origin name, destination name), each stripped of
        surrounding whitespace.

    Raises:
        ValueError: if the text does not contain the expected pattern.
    """
    match = re.search(r'Flights from (.+) to (.+)', text)
    if match is None:
        # Fail with the offending text instead of an AttributeError on None.
        raise ValueError(
            f"Expected 'Flights from <origin> to <destination>', got: {text!r}"
        )
    origin, destination = match.groups()
    return origin.strip(), destination.strip()

In [139]:
# Quick check of both parsing helpers on one known link / label pair.
example_codes = get_codes('https://wizzair.com/en-gb/flights/timetable/LTN/TIA')
print(example_codes)
example_names = get_airport_names('Flights from London Luton to Tirana')
print(example_names)

('LTN', 'TIA')
('London Luton', 'Tirana')


In [140]:
# Airport code -> Node; populated by update_nodes_and_edges.
Nodes = {}
# Directed routes stored as (origin node uid, destination node uid) tuples.
Edges = []
# Next unused node uid; update_nodes_and_edges increments it for each new Node.
uid = 0

def get_all_incoming_flights_for_airport(a_tags):
    """Turn the flight anchors among `a_tags` into (codes, names) pairs.

    An anchor counts as a flight link when its first child is text that
    contains "Flights from". Other anchors are skipped without touching
    their attributes, so unrelated links (empty anchors, anchors wrapping a
    nested tag, anchors without an href) no longer abort the scan.

    Args:
        a_tags: iterable of anchor elements exposing `.contents` and
            item access for attributes (BeautifulSoup tags in this notebook).

    Returns:
        A list of ((code_from, code_to), (name_from, name_to)) tuples, in
        the order the anchors were given.

    Raises:
        KeyError: if a flight anchor has no 'href' attribute.
    """
    flights = []
    for a_tag in a_tags:
        children = a_tag.contents
        # BeautifulSoup text nodes are str subclasses; anything else (no
        # children, or a nested tag first) cannot hold the flight label.
        if not children or not isinstance(children[0], str):
            continue
        content = children[0].strip().replace("\n", "")
        if "Flights from" not in content:
            continue
        # Only flight anchors need an href, so look it up after the filter.
        link = a_tag['href'].strip()
        flights.append((get_codes(link), get_airport_names(content)))
    return flights

def get_all_incoming_flights_for_country(text: str):
    """Collect the flight entries found in one page's HTML.

    The page is parsed with 'html.parser'; every <div> with class
    'accordion-item__body' is visited in document order and the anchors
    inside it are handed to get_all_incoming_flights_for_airport.

    Args:
        text: HTML source of the page.

    Returns:
        One flat list with the entries of all accordion bodies, in order.
    """
    soup = BeautifulSoup(text, 'html.parser')
    flights = []
    for accordion_body in soup.find_all('div', {'class': 'accordion-item__body'}):
        anchors = accordion_body.find_all('a')
        flights.extend(get_all_incoming_flights_for_airport(anchors))
    return flights

def update_nodes_and_edges(country, flights):
    """Merge one batch of flights into the global Nodes / Edges graph.

    For every flight, the origin and destination airports are looked up in
    Nodes by code and created with the next free uid when missing. The
    destination airport is always tagged with `country` (overwriting any
    previous value); an origin airport created here gets no country.
    One (origin uid, destination uid) tuple is appended to Edges per flight.

    Args:
        country: country name assigned to every destination airport.
        flights: iterable of ((code_from, code_to), (name_from, name_to)).
    """
    global uid
    for (origin_code, dest_code), (origin_name, dest_name) in flights:
        origin = Nodes.get(origin_code)
        if origin is None:
            origin = Node(origin_code, uid, origin_name)
            Nodes[origin_code] = origin
            uid += 1

        destination = Nodes.get(dest_code)
        if destination is None:
            destination = Node(dest_code, uid, dest_name, country)
            Nodes[dest_code] = destination
            uid += 1
        else:
            # Already known (e.g. first seen as an origin): record its country.
            destination.country = country

        Edges.append((origin.uid, destination.uid))
        
# Parse every saved '.html' page whose name does not contain "Main" and merge
# its flights into the graph. os.listdir returns names in arbitrary order and
# node uids depend on processing order, so the listing is sorted to make the
# uid assignment reproducible between runs.
for filename in sorted(os.listdir(DIRECTORY_PATH)):
    page_name, extension = os.path.splitext(filename)
    # Match on the real extension so the country name below is always the
    # file name minus '.html' (a substring test could let other files in).
    if extension != '.html' or "Main" in filename:
        continue

    # NOTE(review): assumes the pages were saved as UTF-8 — confirm.
    with open(os.path.join(DIRECTORY_PATH, filename), 'r', encoding='utf-8') as f:
        text_for_country = f.read()

    # The file is closed before parsing; only the text is needed from here on.
    flights_for_country = get_all_incoming_flights_for_country(text_for_country)
    country_name = page_name.replace('_', ' ')
    print(country_name)
    update_nodes_and_edges(country_name, flights_for_country)

print("Nodes:", len(Nodes))
print("Edges:", len(Edges))
    
    

Hungary
Albania
Armenia
Austria
Azerbaijan
Belgium
Bosnia and Herzegovina
Bulgaria
Croatia
Cyprus
Czech Republic
Denmark
Estonia
Finland
France
Georgia
Germany
Greece
Iceland
Israel
Italy
Kazakhstan
Kosovo
Latvia
Lithuania
Malta
Moldova
Montenegro
Morocco
Netherlands
North Macedonia
Norway
Poland
Portugal
Romania
Russia
Serbia
Slovakia
Slovenia
Spain
Sweden
Switzerland
Ukraine
United Arab Emirates
United Kingdom
Nodes: 156
Edges: 1501


In [141]:
# Export the graph through the csv module so that fields containing commas
# or quotes (airport names are free text) are quoted instead of corrupting
# the row. Fields are separated by a bare ',' (no padding space, so header
# names carry no leading blank) and a missing country (None) is written as
# an empty field.
# newline='' is what the csv module requires of the file object;
# lineterminator='\n' keeps plain LF line endings.
with open('nodes.csv', 'w', newline='', encoding='utf-8') as f:
    node_writer = csv.writer(f, lineterminator='\n')
    node_writer.writerow(["uid", "code", "name", "country"])
    for node in Nodes.values():
        node_writer.writerow([node.uid, node.code, node.name, node.country])

with open('edges.csv', 'w', newline='', encoding='utf-8') as f:
    edge_writer = csv.writer(f, lineterminator='\n')
    edge_writer.writerow(["source", "target"])
    # Each edge is already a (source uid, target uid) pair.
    edge_writer.writerows(Edges)
        

In [142]:
# Count pairs of positions (i, j), i <= j, where edge j is the exact reverse
# of edge i, i.e. routes that can be flown in both directions.
suma = 0
for position, (origin, destination) in enumerate(Edges):
    suma += sum(
        1
        for other_origin, other_destination in Edges[position:]
        if origin == other_destination and destination == other_origin
    )
# 744 in the recorded run: those pairs cover 1488 of the 1501 edges, so
# (assuming no duplicate edges) 13 routes have no return leg, or the data has errors.
print(suma)

744
