## Get coordinates for selected cities

In [None]:
import pandas as pd
import requests

# CSV the looked-up city coordinates are written to at the end of this cell
coordinates_file = 'data/coordinates.csv'

def get_coordinates(city, country, username='v_kochk'):
    """Look up the coordinates of a city with the GeoNames search API.

    Args:
        city: City name used as the search query.
        country: Country code used to restrict the search (e.g. ``'AT'``).
        username: GeoNames user name the request is made under.

    Returns:
        A ``(lat, lng)`` tuple taken from the first search hit, or ``None``
        when the response contains no hits.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    # Passing the query as params lets requests URL-encode names that contain
    # spaces or non-ASCII characters.
    response = requests.get(
        'http://api.geonames.org/searchJSON',
        params={'q': city, 'country': country, 'maxRows': 1, 'username': username},
        timeout=30,  # never wait forever on a stalled connection
    )
    response.raise_for_status()
    data = response.json()

    # A payload without a "geonames" key (presumably an error report -- verify
    # against the API docs) is treated like "no hits" instead of raising KeyError.
    matches = data.get('geonames')
    if matches:
        city_data = matches[0]
        return city_data['lat'], city_data['lng']
    return None

# Selected cities as (city name, country code) pairs; each pair is passed to
# get_coordinates(city, country) below.
sel_cities = [
    ('Vienna', 'AT'),
    ('Brussels', 'BE'),
    ('Sofia', 'BG'),
    ('Zagreb', 'HR'),
    ('Prague', 'CZ'),
    ('Copenhagen', 'DK'),
    ('Tallinn', 'EE'),
    ('Paris', 'FR'),
    ('Berlin', 'DE'),
    ('Dresden', 'DE'),
    ('Munich', 'DE'),
    ('Budapest', 'HU'),
    ('Rome', 'IT'),
    ('Riga', 'LV'),
    ('Vilnius', 'LT'),
    ('Luxembourg', 'LU'),
    ('Amsterdam', 'NL'),
    ('Oslo', 'NO'),
    ('Warsaw', 'PL'),
    ('Lisbon', 'PT'),
    ('Bucharest', 'RO'),
    ('Ljubljana', 'SI'),
    ('Bratislava', 'SK'),
    ('Stockholm', 'SE'),
    ('Madrid', 'ES'),
    ('Bilbao', 'ES'),
    ('Bern', 'CH'),
    ('Istanbul', 'TR'),
    ('London', 'GB')
]

# Look up every selected city; cities without a result are left out
sel_cities_coordinates = []
for city, country in sel_cities:
    coordinates = get_coordinates(city, country)
    if not coordinates:
        continue
    sel_cities_coordinates.append((city, coordinates[0], coordinates[1]))

# Build the table, write it to the coordinates CSV and display it
sel_cities_df = pd.DataFrame(
    sel_cities_coordinates,
    columns=['city', 'latitude', 'longitude'],
)
sel_cities_df.to_csv(coordinates_file, index=False)
print(sel_cities_df)

## Load map in Vega-Altair

In [None]:
import altair as alt

# Load the TopoJSON of Europe
europe = alt.topo_feature('https://dmws.hkvservices.nl/dataportal/data.asmx/read?database=vega&key=europe', 'europe')

# Create the base map
base = alt.Chart(europe).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project(
    'mercator'
).properties(
    width=1500,
    height=1000
)

# Add the cities. sel_cities_df is the city table built in the coordinates
# cell above (columns: city, latitude, longitude).
points = alt.Chart(sel_cities_df).mark_circle(color='#fc9272', size=50).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    tooltip=['city:N']
)

# Combine base map and cities, then display the layered chart
map_with_cities = base + points
map_with_cities

## Plot selected cities on Vega-Altair Map

In [None]:
import altair as alt

# Bounding box (in degrees) the map view is fitted to
north_lat, south_lat = 60.0, 34.0
west_lon, east_lon = -5.0, 50.0

# GeoJSON-like Feature holding the bounding box as a closed polygon ring
extent_feature = {
    "type": "Feature",
    "geometry": {
        "type": "Polygon",
        "coordinates": [[
            [east_lon, north_lat],
            [east_lon, south_lat],
            [west_lon, south_lat],
            [west_lon, north_lat],
            [east_lon, north_lat],
        ]],
    },
    "properties": {},
}

# Load the TopoJSON of Europe
europe = alt.topo_feature(
    'https://raw.githubusercontent.com/leakyMirror/map-of-europe/refs/heads/master/TopoJSON/europe.topojson',
    'europe',
)

# Base map, clipped and fitted to the extent defined above
base = (
    alt.Chart(europe)
    .mark_geoshape(clip=True, fill='lightgray', stroke='white')
    .project(type='mercator', fit=extent_feature)
    .properties(width=1500, height=700)
)

# One circle per selected city
points = (
    alt.Chart(sel_cities_df)
    .mark_circle(color='#fc9272', size=50)
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        tooltip=['city:N'],
    )
)

# Layer the cities over the base map and display the result
map_with_cities = base + points
map_with_cities

## Alternative - Plotly Map

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

# One marker per city. sel_cities_df is the city table built in the
# coordinates cell above (columns: city, latitude, longitude).
fig.add_trace(go.Scattergeo(
    lon = sel_cities_df['longitude'],
    lat = sel_cities_df['latitude'],
    text = sel_cities_df['city'],
    mode = 'markers',
    marker = dict(
        size = 8,
        color = '#fc9272',
        symbol = 'circle'
    ),
    hovertemplate=(
        '<b>%{text}</b><br>' +
        'Lat: %{lat:.2f} Lon: %{lon:.2f}<br>' +
        '<extra></extra>'  #hide trace name
    ),
    hoverlabel=dict(
        bgcolor="white",
        bordercolor="white",
        font_size=12,
        font_family="Open Sans",
        font_color="black"
    )
))

# Map styling and the longitude/latitude window that is shown
fig.update_layout(
    title = 'Train Map of Europe',
    geo = dict(
        scope = 'europe',
        projection_type = 'mercator',
        showland = True,
        landcolor = '#bdbdbd',
        showcoastlines = False,
        showlakes = False,
        countrywidth = 0.5,
        countrycolor = 'white',
        lonaxis=dict(
            range=[-11.0, 32.0]
        ),
        lataxis=dict(
            range=[36.0, 60.0]
        ),
        resolution=50
    ),
    width=1000,
    height=800
)

fig.show()

## Get CO2 emissions data and train travel time from [TravelCO2 API](https://travelco2.com/documentation)

In [None]:
import requests

# Endpoint the example request is posted to
url = "https://travelco2.com/api/v1/simpletrips"

# Example request body: a single city pair compared for flying and public transport
payload = {
    "from": "Berlin, Germany",
    "to": "Amsterdam, Netherlands",
    "ways": 1,
    "people": 1,
    "language": "en",
    "title": "Comparing flying and public transport from Berlin to Amsterdam.",
    "transport_types": ["flying", "public-transport"]
}
# YOUR_APIKEY is a placeholder for a real API key
headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "Authorization": "Bearer YOUR_APIKEY"
}

response = requests.post(url, json=payload, headers=headers)

# Show the decoded JSON response
print(response.json())

In [None]:
#Version with manual skipping and stopping

import requests
import os
import pandas as pd
import json
from datetime import datetime
import sys
import keyboard
import time

# File paths: directory for the API response logs and the CSV with the city list
log_directory = 'api_logs'
coordinates_file = 'data/coordinates.csv'

# Generate a unique log file name based on the current timestamp,
# so every run of this cell writes to its own log file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_directory, f'api_responses_{timestamp}.log')

# Ensure the log directory exists
os.makedirs(log_directory, exist_ok=True)

# Read the city names (the 'city' column) from the CSV file
coordinates_df = pd.read_csv(coordinates_file)
cities = coordinates_df['city'].tolist()

# Function to make API call and process data
def get_trip_data(start, end):
    """Request a flying vs. public-transport comparison for one city pair and log it.

    The decoded JSON response is appended to ``log_file`` as an indented JSON
    document followed by a newline.

    Args:
        start: Location sent as the ``from`` field of the request.
        end: Location sent as the ``to`` field of the request.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://travelco2.com/api/v1/simpletrips"
    payload = {
        "from": start,
        "to": end,
        "ways": 1,
        "people": 1,
        "language": "en",
        "title": f"Comparing flying and public transport from {start} to {end}.",
        "transport_types": ["flying", "public-transport"]
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": "Bearer YOUR_APIKEY"
    }

    # Without a timeout a single stalled connection would block the whole run.
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    data = response.json()

    # Log the response to a file; the encoding is pinned so the log does not
    # depend on the platform default.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, indent=4))
        f.write('\n')

# Function to handle user actions
def handle_action(request_counter, start, end):
    """Announce the next request and wait for the user's key press.

    Polls the keyboard until one of three keys is pressed: Esc ends the
    script through ``sys.exit()``, Shift returns ``'skip'`` and Enter returns
    ``'proceed'``.

    Args:
        request_counter: Number shown for the upcoming request.
        start: First city of the pair.
        end: Second city of the pair.

    Returns:
        ``'skip'`` or ``'proceed'``.
    """
    print(f"API request {request_counter}: {start} to {end}.")
    print("Press Shift to Skip, Enter to Proceed, Esc to Stop")

    # (key, message, result), listed in the order the keys are checked
    choices = (
        ('shift', "Skipping request...", 'skip'),
        ('enter', "Proceeding with request...", 'proceed'),
    )

    while True:
        if keyboard.is_pressed('esc'):
            print("Stopping script...")
            sys.exit()

        for key, message, result in choices:
            if keyboard.is_pressed(key):
                time.sleep(0.3)  # Wait to avoid multiple detections
                print(message)
                return result

        time.sleep(0.1)  # Short sleep to prevent high CPU usage

# Visit every unordered pair of cities once and let the user decide per pair
processed_pairs = set()
request_counter = 0
for i in range(len(cities)):
    for j in range(i + 1, len(cities)):
        # Guard: the pair (in either direction) was handled before
        if (cities[i], cities[j]) in processed_pairs or (cities[j], cities[i]) in processed_pairs:
            print(f"Request from {cities[i]} to {cities[j]} has already been processed.")
            continue

        action = handle_action(request_counter + 1, cities[i], cities[j])
        if action not in ('proceed', 'skip'):
            continue

        if action == 'proceed':
            get_trip_data(cities[i], cities[j])
            request_counter += 1
            print(f"API request {request_counter} completed: {cities[i]} to {cities[j]}")

        # Both proceeding and skipping mark the pair as handled
        processed_pairs.add((cities[i], cities[j]))

print("All API requests have been completed and logs have been stored.")

In [None]:
#Version with skipping defined number of requests

import requests
import os
import pandas as pd
import json
import time
from datetime import datetime
import sys

# File paths: directory for the API response logs and the CSV with the city list
log_directory = 'api_logs'
coordinates_file = 'data/coordinates.csv'

# Generate a unique log file name based on the current timestamp,
# so every run of this cell writes to its own log file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_directory, f'api_responses_{timestamp}.log')

# Ensure the log directory exists
os.makedirs(log_directory, exist_ok=True)

# Read the city names (the 'city' column) from the CSV file
coordinates_df = pd.read_csv(coordinates_file)
cities = coordinates_df['city'].tolist()

# Function to make API call and process data
def get_trip_data(start, end):
    """Request a flying vs. public-transport comparison for one city pair and log it.

    The decoded JSON response is appended to ``log_file`` as an indented JSON
    document followed by a newline. If the response carries a falsy
    ``"success"`` value the script is stopped with ``sys.exit()``; a response
    without that key counts as successful.

    Args:
        start: Location sent as the ``from`` field of the request.
        end: Location sent as the ``to`` field of the request.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://travelco2.com/api/v1/simpletrips"
    payload = {
        "from": start,
        "to": end,
        "ways": 1,
        "people": 1,
        "language": "en",
        "title": f"Comparing flying and public transport from {start} to {end}.",
        "transport_types": ["flying", "public-transport"]
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": "Bearer YOUR_APIKEY"
    }

    # Without a timeout a single stalled connection would block the whole batch.
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    data = response.json()

    # Log the response to a file; the encoding is pinned so the log does not
    # depend on the platform default.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, indent=4))
        f.write('\n')

    # Check if the response was successful (the failed response is already logged)
    if not data.get("success", True):
        print(f"API request failed for {start} to {end}. Stopping script.")
        sys.exit()

# Loop through every unordered pair of cities and make API requests.
# The first skip_count pairs are only counted, not requested, which lets a
# run continue where an earlier one stopped.
processed_pairs = set()
request_counter = 0  # Counter to see the number of requests
skip_count = 361     # Number of requests to skip initially

for i in range(len(cities)):
    for j in range(i + 1, len(cities)):
        # Only handle a pair that has not been seen in either direction
        if (cities[i], cities[j]) not in processed_pairs and (cities[j], cities[i]) not in processed_pairs:
            if request_counter < skip_count:
                # Skipped pairs still advance the counter so numbering matches a full run
                print(f"Skipping API request {request_counter + 1}: {cities[i]} to {cities[j]}")
                request_counter += 1
                processed_pairs.add((cities[i], cities[j]))
                continue

            print(f"Proceeding with API request {request_counter + 1}: {cities[i]} to {cities[j]}")
            get_trip_data(cities[i], cities[j])
            request_counter += 1
            print(f"API request {request_counter} completed: {cities[i]} to {cities[j]}")
            processed_pairs.add((cities[i], cities[j]))

            # Adding a 0.5-second pause between requests
            time.sleep(0.5)
        else:
            print(f"Request from {cities[i]} to {cities[j]} has already been processed.")

print("All API requests have been completed and logs have been stored.")

In [None]:
#version for getting missing/extra pairs of cities
import requests
import os
import json
from datetime import datetime
import sys
import time  # Importing time module to add a pause

# File paths: directory for the API response logs
log_directory = 'api_logs'

# Generate a unique log file name based on the current timestamp,
# so every run of this cell writes to its own log file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(log_directory, f'api_responses_{timestamp}.log')

# Ensure the log directory exists
os.makedirs(log_directory, exist_ok=True)

# Predefined list of (start, end) city pairs for API requests;
# each pair is sent as-is to get_trip_data(start, end) below
city_pairs = [
    ("Luxembourg City", "Vienna"),
    ("Luxembourg City", "Brussels"),
    ("Luxembourg City", "Sofia"),
    ("Luxembourg City", "Zagreb"),
    ("Luxembourg City", "Prague"),
    ("Luxembourg City", "Copenhagen"),
    ("Luxembourg City", "Tallinn"),
    ("Luxembourg City", "Paris"),
    ("Luxembourg City", "Berlin"),
    ("Luxembourg City", "Dresden"),
    ("Luxembourg City", "Munich"),
    ("Luxembourg City", "Budapest"),
    ("Luxembourg City", "Rome"),
    ("Luxembourg City", "Riga"),
    ("Luxembourg City", "Vilnius"),
    ("Luxembourg City", "Amsterdam"),
    ("Luxembourg City", "Oslo"),
    ("Luxembourg City", "Warsaw"),
    ("Luxembourg City", "Lisbon"),
    ("Luxembourg City", "Bucharest"),
    ("Luxembourg City", "Ljubljana"),
    ("Luxembourg City", "Bratislava"),
    ("Luxembourg City", "Stockholm"),
    ("Luxembourg City", "Madrid"),
    ("Luxembourg City", "Bilbao"),
    ("Luxembourg City", "Bern"),
    ("Luxembourg City", "Istanbul"),
    ("Luxembourg City", "London"),
    ("London", "Tallinn"),
    ("Sofia", "Ljubljana"),
    ("Munich", "Prague"),
    ("Amsterdam", "Sofia"),
    ("Dresden", "Frankfurt"),
    ("Frankfurt", "Paris"),
    ("Istanbul", "Plovdiv"),
    ("Plovdiv", "Sofia")
]

# Function to make API call and process data
def get_trip_data(start, end):
    """Request a flying vs. public-transport comparison for one city pair and log it.

    The decoded JSON response is appended to ``log_file`` as an indented JSON
    document followed by a newline. If the response carries a falsy
    ``"success"`` value the script is stopped with ``sys.exit()``; a response
    without that key counts as successful.

    Args:
        start: Location sent as the ``from`` field of the request.
        end: Location sent as the ``to`` field of the request.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://travelco2.com/api/v1/simpletrips"
    payload = {
        "from": start,
        "to": end,
        "ways": 1,
        "people": 1,
        "language": "en",
        "title": f"Comparing flying and public transport from {start} to {end}.",
        "transport_types": ["flying", "public-transport"]
    }
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": "Bearer YOUR_APIKEY"
    }

    # Without a timeout a single stalled connection would block the whole batch.
    response = requests.post(url, json=payload, headers=headers, timeout=60)
    data = response.json()

    # Log the response to a file; the encoding is pinned so the log does not
    # depend on the platform default.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, indent=4))
        f.write('\n')

    # Check if the response was successful (the failed response is already logged)
    if not data.get("success", True):
        print(f"API request failed for {start} to {end}. Stopping script.")
        sys.exit()

# Loop through the predefined city pairs and make one API request per pair
request_counter = 0  # Counter to see the number of requests

for start, end in city_pairs:
    print(f"Proceeding with API request {request_counter + 1}: {start} to {end}")
    get_trip_data(start, end)
    # Only counted once the request has returned
    request_counter += 1
    print(f"API request {request_counter} completed: {start} to {end}")

    # Adding a 0.5-second pause between requests
    time.sleep(0.5)

print("All API requests have been completed and logs have been stored.")

In [None]:
import json
import pandas as pd
import os
import re

# File paths
log_directory = 'api_logs'
# log_file = 'api_logs/api_responses_extra.log'
csv_file = 'data/trips_data_1508_edit1.csv'
# os.listdir returns names in arbitrary order; sorting makes the order in
# which the logs are processed (and so the order of the CSV rows) reproducible.
log_files = [os.path.join(log_directory, f) for f in sorted(os.listdir(log_directory)) if f.endswith('.log')]

# Function to extract data from JSON text
def extract_data_from_text(text):
    """Extract the city pair, train duration and CO2 figures from one logged response.

    The city names are taken from the ``"title"`` field; ``trips[0]`` is
    treated as the flight and ``trips[1]`` as the train trip. The train
    duration is the sum of the ``transport.duration`` values (seconds) of the
    train trip's steps.

    Args:
        text: One JSON document from a log file, as text.

    Returns:
        ``(city1, city2, duration_train, co2_train, co2_plane)`` with the
        duration as an ``h:mm`` string and the CO2 values rounded to two
        decimals. If anything is missing or malformed, a message is printed
        and five ``None`` values are returned.
    """
    try:
        title_match = re.search(r'"title": "Comparing flying and public transport from (.+?) to (.+?)\."', text)
        if not title_match:
            raise ValueError("Title format does not match")
        city1 = title_match.group(1)
        city2 = title_match.group(2).rstrip(".")

        json_data = json.loads(text)
        trips = json_data.get("trips", [])

        if not trips:
            raise ValueError("No trip data found")

        co2_plane = round(trips[0]["co2e"], 2)
        co2_train = round(trips[1]["co2e"], 2)

        # Steps without a transport entry or without a duration do not count
        duration_seconds = sum(
            step["transport"]["duration"]
            for step in trips[1]["steps"]
            if step.get("transport") and step["transport"].get("duration")
        )

        # Converting duration to h:mm format
        hours, remainder = divmod(duration_seconds, 3600)
        minutes = remainder // 60
        duration_train = f"{int(hours)}:{int(minutes):02d}"

        return city1, city2, duration_train, co2_train, co2_plane
    except (IndexError, KeyError, TypeError, AttributeError, ValueError) as e:
        # KeyError/TypeError cover trips that lack "co2e"/"steps" or hold null
        # values; one bad entry must not abort the processing of all logs.
        print(f"Error extracting data from text: {e}")
        return None, None, None, None, None

# Check if the CSV file exists and load it, otherwise initialize an empty DataFrame
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
else:
    df = pd.DataFrame(columns=["City_1", "City_2", "Duration_train", "Train_CO2_kg", "Plane_CO2_kg"])

# Rows are collected in a list and added in one step: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame every time.
new_rows = []

# Process each log file
for log_file in log_files:
    with open(log_file, 'r') as f:
        content = f.read()

    # Consecutive JSON documents are separated by '}\n{'; splitting there
    # removes the braces, so they are put back to get valid JSON strings.
    entries = content.split('}\n{')
    entries = [entry + '}' if not entry.endswith('}') else entry for entry in entries]
    entries = ['{' + entry if not entry.startswith('{') else entry for entry in entries]

    for entry in entries:
        if not entry.strip():  # Ensure it's not an empty line
            continue
        city1, city2, duration_train, co2_train, co2_plane = extract_data_from_text(entry)
        if city1 and city2:
            new_rows.append({
                "City_1": city1,
                "City_2": city2,
                "Duration_train": duration_train,
                "Train_CO2_kg": co2_train,
                "Plane_CO2_kg": co2_plane,
            })

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# Save DataFrame to CSV
df.to_csv(csv_file, index=False)

print("Data has been extracted and saved successfully.")

In [None]:
#one log file, when no train data

import json
import pandas as pd
import os
import re

# File paths: the single log file to read and the CSV the rows are added to
log_file = 'api_logs/api_responses_extra.log'
csv_file = 'data/trips_data_1508_edit1.csv'

# Function to extract data from JSON text
def extract_data_from_text(text):
    """Extract the city pair and whatever trip figures one logged response holds.

    The city names are taken from the ``"title"`` field; ``trips[0]`` is
    treated as the flight and ``trips[1]`` as the train trip. Train values
    are only filled in when a second trip with steps exists.

    Args:
        text: One JSON document from the log file, as text.

    Returns:
        ``(city1, city2, duration_train, co2_train, co2_plane)``; the duration
        is an ``h:mm`` string, CO2 values are rounded to two decimals, and
        figures that are not available are ``None``. If the text cannot be
        processed at all, a message is printed and five ``None`` values are
        returned.
    """
    try:
        title_match = re.search(r'"title": "Comparing flying and public transport from (.+?) to (.+?)\."', text)
        if not title_match:
            raise ValueError("Title format does not match")
        city1 = title_match.group(1)
        city2 = title_match.group(2).rstrip(".")

        json_data = json.loads(text)
        trips = json_data.get("trips", [])

        co2_plane, co2_train, duration_train = None, None, None

        if trips:
            co2_plane = round(trips[0]["co2e"], 2)

            if len(trips) > 1 and trips[1].get("steps"):
                co2_train = round(trips[1]["co2e"], 2)
                # Steps without a transport entry or without a duration do not count
                duration_seconds = sum(
                    step["transport"]["duration"] for step in trips[1]["steps"]
                    if step.get("transport") and step["transport"].get("duration")
                )

                # Convert duration to h:mm format
                hours, remainder = divmod(duration_seconds, 3600)
                minutes = remainder // 60
                duration_train = f"{int(hours)}:{int(minutes):02d}"

        return city1, city2, duration_train, co2_train, co2_plane
    except (IndexError, KeyError, TypeError, AttributeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; KeyError/TypeError cover trips
        # that lack "co2e" or hold null values, so one bad entry cannot abort the run.
        print(f"Error extracting data from text: {e}")
        return None, None, None, None, None
        
# Check if the CSV file exists and load it, otherwise initialize an empty DataFrame
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
else:
    df = pd.DataFrame(columns=["City_1", "City_2", "Duration_train", "Train_CO2_kg", "Plane_CO2_kg"])

# Process the log file
with open(log_file, 'r') as f:
    content = f.read()

# Consecutive JSON documents are separated by '}\n{'; splitting there removes
# the braces, so they are put back to get valid JSON strings.
entries = content.split('}\n{')
entries = [entry + '}' if not entry.endswith('}') else entry for entry in entries]
entries = ['{' + entry if not entry.startswith('{') else entry for entry in entries]

# Rows are collected in a list and added in one step: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame every time.
new_rows = []
for entry in entries:
    if not entry.strip():  # Ensure it's not an empty line
        continue
    city1, city2, duration_train, co2_train, co2_plane = extract_data_from_text(entry)
    if city1 and city2:
        new_rows.append({
            "City_1": city1,
            "City_2": city2,
            "Duration_train": duration_train,
            "Train_CO2_kg": co2_train,
            "Plane_CO2_kg": co2_plane,
        })

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# Save DataFrame to CSV
df.to_csv(csv_file, index=False)

print("Data has been extracted and saved successfully.")

In [None]:
#for one log file, CO2 emissions from plane only

import json
import pandas as pd
import os
import re

# File paths: the single log file to read and the CSV the rows are added to
log_file = 'api_logs/misc/found.txt'
csv_file = 'data/trips_data_1508_edit1.csv'

# Function to extract Plane_CO2_kg from JSON text
def extract_co2_from_text(text):
    """Pull the city pair and the plane CO2 value out of one logged response.

    The cities come from the ``"title"`` field and the CO2 value from the
    first trip (``0`` when that trip has no ``"co2e"`` entry), rounded to two
    decimals.

    Args:
        text: One JSON document from the log file, as text.

    Returns:
        ``(city1, city2, co2_plane)``, or three ``None`` values (after
        printing a message) when the title does not match, the text is not
        valid JSON or no trips are present.
    """
    title_pattern = r'"title": "Comparing flying and public transport from (.+?) to (.+?)\."'
    try:
        found = re.search(title_pattern, text)
        if found is None:
            raise ValueError("Title format does not match")
        origin = found.group(1)
        destination = found.group(2).rstrip(".")

        trip_list = json.loads(text).get("trips", [])
        if not trip_list:
            raise ValueError("No trip data found")

        # The plane is the first trip in the JSON structure
        plane_co2 = round(trip_list[0].get("co2e", 0), 2)
    except (json.JSONDecodeError, IndexError, ValueError) as err:
        print(f"Error extracting data from text: {err}")
        return None, None, None

    return origin, destination, plane_co2

# Load existing CSV file, or start with an empty table that has the same columns
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
else:
    df = pd.DataFrame(columns=[
        "City_1",
        "City_2",
        "Duration_train",
        "Train_CO2_kg",
        "Plane_CO2_kg"
    ])

# Process the log file
with open(log_file, 'r') as f:
    content = f.read()

# Consecutive JSON documents are separated by '}\n{'; splitting there removes
# the braces, so they are put back to get valid JSON strings.
entries = content.split('}\n{')
entries = [entry + '}' if not entry.endswith('}') else entry for entry in entries]
entries = ['{' + entry if not entry.startswith('{') else entry for entry in entries]

# Rows are collected in a list and added in one step: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame every time.
new_rows = []
for entry in entries:
    if not entry.strip():  # Ensure it's not an empty line
        continue
    city1, city2, co2_plane = extract_co2_from_text(entry)
    if city1 and city2:
        # Train columns stay empty; only the plane value is known here
        new_rows.append({
            "City_1": city1,
            "City_2": city2,
            "Duration_train": "",
            "Train_CO2_kg": "",
            "Plane_CO2_kg": co2_plane
        })

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# Save DataFrame to CSV
df.to_csv(csv_file, index=False)

print("Data has been extracted and appended to the CSV file successfully.")

In [None]:
#calculate missing values for train routes based on existing data
import pandas as pd
import os

# File path of the trips CSV that is read, completed and written back by this cell
csv_file = 'data/trips_data_1508_edit1.csv'

# Function to convert duration in h:mm format to total minutes
def duration_to_minutes(duration_str):
    """Turn an ``h:mm`` duration string into a total number of minutes.

    Missing values (anything ``pd.isna`` reports, or the empty string) give
    ``None``.
    """
    is_missing = pd.isna(duration_str) or duration_str == ""
    if is_missing:
        return None
    hour_part, minute_part = (int(piece) for piece in duration_str.split(':'))
    return 60 * hour_part + minute_part

# Function to convert total minutes to h:mm format
def minutes_to_duration(minutes):
    """Format a number of minutes as an ``h:mm`` string (``""`` for ``None``)."""
    if minutes is None:
        return ""
    whole_hours, leftover = divmod(minutes, 60)
    return f"{whole_hours}:{leftover:02d}"

# The CSV must already exist; without it there is nothing to complete
if not os.path.exists(csv_file):
    raise FileNotFoundError(f"The file {csv_file} does not exist.")
df = pd.read_csv(csv_file)

# Define the calculation logic for routes.
# Keys are "City_1,City_2" strings (the same form calculate_missing_values
# builds from a row); values list the legs whose train duration and CO2 are
# added up to get the values of the whole route.
routes_map = {
    "Luxembourg City,Tallinn": [("Luxembourg City", "Berlin"), ("Berlin", "Tallinn")],
    "Dresden,Paris": [("Dresden", "Frankfurt"), ("Frankfurt", "Paris")],
    "Istanbul,Sofia": [("Istanbul", "Plovdiv"), ("Plovdiv", "Sofia")],
    "Bern,Lisbon": [("Bern", "Madrid"), ("Madrid", "Lisbon")],
    "Bern,Tallinn": [("Bern", "Vilnius"), ("Vilnius", "Tallinn")],
    "Bilbao,Oslo": [("Bilbao", "Brussels"), ("Brussels", "Oslo")],
    "Bilbao,Riga": [("Bilbao", "Vilnius"), ("Vilnius", "Riga")],
    "Bilbao,Rome": [("Bilbao", "Paris"), ("Paris", "Rome")],
    "Bilbao,Stockholm": [("Bilbao", "Paris"), ("Paris", "Stockholm")],
    "Bilbao,Tallinn": [("Bilbao", "Vilnius"), ("Vilnius", "Tallinn")],
    "Brussels,Riga": [("Brussels", "Warsaw"), ("Warsaw", "Vilnius"), ("Vilnius", "Riga")],
    "Dresden,Istanbul": [("Dresden", "Prague"), ("Prague", "Istanbul")],
    "Istanbul,Lisbon": [("Istanbul", "Madrid"), ("Madrid", "Lisbon")],
    "Istanbul,Riga": [("Istanbul", "Bucharest"), ("Bucharest", "Riga")],
    "Istanbul,Tallinn": [("Istanbul", "Bucharest"), ("Bucharest", "Tallinn")],
    "Istanbul,Vilnius": [("Istanbul", "Bucharest"), ("Bucharest", "Vilnius")],
    "Istanbul,Warsaw": [("Istanbul", "Bucharest"), ("Bucharest", "Warsaw")],
    "Lisbon,Oslo": [("Lisbon", "Madrid"), ("Madrid", "Oslo")],
    "London,Vilnius": [("London", "Berlin"), ("Berlin", "Vilnius")],
    "London,Warsaw": [("London", "Berlin"), ("Berlin", "Warsaw")],
    "Oslo,Sofia": [("Oslo", "Berlin"), ("Berlin", "Sofia")],
    "Rome,Sofia": [("Rome", "Bucharest"), ("Bucharest", "Sofia")],
    "Tallinn,Warsaw": [("Tallinn", "Vilnius"), ("Vilnius", "Warsaw")],
    "Tallinn,Vienna": [("Tallinn", "Vilnius"), ("Vilnius", "Warsaw"), ("Warsaw", "Vienna")],
    "Tallinn,Zagreb": [("Tallinn", "Vilnius"), ("Vilnius", "Warsaw"), ("Warsaw", "Zagreb")]
}

# Function to calculate missing values
def calculate_missing_values(row):
    """Fill in the train duration and CO2 of a composite route from its legs.

    Looks the route up in ``routes_map`` and adds up ``Duration_train`` and
    ``Train_CO2_kg`` of every leg found in ``df`` (a leg may be stored in
    either direction). The totals are written into ``df`` at the row's own
    index label. Routes that are not in ``routes_map`` are left untouched, and
    so are routes with a leg that is missing or has no train data -- a partial
    sum would look like a complete value.

    Args:
        row: A row of ``df`` as produced by ``df.iterrows()``; its ``name``
            is the index label of the row to update.
    """
    route = f"{row['City_1']},{row['City_2']}"
    if route not in routes_map:
        return

    total_duration_minutes = 0
    total_co2_kg = 0.0
    for city1, city2 in routes_map[route]:
        sub_route = df[((df['City_1'] == city1) & (df['City_2'] == city2)) |
                       ((df['City_1'] == city2) & (df['City_2'] == city1))]
        if sub_route.empty:
            print(f"Cannot calculate {route}: no data for leg {city1} - {city2}.")
            return

        duration = duration_to_minutes(sub_route.iloc[0]['Duration_train'])
        co2_kg = sub_route.iloc[0]['Train_CO2_kg']
        # Missing CSV values are NaN, not None, so test them with pd.isna
        if duration is None or pd.isna(co2_kg):
            print(f"Cannot calculate {route}: incomplete train data for leg {city1} - {city2}.")
            return

        total_duration_minutes += duration
        total_co2_kg += co2_kg

    # Update the DataFrame; row.name is the index label of this row, so the
    # function does not rely on a loop variable of the calling code.
    df.at[row.name, 'Duration_train'] = minutes_to_duration(total_duration_minutes)
    df.at[row.name, 'Train_CO2_kg'] = round(total_co2_kg, 2)

# Apply the function to every row whose train duration or train CO2 is missing
# (NaN or an empty string)
for idx, row in df.iterrows():
    if pd.isna(row['Duration_train']) or row['Duration_train'] == "" or pd.isna(row['Train_CO2_kg']) or row['Train_CO2_kg'] == "":
        calculate_missing_values(row)

# Save the updated DataFrame to CSV (overwrites the file that was loaded)
df.to_csv(csv_file, index=False)

print("Missing values have been calculated and the CSV file has been updated successfully.")

## Get flight duration from [AeroDataBox API](https://doc.aerodatabox.com/index.html?urls.primaryName=API.Market)

In [None]:
#add airport codes
import pandas as pd

# File path of the trips CSV that is read and written back by this cell
csv_file = 'data/trips_data.csv'

# Dictionary mapping cities to their airport codes
# (one code per city; the values are used for the AIR_1 / AIR_2 columns)
airport_codes = {
    "Amsterdam": "AMS",
    "Berlin": "BER",
    "Bern": "BRN",
    "Bilbao": "BIO",
    "Bratislava": "BTS",
    "Brussels": "BRU",
    "Bucharest": "OTP",
    "Budapest": "BUD",
    "Copenhagen": "CPH",
    "Dresden": "DRS",
    "Istanbul": "IST",
    "Lisbon": "LIS",
    "Ljubljana": "LJU",
    "London": "LHR",
    "Luxembourg City": "LUX",
    "Madrid": "MAD",
    "Munich": "MUC",
    "Oslo": "OSL",
    "Paris": "CDG",
    "Prague": "PRG",
    "Riga": "RIX",
    "Rome": "FCO",
    "Sofia": "SOF",
    "Stockholm": "ARN",
    "Tallinn": "TLL",
    "Vienna": "VIE",
    "Vilnius": "VNO",
    "Warsaw": "WAW",
    "Zagreb": "ZAG"
}

# Load existing CSV file
df = pd.read_csv(csv_file)

# Function to get airport code for a city
def get_airport_code(city):
    """Return the airport code stored for *city*, or ``""`` if the city is not in ``airport_codes``."""
    if city in airport_codes:
        return airport_codes[city]
    return ""

# Create the AIR_1 and AIR_2 columns from the airport code of each city
# (cities without a known code get an empty string)
df['AIR_1'] = df['City_1'].apply(get_airport_code)
df['AIR_2'] = df['City_2'].apply(get_airport_code)

# Reorder the columns to insert AIR_1 and AIR_2 after City_2.
# Selecting by this list requires every listed column (including 'ID' and
# 'Duration_plane') to exist in the CSV already; columns not listed are dropped.
columns_order = ['ID', 'City_1', 'City_2', 'AIR_1', 'AIR_2', 'Duration_train', 'Train_CO2_kg', 'Plane_CO2_kg','Duration_plane']
df = df[columns_order]

# Save the updated DataFrame to CSV (overwrites the file that was loaded)
df.to_csv(csv_file, index=False)

print("AIR_1 and AIR_2 columns have been added and the CSV file has been updated successfully.")

In [None]:
#Add flight duration data from API
import pandas as pd
import requests
import os
import datetime
import time

# Define the API key (placeholder; sent as the 'x-magicapi-key' header by get_flight_duration)
API_KEY = 'Your API key' #https://api.market/store/aedbx/aerodatabox

# Create the flights_API directory if it doesn't exist
log_dir = 'flights_API'
os.makedirs(log_dir, exist_ok=True)

# Define the path for the log file of this run (one timestamped file per run)
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
merged_log_file = os.path.join(log_dir, f'API_log_{timestamp}.txt')

# Load the CSV file
file_path = 'data/trips_data.csv'
df = pd.read_csv(file_path)

# Function to fetch flight duration from the API
def get_flight_duration(airport_from, airport_to):
    """Fetch the approximate flight time between two airports from the AeroDataBox API.

    The raw response is printed and appended to ``merged_log_file`` together
    with a timestamp, the airport pair and the request URL.

    Args:
        airport_from: Code of the departure airport, inserted into the URL.
        airport_to: Code of the arrival airport, inserted into the URL.

    Returns:
        The first five characters of the ``approxFlightTime`` value
        (``"hh:mm"`` of an ``"hh:mm:ss"`` string), or ``None`` when the
        response is not JSON or carries no such value.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f'https://api.magicapi.dev/api/v1/aedbx/aerodatabox/airports/Iata/{airport_from}/distance-time/{airport_to}?flightTimeModel=ML01'
    headers = {
        'accept': 'application/json',
        'x-magicapi-key': API_KEY
    }

    # Without a timeout a single stalled connection would block the whole batch.
    response = requests.get(url, headers=headers, timeout=60)

    # Print the raw response for debugging
    print(f'Response from {url}:\n{response.text}')

    # Log the response with a timestamp. This happens before decoding so that
    # responses that are not valid JSON end up in the log as well.
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    with open(merged_log_file, 'a', encoding='utf-8') as log:
        log.write(f'Time: {timestamp}\n')
        log.write(f'From: {airport_from} To: {airport_to}\n')
        log.write(f'URL: {url}\n')
        log.write(f'Response: {response.text}\n')
        log.write('\n')

    try:
        response_data = response.json()
    except ValueError as e:
        # The JSON decode errors raised by Response.json() are ValueError
        # subclasses; requests.exceptions.JSONDecodeError itself does not
        # exist in older requests releases.
        print(f"Error decoding JSON: {e}")
        return None

    # Only a JSON object can carry the flight time
    if not isinstance(response_data, dict):
        return None

    # Extract the flight duration in "hh:mm" format
    approx_flight_time = response_data.get('approxFlightTime', None)

    if approx_flight_time:
        # Convert "hh:mm:ss" to "hh:mm"
        hh_mm = approx_flight_time[:5]
        return hh_mm

    return None

# Apply the API call to every row, with a 1-second interval between requests;
# rows for which no duration could be determined get None
for index, row in df.iterrows():
    df.at[index, 'Duration_plane'] = get_flight_duration(row['AIR_1'], row['AIR_2'])
    time.sleep(1) 

# Save the updated CSV file (overwrites the file that was loaded)
df.to_csv(file_path, index=False)

print("Flight durations have been added to the CSV file.")

In [None]:
import pandas as pd
import os
import re

# Define the directory and log file paths (here a fixed, merged log file
# instead of a timestamped one)
log_dir = 'flights_API'
merged_log_file = os.path.join(log_dir, 'API_log_merged.txt')

# Load the CSV file
file_path = 'data/trips_data.csv'
df = pd.read_csv(file_path)

# Function to parse the log file and extract flight durations
def parse_log_file():
    """Read ``merged_log_file`` and collect the flight time of every logged airport pair.

    Returns:
        A dict mapping ``(airport_from, airport_to)`` to an ``"hh:mm"``
        string. Entries whose response holds no ``approxFlightTime`` are left
        out; when a pair is logged more than once the last entry wins.
    """
    entry_pattern = r'Time: (.*?)\nFrom: (.*?) To: (.*?)\nURL: .*?\nResponse: (.*?)\n'
    duration_pattern = r'"approxFlightTime":"(\d{2}:\d{2}:\d{2})"'

    with open(merged_log_file, 'r', encoding='utf-8') as log:
        log_content = log.read()

    flight_durations = {}
    # Each match is one log entry: (timestamp, from, to, response)
    for _logged_at, airport_from, airport_to, response in re.findall(entry_pattern, log_content, re.DOTALL):
        found = re.search(duration_pattern, response)
        if found is None:
            continue
        # Keep "hh:mm" of the "hh:mm:ss" value
        flight_durations[(airport_from, airport_to)] = found.group(1)[:5]

    return flight_durations

# Parse the log file to get the flight durations
flight_durations = parse_log_file()

# Update the DataFrame with the parsed durations; rows whose airport pair
# does not occur in the log keep their current Duration_plane value
for index, row in df.iterrows():
    airport_pair = (row['AIR_1'], row['AIR_2'])
    if airport_pair in flight_durations:
        df.at[index, 'Duration_plane'] = flight_durations[airport_pair]

# Save the updated CSV file (overwrites the file that was loaded)
df.to_csv(file_path, index=False)

print("Flight durations have been updated in the CSV file from the log.")

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Trips table that this cell reads and later overwrites
csv_file = 'data/trips_data.csv'

# Load the current contents of the trips table
df = pd.read_csv(csv_file)

# Function to add hours to a time string
def add_hours_to_duration(duration_str, hours_to_add):
    """Add a number of hours to an ``"HH:MM"`` duration.

    The value is treated as a length of time, not a clock time, so totals of
    24 hours or more are reported as such (``"22:30"`` + 3 -> ``"25:30"``)
    instead of wrapping around midnight.

    Args:
        duration_str: Duration formatted as ``"HH:MM"``. Missing values
            (``None`` / NaN, as pandas yields for empty cells) and malformed
            strings are reported and produce an empty string.
        hours_to_add: Hours to add; anything ``timedelta(hours=...)`` accepts.

    Returns:
        The new duration as a zero-padded ``"HH:MM"`` string, or ``""`` when
        the input cannot be processed.
    """
    try:
        # Empty CSV cells arrive as float NaN; strptime would raise TypeError
        # for them, which the ValueError handler below would not catch.
        if not isinstance(duration_str, str):
            raise ValueError("duration is missing or not a string")

        # strptime is used only to validate and split the "HH:MM" text
        parsed = datetime.strptime(duration_str, '%H:%M')

        total = timedelta(hours=parsed.hour, minutes=parsed.minute) + timedelta(hours=hours_to_add)
        total_minutes = int(total.total_seconds() // 60)
        if total_minutes < 0:
            raise ValueError("resulting duration is negative")

        hours, minutes = divmod(total_minutes, 60)
        return f"{hours:02d}:{minutes:02d}"
    except ValueError as e:
        print(f"Error processing duration '{duration_str}': {e}")
        return ""

# Derive Duration_plane_total by passing every Duration_plane value through
# add_hours_to_duration with a fixed 3 extra hours
plane_durations = df['Duration_plane']
df['Duration_plane_total'] = plane_durations.apply(add_hours_to_duration, args=(3,))

# Overwrite the CSV with the extended table (row index omitted)
df.to_csv(csv_file, index=False)

print("Duration_plane_total column has been added and the CSV file has been updated successfully.")

## Creating train routes (line and point GeoJSON files) from the coordinates of the intermediate stops in the TravelCO2 API request logs

In [None]:
#version for extracting intermediate stops data
import json
import pandas as pd
import os
import re

# Input: log of raw API responses that this cell parses.
# Output: CSV of extracted stops; loaded first if it already exists, then rewritten.
log_file = 'api_logs/api_responses_extra.log'
csv_file = 'data/plovdiv_intermediate_stops_data.csv'

# Function to extract intermediate stop data from JSON text
def extract_stops_from_text(text):
    """Read the route endpoints and stop list from one logged API response.

    Args:
        text: JSON text of a single response. Its title must look like
            "Comparing flying and public transport from <A> to <B>.".

    Returns:
        ``(city1, city2, stops)`` where ``stops`` is a list of dicts with the
        keys "placename", "latitude" and "longitude" (empty string when a key
        is absent), taken from the steps of the second trip that carry a
        "location". On any handled problem the error is printed and
        ``(None, None, [])`` is returned.
    """
    title_pattern = r'"title": "Comparing flying and public transport from (.+?) to (.+?)\."'
    try:
        title = re.search(title_pattern, text)
        if title is None:
            raise ValueError("Title format does not match")
        origin, destination = title.group(1), title.group(2).rstrip(".")

        trips = json.loads(text).get("trips", [])

        # Only the second trip (index 1) is read for stops
        if len(trips) < 2:
            raise ValueError("Not enough trips data")

        stops = [
            {
                "placename": step["location"].get("placename", ""),
                "latitude": step["location"].get("latitude", ""),
                "longitude": step["location"].get("longitude", ""),
            }
            for step in trips[1].get("steps", [])
            if "location" in step
        ]

        return origin, destination, stops
    except (IndexError, AttributeError, ValueError, json.JSONDecodeError) as e:
        print(f"Error extracting stops data from text: {e}")
        return None, None, []

# Start from the existing CSV when there is one, otherwise from an empty table
if os.path.exists(csv_file):
    df = pd.read_csv(csv_file)
else:
    df = pd.DataFrame(columns=["City_1", "City_2"])

# Read the whole log; it holds JSON objects written one after another
with open(log_file, 'r') as f:
    content = f.read()

# Splitting on '}\n{' removes the closing brace of every piece except the last
# and the opening brace of every piece except the first, so restore them by
# position. (Guessing from the piece's first/last character breaks entries
# that end in a nested '}' and the final entry of a file ending in a newline.)
pieces = content.strip().split('}\n{')
last_piece = len(pieces) - 1
entries = [
    ('{' if position > 0 else '') + piece + ('}' if position < last_piece else '')
    for position, piece in enumerate(pieces)
]

# Collect one row per successfully parsed entry
new_rows = []
for entry in entries:
    if not entry.strip():  # nothing to parse (e.g. empty log)
        continue
    city1, city2, stops = extract_stops_from_text(entry)
    if city1 and city2:
        row_data = {"City_1": city1, "City_2": city2}

        for i, stop in enumerate(stops, start=1):
            row_data[f"Stop_{i}"] = stop["placename"]
            row_data[f"Stop_{i}_lat"] = stop["latitude"]
            row_data[f"Stop_{i}_lon"] = stop["longitude"]

        new_rows.append(row_data)

# DataFrame.append was removed in pandas 2.0; add all rows in one step instead
if new_rows:
    new_df = pd.DataFrame(new_rows)
    if df.empty:
        # Keep any columns the (empty) existing table already declares
        df = new_df.reindex(columns=df.columns.union(new_df.columns, sort=False))
    else:
        df = pd.concat([df, new_df], ignore_index=True)

# Save DataFrame to CSV
df.to_csv(csv_file, index=False)

print("Intermediate stops data has been extracted and saved successfully.")

In [None]:
#version for extracting intermediate stops data from multiple log files based on existing csv file
import pandas as pd
import json
import re
import os

# Inputs: the trips table and the directory whose *.log files are scanned for stops.
# Output: a separate CSV, so the input trips table is not overwritten by this cell.
trips_csv_file = 'data/trips_data.csv'
logs_directory = 'api_logs'
output_csv_file = 'data/trips_data_with_stops.csv'

# Load the trips data CSV file
trips_df = pd.read_csv(trips_csv_file)

# Function to extract stop data from JSON text
def extract_stops_from_text(text):
    """Read the route endpoints and stop coordinates from one logged API response.

    Args:
        text: JSON text of a single response. Its title must look like
            "Comparing flying and public transport from <A> to <B>.".

    Returns:
        ``(city1, city2, stops)``. ``stops`` is a list of
        ``(placename, latitude, longitude)`` tuples with coordinates rounded
        to 7 decimals, taken from the steps of the second trip. Steps without
        a location, or whose location lacks a latitude or longitude, are
        skipped. ``stops`` is ``["MANUAL"]`` when no usable stop is found.
        On a handled error the message is printed and
        ``(None, None, ["MANUAL"])`` is returned.
    """
    try:
        # Match the route based on title
        title_match = re.search(r'"title": "Comparing flying and public transport from (.+?) to (.+?)\."', text)
        if not title_match:
            raise ValueError("Title format does not match")
        city1 = title_match.group(1)
        city2 = title_match.group(2).rstrip(".")

        json_data = json.loads(text)
        trips = json_data.get("trips", [])

        # Only the second trip (index 1) is read for stops
        if len(trips) < 2 or not trips[1].get("steps"):
            return city1, city2, ["MANUAL"]

        stops = []
        for step in trips[1]["steps"]:
            if "location" not in step:
                continue
            location = step["location"]
            latitude = location.get("latitude")
            longitude = location.get("longitude")
            # Check for missing coordinates BEFORE rounding: round(None, 7)
            # raises TypeError, which used to abort the whole run.
            if latitude is None or longitude is None:
                continue
            stops.append((
                location.get("placename", "Unknown"),
                round(latitude, 7),
                round(longitude, 7),
            ))

        if not stops:
            stops = ["MANUAL"]

        return city1, city2, stops
    except (IndexError, AttributeError, TypeError, ValueError) as e:
        # TypeError covers non-numeric coordinates; json.JSONDecodeError is a ValueError
        print(f"Error extracting stop data from text: {e}")
        return None, None, ["MANUAL"]

# (City_1, City_2) -> list of stops as returned by extract_stops_from_text
stops_data = {}

# Process each log file in the logs directory. Sorted so that, when a route
# occurs in several files, which entry wins does not depend on the
# arbitrary order os.listdir happens to return.
for log_filename in sorted(os.listdir(logs_directory)):
    if not log_filename.endswith('.log'):
        continue
    log_file_path = os.path.join(logs_directory, log_filename)

    with open(log_file_path, 'r') as f:
        content = f.read()

    # Splitting on '}\n{' removes the closing brace of every piece except the
    # last and the opening brace of every piece except the first, so restore
    # them by position. (Guessing from the piece's first/last character breaks
    # entries that end in a nested '}' and the final entry of a file that
    # ends in a newline.)
    pieces = content.strip().split('}\n{')
    last_piece = len(pieces) - 1
    entries = [
        ('{' if position > 0 else '') + piece + ('}' if position < last_piece else '')
        for position, piece in enumerate(pieces)
    ]

    # Extract stop data from each JSON entry; a later entry for the same
    # route replaces an earlier one
    for entry in entries:
        if not entry.strip():  # nothing to parse (e.g. empty log file)
            continue
        city1, city2, stops = extract_stops_from_text(entry)
        if city1 and city2:
            stops_data[(city1, city2)] = stops

# Function to merge stop data into the trips DataFrame
def merge_stops_data(row):
    """Attach the stops recorded for a trip's city pair to its row.

    Looks up ``(row['City_1'], row['City_2'])`` in the module-level
    ``stops_data``; a route without an entry is treated as ``["MANUAL"]``.
    Tuple stops are written to ``<n>_stop``, ``<n>_stop_lat`` and
    ``<n>_stop_lon`` (n counted from 1); any non-tuple item is stored in
    ``1_stop``. The row is modified in place and returned.
    """
    route_stops = stops_data.get((row['City_1'], row['City_2']), ["MANUAL"])

    for stop_index, stop in enumerate(route_stops, start=1):
        if not isinstance(stop, tuple):
            row['1_stop'] = stop  # "MANUAL"
            continue
        row[f'{stop_index}_stop'] = stop[0]
        row[f'{stop_index}_stop_lat'] = stop[1]
        row[f'{stop_index}_stop_lon'] = stop[2]

    return row

# Apply the merge function to the DataFrame
trips_df = trips_df.apply(merge_stops_data, axis=1)

# Stop-name columns ("1_stop", "2_stop", ...) in numeric order
stop_name_columns = sorted(
    (col for col in trips_df.columns if re.match(r'^\d+_stop$', col)),
    key=lambda col: int(col.split('_')[0]),
)

# For each stop: name, latitude, longitude. The coordinate columns are looked
# up by name and included only when present, so a table in which every route
# is "MANUAL" (a "1_stop" column but no coordinates) no longer raises
# IndexError.
stop_columns_ordered = [
    column
    for name_col in stop_name_columns
    for column in (name_col, f'{name_col}_lat', f'{name_col}_lon')
    if column in trips_df.columns
]

# Final column order
ordered_columns = ['ID', 'City_1', 'City_2', 'Duration_train', 'Train_CO2_kg', 'Plane_CO2_kg'] + stop_columns_ordered

# Reorder DataFrame columns (columns not listed above are dropped)
trips_df = trips_df[ordered_columns]

# Save the updated DataFrame to a new CSV file
trips_df.to_csv(output_csv_file, index=False)

print(f"Data has been merged and saved to {output_csv_file}.")

In [None]:
#coordinates to lines in GeoJSON
import pandas as pd
import json
import os

# File paths
csv_file = 'data/intermediate_stops_data.csv'
output_dir = 'geojson_files'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load the CSV file
df = pd.read_csv(csv_file)


def _stop_ids(columns):
    """Return the stop numbers that have both a Stop_<n>_lat and a Stop_<n>_lon column, in numeric order."""
    ids = []
    for column in columns:
        if column.startswith("Stop_") and column.endswith("_lat"):
            stop_id = column[len("Stop_"):-len("_lat")]
            if stop_id.isdigit() and f"Stop_{stop_id}_lon" in columns:
                ids.append(stop_id)
    return sorted(ids, key=int)


# Take the stop numbers from the CSV header instead of assuming exactly 13:
# fewer columns used to raise KeyError, more were silently ignored.
stop_ids = _stop_ids(df.columns)

# One GeoJSON file per route
for _, row in df.iterrows():
    city1 = row["City_1"]
    city2 = row["City_2"]

    # GeoJSON positions are [longitude, latitude]; skip stops with a missing coordinate
    coordinates = [
        [row[f"Stop_{stop_id}_lon"], row[f"Stop_{stop_id}_lat"]]
        for stop_id in stop_ids
        if pd.notna(row[f"Stop_{stop_id}_lat"]) and pd.notna(row[f"Stop_{stop_id}_lon"])
    ]

    # A LineString needs at least two positions
    if len(coordinates) <= 1:
        print(f"Skipping {city1} to {city2}: Not enough valid coordinates")
        continue

    # Create GeoJSON structure
    geojson_data = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "geometry": {
                    "type": "LineString",
                    "coordinates": coordinates
                },
                "properties": {
                    "City_1": city1,
                    "City_2": city2
                }
            }
        ]
    }

    # Generate filename
    filename = f"{city1.replace(' ', '_')}_{city2.replace(' ', '_')}.geojson"
    filepath = os.path.join(output_dir, filename)

    # Save to GeoJSON file
    with open(filepath, 'w') as f:
        json.dump(geojson_data, f, indent=4)

    print(f"GeoJSON file saved: {filepath}")

print("GeoJSON files generated successfully.")

In [None]:
#lines+points in GeoJSON
import pandas as pd
import json
import os

# File paths
csv_file = 'data/trips_data_with_stops_final.csv'
output_dir = 'geojson_files'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Load the CSV file (semicolon-separated)
df = pd.read_csv(csv_file, sep=';')


def _stop_ids(columns):
    """Return the stop numbers that have both a <n>_stop_lat and a <n>_stop_lon column, in numeric order."""
    suffix = "_stop_lat"
    ids = []
    for column in columns:
        if column.endswith(suffix):
            stop_id = column[:-len(suffix)]
            if stop_id.isdigit() and f"{stop_id}_stop_lon" in columns:
                ids.append(stop_id)
    return sorted(ids, key=int)


# Take the stop numbers from the CSV header instead of assuming exactly 25:
# fewer columns used to raise KeyError, more were silently ignored.
stop_ids = _stop_ids(df.columns)

# One GeoJSON file per route, holding the line followed by its stop points
for _, row in df.iterrows():
    city1 = row["City_1"]
    city2 = row["City_2"]

    # Positions for the LineString and one Point feature per stop
    line_coordinates = []
    point_features = []

    for stop_id in stop_ids:
        latitude = row[f"{stop_id}_stop_lat"]
        longitude = row[f"{stop_id}_stop_lon"]
        if pd.isna(latitude) or pd.isna(longitude):
            continue

        # A missing name would be written as the token NaN, which is not
        # valid JSON; store null instead.
        stop_name = row.get(f"{stop_id}_stop")
        if pd.isna(stop_name):
            stop_name = None

        # GeoJSON positions are [longitude, latitude]
        line_coordinates.append([longitude, latitude])
        point_features.append({
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [longitude, latitude]
            },
            "properties": {
                "Start": city1,
                "End": city2,
                "stop_name": stop_name,
                "latitude": latitude,
                "longitude": longitude
            }
        })

    # A LineString needs at least two positions
    if len(line_coordinates) <= 1:
        print(f"Skipping {city1} to {city2}: Not enough valid coordinates")
        continue

    line_feature = {
        "type": "Feature",
        "geometry": {
            "type": "LineString",
            "coordinates": line_coordinates
        },
        "properties": {
            "Start": city1,
            "End": city2
        }
    }
    geojson_data = {
        "type": "FeatureCollection",
        "features": [line_feature] + point_features
    }

    # Generate filename
    filename = f"{city1.replace(' ', '_')}_{city2.replace(' ', '_')}.geojson"
    filepath = os.path.join(output_dir, filename)

    # Save to GeoJSON file
    with open(filepath, 'w') as f:
        json.dump(geojson_data, f, indent=4)

    print(f"GeoJSON file saved: {filepath}")

print("GeoJSON files generated successfully.")

In [None]:
#lines and points separately
import pandas as pd
import json
import os

# File paths
csv_file = 'data/trips_data_with_stops_final.csv'
lines_dir = 'geojson_files/lines'
points_dir = 'geojson_files/points'

# Ensure output directories exist
os.makedirs(lines_dir, exist_ok=True)
os.makedirs(points_dir, exist_ok=True)

# Load the CSV file (semicolon-separated)
df = pd.read_csv(csv_file, sep=';')


def _stop_ids(columns):
    """Return the stop numbers that have both a <n>_stop_lat and a <n>_stop_lon column, in numeric order."""
    suffix = "_stop_lat"
    ids = []
    for column in columns:
        if column.endswith(suffix):
            stop_id = column[:-len(suffix)]
            if stop_id.isdigit() and f"{stop_id}_stop_lon" in columns:
                ids.append(stop_id)
    return sorted(ids, key=int)


# Take the stop numbers from the CSV header instead of assuming exactly 25:
# fewer columns used to raise KeyError, more were silently ignored.
stop_ids = _stop_ids(df.columns)

# Per route: one file with the line, one file with the stop points
for _, row in df.iterrows():
    city1 = row["City_1"]
    city2 = row["City_2"]

    # Positions for the LineString and one Point feature per stop
    line_coordinates = []
    point_features = []

    for stop_id in stop_ids:
        latitude = row[f"{stop_id}_stop_lat"]
        longitude = row[f"{stop_id}_stop_lon"]
        if pd.isna(latitude) or pd.isna(longitude):
            continue

        # A missing name would be written as the token NaN, which is not
        # valid JSON; store null instead.
        stop_name = row.get(f"{stop_id}_stop")
        if pd.isna(stop_name):
            stop_name = None

        # GeoJSON positions are [longitude, latitude]
        line_coordinates.append([longitude, latitude])
        point_features.append({
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [longitude, latitude]
            },
            "properties": {
                "Start": city1,
                "End": city2,
                "stop_name": stop_name,
                "latitude": latitude,
                "longitude": longitude
            }
        })

    # The same file name is used in both output directories
    base_filename = f"{city1.replace(' ', '_')}_{city2.replace(' ', '_')}.geojson"

    # Create LineString GeoJSON if there are enough coordinates
    if len(line_coordinates) > 1:
        line_geojson = {
            "type": "FeatureCollection",
            "features": [
                {
                    "type": "Feature",
                    "geometry": {
                        "type": "LineString",
                        "coordinates": line_coordinates
                    },
                    "properties": {
                        "Start": city1,
                        "End": city2
                    }
                }
            ]
        }
        line_filepath = os.path.join(lines_dir, base_filename)
        with open(line_filepath, 'w') as f:
            json.dump(line_geojson, f, indent=4)
        print(f"LineString GeoJSON saved: {line_filepath}")

    # Create Points GeoJSON if there are valid points
    if point_features:
        points_geojson = {
            "type": "FeatureCollection",
            "features": point_features
        }
        points_filepath = os.path.join(points_dir, base_filename)
        with open(points_filepath, 'w') as f:
            json.dump(points_geojson, f, indent=4)
        print(f"Points GeoJSON saved: {points_filepath}")

print("GeoJSON files generated successfully.")

In [None]:
#filter transfer points that are too close to each other (<5km)
import os
import json
from geopy.distance import geodesic

# Input: directory of per-route point GeoJSON files.
# Output: directory that receives the filtered copy of each file (same file names).
input_dir = 'geojson_files/points'
output_dir = 'geojson_files/points_filtered'

# Create the output directory on first run
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Function to calculate distance between two coordinates
def calculate_distance(coord1, coord2):
    """Return the geodesic distance between two coordinates in kilometres.

    Both arguments are passed to geopy's ``geodesic`` unchanged.
    NOTE(review): geopy documents point tuples as (latitude, longitude);
    GeoJSON positions are [longitude, latitude] - confirm callers pass the
    order geopy expects.
    """
    separation = geodesic(coord1, coord2)
    return separation.kilometers

# Function to filter points within 5km of start and end
def filter_points(geojson_data):
    """Drop intermediate points that lie within 5 km of the route's start or end.

    The first feature is taken as the start and the last feature as the end.
    Both are always kept; every feature in between is kept only when it is
    more than 5 km away from both of them.

    Args:
        geojson_data: dict with a "features" list of Point features whose
            coordinates are GeoJSON positions ([longitude, latitude]).

    Returns:
        A new FeatureCollection dict with the kept features in their original
        order. ``geojson_data`` itself is returned when it has no features.
    """
    features = geojson_data["features"]

    if not features:
        return geojson_data

    def lat_lon(feature):
        # geodesic expects (latitude, longitude); GeoJSON stores [longitude, latitude]
        longitude, latitude = feature["geometry"]["coordinates"][:2]
        return (latitude, longitude)

    start_point = lat_lon(features[0])
    end_point = lat_lon(features[-1])
    last_index = len(features) - 1

    filtered_features = []
    for position, feature in enumerate(features):
        # Endpoints are kept unconditionally. Previously the end point was
        # only flagged as seen and never appended to the result.
        if position == 0 or position == last_index:
            filtered_features.append(feature)
            continue

        point = lat_lon(feature)
        if calculate_distance(point, start_point) > 5 and calculate_distance(point, end_point) > 5:
            filtered_features.append(feature)

    return {
        "type": "FeatureCollection",
        "features": filtered_features
    }

# Run filter_points over every .geojson file in input_dir and write the
# result under the same file name in output_dir
for filename in os.listdir(input_dir):
    if not filename.endswith('.geojson'):
        continue

    # Read the unfiltered points
    input_filepath = os.path.join(input_dir, filename)
    with open(input_filepath, 'r') as f:
        geojson_data = json.load(f)

    # Filter, then store the filtered collection
    filtered_geojson = filter_points(geojson_data)
    output_filepath = os.path.join(output_dir, filename)
    with open(output_filepath, 'w') as f:
        json.dump(filtered_geojson, f, indent=4)

    print(f"Filtered GeoJSON file saved: {output_filepath}")

print("Filtered GeoJSON files generated successfully.")

In [None]:
import os
import json
from geopy.distance import geodesic

def filter_points(input_dir='geojson_files/points', output_dir='geojson_files/points_filtered', buffer_km=5):
    """Filter every point GeoJSON file in ``input_dir`` into ``output_dir``.

    Within each file the first feature is the start and the last feature the
    end. Features farther than ``buffer_km`` from both are kept as they are.
    Among the remaining ones, the first feature found near the start is
    replaced by the start feature and the first one found near the end by the
    end feature; all others are dropped. Files with an empty feature list
    produce no output file.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if not filename.endswith('.geojson'):
            continue

        with open(os.path.join(input_dir, filename), 'r') as f:
            data = json.load(f)

        if not data['features']:
            continue

        first_feature = data['features'][0]
        last_feature = data['features'][-1]

        # Reversed because the stored positions are [lon, lat] and they are
        # handed to geodesic as (lat, lon)
        start_lat_lon = first_feature['geometry']['coordinates'][::-1]
        end_lat_lon = last_feature['geometry']['coordinates'][::-1]

        kept = []
        start_kept = False
        end_kept = False

        for feature in data['features']:
            here = feature['geometry']['coordinates'][::-1]
            from_start = geodesic(start_lat_lon, here).km
            from_end = geodesic(end_lat_lon, here).km

            if from_start > buffer_km and from_end > buffer_km:
                kept.append(feature)
            elif from_start <= buffer_km and not start_kept:
                kept.append(first_feature)  # the start itself stands in for its neighbourhood
                start_kept = True
            elif from_end <= buffer_km and not end_kept:
                kept.append(last_feature)  # likewise for the end
                end_kept = True

        data['features'] = kept

        output_path = os.path.join(output_dir, filename)
        with open(output_path, 'w') as f:
            json.dump(data, f, indent=4)

        print(f"Filtered GeoJSON file saved: {output_path}")

filter_points()

In [None]:
import os
import json
from geopy.distance import geodesic

def filter_points(input_dir='geojson_files/points', output_dir='geojson_files/points_filtered_stopnum', buffer_km=5):
    """Filter each route's points and number the ones that remain.

    For every ``.geojson`` file in ``input_dir`` the first feature is treated
    as the start and the last as the end. Both endpoints are always kept;
    any feature in between is kept only when it is more than ``buffer_km``
    from both. Kept features receive a consecutive ``stop_number`` property
    (starting at 1) and the result is written to ``output_dir`` under the
    same file name. Files without features are skipped.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".geojson"):
            continue

        filepath = os.path.join(input_dir, filename)
        with open(filepath, 'r') as f:
            geojson_data = json.load(f)

        features = geojson_data["features"]
        if not features:
            # Indexing features[0] below would raise IndexError
            print(f"Skipping {filename}: no features")
            continue

        # Get the coordinates for start and end points
        start_point = features[0]["geometry"]["coordinates"]
        end_point = features[-1]["geometry"]["coordinates"]
        last_index = len(features) - 1

        filtered_features = []

        for i, feature in enumerate(features):
            coordinates = feature["geometry"]["coordinates"]

            # Positions are [lon, lat]; geodesic takes (lat, lon), hence the reversal
            dist_to_start = geodesic(start_point[::-1], coordinates[::-1]).km
            dist_to_end = geodesic(end_point[::-1], coordinates[::-1]).km

            # Endpoints are kept unconditionally. The previous combined
            # condition dropped both of them whenever start and end were
            # within buffer_km of each other.
            is_endpoint = i == 0 or i == last_index
            if is_endpoint or (dist_to_start > buffer_km and dist_to_end > buffer_km):
                feature["properties"]["stop_number"] = len(filtered_features) + 1
                filtered_features.append(feature)

        # Update the GeoJSON structure
        geojson_data["features"] = filtered_features

        # Save the filtered GeoJSON file
        output_filepath = os.path.join(output_dir, filename)
        with open(output_filepath, 'w') as f:
            json.dump(geojson_data, f, indent=4)

        print(f"Filtered GeoJSON saved: {output_filepath}")

filter_points()

In [None]:
import os
import json
from geopy.distance import geodesic

# Function to remove redundant points within 5 km for all stops along the route
def filter_redundant_points_full_route(input_filepath, output_filepath, buffer_km=5):
    """Thin out consecutive points that lie close together along a route.

    Walks the Point features of the input file in order and keeps a point
    only when it is more than ``buffer_km`` away from the previously kept
    point (the first point is always kept). Non-Point features are not
    carried over. The result is written to ``output_filepath``.

    Args:
        input_filepath: GeoJSON file to read.
        output_filepath: GeoJSON file to write.
        buffer_km: Minimum distance in kilometres between two kept points.
    """
    with open(input_filepath, 'r') as f:
        geojson_data = json.load(f)

    filtered_features = []

    # (latitude, longitude) of the most recently kept point
    last_point = None

    for feature in geojson_data["features"]:
        if feature["geometry"]["type"] != "Point":
            continue

        # GeoJSON stores [longitude, latitude] while geodesic expects
        # (latitude, longitude); passing the position unchanged measured
        # distances between the wrong places.
        longitude, latitude = feature["geometry"]["coordinates"][:2]
        current_point = (latitude, longitude)

        if last_point is None or geodesic(current_point, last_point).km > buffer_km:
            filtered_features.append(feature)
            last_point = current_point

    # Update the GeoJSON data with filtered features
    geojson_data["features"] = filtered_features

    # Save the filtered points to a new GeoJSON file
    with open(output_filepath, 'w') as f:
        json.dump(geojson_data, f, indent=4)

# Source directory of point files and destination for the thinned-out copies
input_dir = 'geojson_files/points/'
output_dir = 'geojson_files/points_new/'

# Create the destination on first run
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Filter every .geojson file, writing the result under the same file name
for filename in os.listdir(input_dir):
    if not filename.endswith(".geojson"):
        continue

    input_filepath = os.path.join(input_dir, filename)
    output_filepath = os.path.join(output_dir, filename)

    filter_redundant_points_full_route(input_filepath, output_filepath)
    print(f"Processed: {filename}")

print("All files processed and saved successfully.")

## Alternative data source - [EcoPassenger](https://ecopassenger.hafas.de/bin/query.exe/en?L=vs_uic) parser (not used in the end)

In [None]:
import requests
from bs4 import BeautifulSoup

url = "https://ecopassenger.hafas.de/bin/query.exe/en?ld=uic-eco&L=vs_uic&protocol=https:&seqnr=1&ident=nt.0241101.1719352628&REQ0HafasScrollDir=1&ecocon=C1-0"

# A timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status stops here on an HTTP error instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()

#Parse HTML using Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')

#Find table row with CO2 data
rows = soup.find_all('tr')
co2_row = None  # stays None when no row matches, instead of leaving the name undefined
for row in rows:
    if row.find('td', class_='sepline nowrap') and 'Carbon dioxide' in row.get_text():
        co2_row = row
        break

if co2_row is None:
    print("CO2 data row not found")
else:
    #Extract CO2 values for train and plane (2nd and 6th cell of the row)
    cells = co2_row.find_all('td')
    train_co2 = cells[1].text.strip()
    plane_co2 = cells[5].text.strip()

    # Print the results
    print(f"Train CO2: {train_co2}")
    print(f"Plane CO2: {plane_co2}")


In [None]:
def parse_first_file(file_path):
    """Collect the city pairs listed after "completed:" in a request log.

    Only lines containing "completed:" are considered. The text following the
    first occurrence of that marker must split into exactly two parts on
    " to "; otherwise the line is ignored.

    Returns:
        set of ``(city_from, city_to)`` tuples with surrounding whitespace removed.
    """
    marker = "completed:"
    pairs = set()
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            if marker not in raw_line:
                continue
            route_text = raw_line.split(marker)[1].strip()
            endpoints = [name.strip() for name in route_text.split(" to ")]
            if len(endpoints) != 2:
                continue
            pairs.add(tuple(endpoints))
    return pairs

def parse_second_file(file_path):
    """Collect the first two fields of every row of a comma-separated file.

    The file is read with the ``csv`` module, so quoted fields that contain
    commas (e.g. ``"Frankfurt, Main"``) stay in one piece instead of being
    cut apart by a plain ``split(",")``. Rows with fewer than two fields are
    ignored. No header handling is done: a header row, if present, is
    returned as a pair like any other row.

    Returns:
        set of ``(first_field, second_field)`` tuples with surrounding whitespace removed.
    """
    import csv  # local import: this cell has no import section of its own

    city_pairs = set()
    # newline='' is what the csv module expects for files it reads itself
    with open(file_path, 'r', newline='') as file:
        for fields in csv.reader(file):
            if len(fields) >= 2:
                city_pairs.add((fields[0].strip(), fields[1].strip()))
    return city_pairs

def find_missing_pairs(first_file_path, second_file_path):
    """Return the city pairs found in the first file but not in the second.

    The first file is read with ``parse_first_file`` and the second with
    ``parse_second_file``; the result is the set difference of the two.
    """
    requested = parse_first_file(first_file_path)
    recorded = parse_second_file(second_file_path)
    return requested - recorded

# Files to compare: the list of completed API requests and the trips table
first_file_path = 'api_logs/misc/list_api_req.txt'
second_file_path = 'data/trips_data.csv'

missing_pairs = find_missing_pairs(first_file_path, second_file_path)

# Report every pair that is in the request list but not in the trips table
for city_from, city_to in missing_pairs:
    print(f"Missing pair: {city_from} to {city_to}")

In [None]:
pip install selenium

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

#Set up the WebDriver using ChromeDriverManager
chrome_options = Options()
chrome_options.add_argument("--lang=en-UK")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

#Input values
from_location = 'PARIS NORD (France)'
to_location = 'BERLIN (Germany)'
travel_date = '01.07.24'
travel_time = '06:00'

try:
    #Navigate to the EcoPassenger search page
    driver.get('https://ecopassenger.hafas.de/bin/query.exe/en?L=vs_uic&')

    #Fill in the search form: (element id, value to type)
    form_values = [
        ('from', from_location),
        ('to', to_location),
        ('date', travel_date),
        ('time', travel_time),
    ]
    for element_id, value in form_values:
        field = driver.find_element(By.ID, element_id)
        field.clear()
        field.send_keys(value)

    #Click the 'Start request' button
    start_button = driver.find_element(By.CSS_SELECTOR, "button[name='application=ECOLOGYINFO&start']")
    start_button.click()

    #Wait (up to 10 s) for the results page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".hafasEcology"))
    )

    #Pause for manual selection of the fastest train option
    input("Press Enter to continue after you do your thing...")

    #Get the page source and parse travel time and emissions for train and plane
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    #Collect the value of every 'Duration' cell in the summary table
    durations = []
    summary_table = soup.find('table', class_='result')
    if summary_table:
        duration_rows = summary_table.find_all('td', class_='sepline borderright top')
        for duration_row in duration_rows:
            label = duration_row.find('div', class_='lc_th')
            if label and 'Duration' in label.text:
                duration = duration_row.text.split()[-1].strip()
                durations.append(duration)

    #The first value is used for the train and the third for the plane; the
    #middle one is ignored. Any other count means the page did not have the
    #expected layout, so report it instead of failing on the unpacking.
    duration_train = duration_plane = None
    if len(durations) == 3:
        duration_train, _, duration_plane = durations
    else:
        print(f"Expected 3 duration values, found {len(durations)}")

    rows = soup.find_all('tr')
    co2_row = None
    for row in rows:
        if row.find('td', class_='sepline nowrap') and 'Carbon dioxide' in row.get_text():
            co2_row = row
            break

    #Default to None so the DataFrame below can still be built when the row is missing
    train_co2 = plane_co2 = None
    if co2_row:
        co2_cells = co2_row.find_all('td')
        train_co2 = co2_cells[1].text.strip()
        plane_co2 = co2_cells[5].text.strip()
    else:
        print("CO2 data row not found")

    #Create a DataFrame for the results
    data = {
        "Start": [from_location],
        "End": [to_location],
        "Duration_train": [duration_train],
        "Train_CO2_kg": [train_co2],
        "Duration_plane": [duration_plane],
        "Plane_CO2_kg": [plane_co2],
        "Date": [travel_date]
    }
    df = pd.DataFrame(data)
    print(df)
    #df.to_csv('train_plane_emissions_durations.csv', index=False)

finally:
    #Always close the browser, even when a step above fails
    driver.quit()