# Creating a parser function to manipulate gpx files

Our objective is to create a function that can be fed *gpx* files and will compose a dataframe containing the most useful information about each route, as well as rename the *gpx* file to something more easily accessed.

In [3]:
#Importing our libraries.

import pandas as pd
import time
import re
import pathlib
import os
import gpxpy
import gpxpy.gpx
import time
from pathlib import Path
import os
import random, string
from geopy.geocoders import Nominatim
import reverse_geocode
import haversine as hs
import requests
from shapely.geometry import mapping, shape
from shapely.prepared import prep
from shapely.geometry import Point

In [2]:
#Importing our list of names.

# 'names.csv' is read into a DataFrame; its 'names' column is then flattened
# into a plain Python list that is indexed by position later on.
names = pd.read_csv('names.csv')
alpha = names.loc[:, 'names'].tolist()

## Keeping only routes inside Spain

For the moment we only need routes that begin inside **Spain**, so it becomes necessary to determine a route's point of origin.

In [4]:
# Download country outlines as GeoJSON and index them by country name, so that
# a point can later be tested against each country's geometry.

# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# JSON/KeyError failure a few lines further down.
response = requests.get(
    "https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson",
    timeout=60,
)
response.raise_for_status()
data = response.json()

# Maps the "ADMIN" property of each feature to a prepared Shapely geometry
# (prepared geometries are meant for repeated predicate checks such as contains()).
countries = {}
for feature in data["features"]:
    geom = feature["geometry"]
    country = feature["properties"]["ADMIN"]
    countries[country] = prep(shape(geom))

# Number of countries indexed.
print(len(countries))

def get_country(lon, lat):
    """Return the name of the first country whose geometry contains the point.

    The point is built as Point(lon, lat) and checked against every entry of
    the module-level ``countries`` mapping, in that mapping's iteration order.

    Returns the matching key of ``countries``, or the string "Unknown" when
    no geometry contains the point.
    """
    location = Point(lon, lat)
    matches = (
        label
        for label, outline in countries.items()
        if outline.contains(location)
    )
    return next(matches, "Unknown")

# Quick sanity check with lon=10.0, lat=47.0; the recorded output below shows 'Austria'.
print(get_country(10.0, 47.0))

255
Austria


# Final function

After much trial and error, additions, and optimization, this is the function I've come up with.

In [None]:
#Parser function using get_country() to determine where a route starts.

def _load_gpx(file):
    """Parse *file* with gpxpy and close the file handle afterwards.

    The platform default encoding is tried first; if the content cannot be
    decoded that way, the file is read again as UTF-8.
    """
    try:
        with open(file, 'r') as gpx_file:
            return gpxpy.parse(gpx_file)
    except UnicodeDecodeError:
        with open(file, 'r', encoding='utf-8') as gpx_file:
            return gpxpy.parse(gpx_file)


def parser(file, name):
    """Summarise one GPX file and, if it starts in Spain, save a renamed copy.

    The start and end coordinates are taken from the first segment of the
    first track; distance, climb and altitude extremes are computed by gpxpy
    over the whole file.

    Args:
        file: Path of the GPX file to read.
        name: New base name; the copy is written to ``Spain/<name>.gpx``.

    Returns:
        A dict with the keys 'name', 'original_name', 'source', 'country',
        'start', 'end', 'distance', 'climb', 'min_alt' and 'max_alt' when
        get_country() reports 'Spain' for the first point; otherwise None.
        'start' and 'end' are "(latitude, longitude)" tuples rendered as
        strings. 'distance' is gpx.length_3d() divided by 1000 (kilometres,
        assuming gpxpy reports metres).

    Raises:
        IndexError: If the file has no track, no segment or no point.
        Any other error raised while reading, parsing or converting the
        values (e.g. int() on a missing elevation) propagates to the caller.
    """
    gpx = _load_gpx(file)
    data = gpx.tracks[0].segments[0].points
    first, last = data[0], data[-1]
    country = get_country(first.longitude, first.latitude)

    # Computed once and indexed twice below.
    extremes = gpx.get_elevation_extremes()

    parsed_file = {
        'name': name,
        'original_name': gpx.name,
        'source': gpx.creator,
        'country': country,
        'start': str(tuple([first.latitude, first.longitude])),
        'end': str(tuple([last.latitude, last.longitude])),
        'distance': gpx.length_3d()/1000,
        'climb': int(gpx.get_uphill_downhill()[0]),
        'min_alt': int(extremes[0]),
        'max_alt': int(extremes[1]),
    }

    # Only routes that begin in Spain are kept.
    if country != 'Spain':
        return None

    out_dir = Path('Spain')
    # Create the output folder on first use instead of failing every write.
    out_dir.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly so non-ASCII text in the route survives on
    # platforms whose default encoding is not UTF-8.
    with open(out_dir / (name + ".gpx"), "w", encoding="utf-8") as f:
        f.write(gpx.to_xml())
    return parsed_file
            
def gpx_creator():
    """Run 'parser' over every file in the 'gpx' folder and tabulate the results.

    Input: none, but all target gpx files must be in a folder named 'gpx',
    and 'df.csv' must exist; its row count decides which entry of the
    module-level 'alpha' name list is used first.

    Output: a DataFrame with one row per route returned by 'parser', plus a
    'circular' column that is True when the haversine distance between the
    route's start and end is below 3 (kilometres, assuming the haversine
    library's default unit). New gpx files are created as per 'parser'
    function.
    """
    # Local imports keep this function self-contained.
    import ast
    import logging

    log = logging.getLogger(__name__)

    existing = pd.read_csv('df.csv')
    directory = 'gpx' #The folder containing the gpx files.
    parsed_list = []

    # Start at the entry of 'alpha' derived from the number of rows in df.csv.
    i = len(existing) + 1
    for file in Path(directory).glob('*'): #Using all files in the folder as input.
        if i >= len(alpha):
            # No names left: every remaining file would fail on alpha[i] anyway.
            log.warning("Ran out of names in 'alpha'; remaining files were skipped.")
            break
        try:
            parsed = parser(file, alpha[i])
        except Exception as exc:
            # Best effort: one unreadable file must not abort the batch, but
            # report which file was skipped. Catching Exception (not a bare
            # except) lets KeyboardInterrupt stop a long run.
            log.warning("Skipping %s: %r", file, exc)
            continue
        # A name is consumed only when the route was actually kept.
        if parsed is not None:
            parsed_list.append(parsed)
            i += 1

    # 'start' and 'end' are stored as "(lat, lon)" strings; literal_eval turns
    # them back into tuples without executing arbitrary code.
    circular = [
        hs.haversine(ast.literal_eval(route['start']),
                     ast.literal_eval(route['end'])) < 3
        for route in parsed_list
    ]

    df = pd.DataFrame(parsed_list)
    # Assign the whole column at once: element-wise df['circular'].iloc[i] = ...
    # is chained assignment and does not reliably write back to df.
    df['circular'] = circular

    return df


# Run the whole pipeline once, bracketed by wall-clock timestamps.
start = time.time()
df_new = gpx_creator()
stop = time.time()

# Elapsed time, converted from seconds to minutes.
duration = (stop - start) / 60
print('Minutes:', duration)

# Save the resulting table without the index column.
df_new.to_csv('df_new.csv', index=False)