# Creating a parser function to manipulate gpx files

Our objecive is to create a function that can be fed *gpx* files and will compose a dataframe containing the most useful information of said route, as well as changing the name of the *gpx* file to one more easily accessed.

In [40]:
#Importing our libraries.

import pandas as pd
import time
import re
import pathlib
import os
import gpxpy
import gpxpy.gpx
import time
from pathlib import Path
import os
import random, string
from geopy.geocoders import Nominatim
from pprint import pprint

In [2]:
#Importing our list of names.

names = pd.read_csv('names.csv')
alpha = names['names'].tolist()

In [24]:
#Our parser function.

def parser(file, name):
    try:
        gpx_file = open(file, 'r') 
        gpx = gpxpy.parse(gpx_file) 
        coords = []
        for track in gpx.tracks:
            for segment in track.segments:        
                for point in segment.points:
                    coords.append(tuple([point.latitude, point.longitude]))
        parsed_file = {'name': name,
                       'original_name': track.name,
                        'source': gpx.creator,
                        'start': str(tuple([coords[0][0], coords[0][1]])),
                        'distance': track.length_3d()/1000, 
                        'climb': int(gpx.get_uphill_downhill()[0]),
                        'min_alt': int(gpx.get_elevation_extremes()[0]),
                        'max_alt': int(gpx.get_elevation_extremes()[1])}
        with open('generated_gpx/' + name + ".gpx", "w") as f:
            f.write(gpx.to_xml())
        return parsed_file
    except:
        gpx_file = open(file, 'r', encoding='utf-8') 
        gpx = gpxpy.parse(gpx_file) 
        coords = []
        for track in gpx.tracks:
            for segment in track.segments:        
                for point in segment.points:
                    coords.append(tuple([point.latitude, point.longitude]))
        parsed_file = {'name': name,
                       'original_name': track.name,
                        'source': gpx.creator,
                        'start': str(tuple([coords[0][0], coords[0][1]])),
                        'distance': track.length_3d()/1000, 
                        'climb': int(gpx.get_uphill_downhill()[0]),
                        'min_alt': int(gpx.get_elevation_extremes()[0]),
                        'max_alt': int(gpx.get_elevation_extremes()[1])}
        with open('generated_gpx/' + name + ".gpx", "w") as f:
            f.write(gpx.to_xml())
        return parsed_file

## Name generation

In [51]:
#Generating a list of names using alphanumeric characters.

alpha = []

for i in range(50000000):
    x = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(6))
    alpha.append(x)

In [121]:
#Dropping duplicate names.

alpha = set(alpha)

In [122]:
len(alpha)

49430049

In [61]:
#Storing our names as a dataframe for future use.

names = pd.DataFrame(set(alpha), columns =['names'])

In [63]:
#Storing it.

names.to_csv('names.csv', index=False)

## Final function

In [27]:
def gpx_creator():
    """
    Input: none, but all target gpx files must be in a folder named 'gpx'.
    
    Output: new gpx files created as per 'parser' function.
    
    """
    start = time.time() #Starting our fimer.

    directory = 'gpx' #The folder containing the gpx files.
    parsed_list = []
    
    files = Path(directory).glob('*') #Using all files in the folder as input.
    i = 0
    for file in files:
        name = alpha[i]
        parsed_list.append(parser(file, name)) #Applying the previous function to every file.
        i = i+1

    stop = time.time() #Stopping our timer.
    duration = (stop - start) / 60
    
    df = pd.DataFrame(parsed_list)
    return df #Returning the elapsed minutes.

In [17]:
start = time.time()

df = gpx_creator()

stop = time.time() #Stopping our timer.
duration = (stop - start) / 60

print('Minutes:', duration)

Minutes: 0.09244021972020468


In [18]:
df.head()

Unnamed: 0,name,original_name,source,start,distance,climb,min_alt,max_alt
0,54yu2w,besaide martxa 2011,Garmin Connect,"(43.0696788802743, -2.47713318094611)",[35.65579643643311],[1298],[193],[631]
1,00v5fe,Untitled,Garmin Connect,"(37.1465449780226, -3.56366956606507)",[28.30995863439158],[433],[680],[1054]
2,f4r9n2,Carretera costa Asterrika Baurdo,Garmin Connect,"(43.3455239608884, -2.48506112024188)",[34.621333051583576],[869],[6],[226]
3,lu6s5f,Circuito AlbuÃ±uelas,Garmin Connect,"(37.1134440042078, -3.61588572151959)",[71.29276829383188],[946],[503],[874]
4,rgxaz9,prova,Garmin Connect,"(41.6769059747458, 1.2808879930526)",[53.19353112278639],[677],[485],[767]


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           10000 non-null  object
 1   original_name  10000 non-null  object
 2   source         10000 non-null  object
 3   start          10000 non-null  object
 4   distance       10000 non-null  object
 5   climb          10000 non-null  object
 6   min_alt        10000 non-null  object
 7   max_alt        10000 non-null  object
dtypes: object(8)
memory usage: 625.1+ KB


## TESTING

In [35]:
#Our parser function.

def parser(file, name):
    try:
        gpx_file = open(file, 'r') 
        gpx = gpxpy.parse(gpx_file) 
        coords = []
        for track in gpx.tracks:
            for segment in track.segments:        
                for point in segment.points:
                    coords.append(tuple([point.latitude, point.longitude]))
        parsed_file = {'name': name,
                       'original_name': track.name,
                        'source': gpx.creator,
                        'start': str(tuple([coords[0][0], coords[0][1]])),
                        'distance': track.length_3d()/1000, 
                        'climb': int(gpx.get_uphill_downhill()[0]),
                        'min_alt': int(gpx.get_elevation_extremes()[0]),
                        'max_alt': int(gpx.get_elevation_extremes()[1])}
        with open('generated_gpx/' + name + ".gpx", "w") as f:
            f.write(gpx.to_xml())
        return parsed_file
    except:
        gpx_file = open(file, 'r', encoding='utf-8') 
        gpx = gpxpy.parse(gpx_file) 
        coords = []
        for track in gpx.tracks:
            for segment in track.segments:        
                for point in segment.points:
                    coords.append(tuple([point.latitude, point.longitude]))
        parsed_file = {'name': name,
                       'original_name': track.name,
                        'source': gpx.creator,
                        'start': str(tuple([coords[0][0], coords[0][1]])),
                        'distance': track.length_3d()/1000, 
                        'climb': int(gpx.get_uphill_downhill()[0]),
                        'min_alt': int(gpx.get_elevation_extremes()[0]),
                        'max_alt': int(gpx.get_elevation_extremes()[1])}
        with open('generated_gpx/' + name + ".gpx", "w") as f:
            f.write(gpx.to_xml())
        return parsed_file

In [36]:
start = time.time()

df = gpx_creator()

stop = time.time() #Stopping our timer.
duration = (stop - start) / 60

print('Minutes:', duration)

Minutes: 0.08963996569315592


In [37]:
df.head()

Unnamed: 0,name,original_name,source,start,distance,climb,min_alt,max_alt
0,54yu2w,besaide martxa 2011,Garmin Connect,"(43.0696788802743, -2.47713318094611)",35.655796,1298,193,631
1,00v5fe,Untitled,Garmin Connect,"(37.1465449780226, -3.56366956606507)",28.309959,433,680,1054
2,f4r9n2,Carretera costa Asterrika Baurdo,Garmin Connect,"(43.3455239608884, -2.48506112024188)",34.621333,869,6,226
3,lu6s5f,Circuito AlbuÃ±uelas,Garmin Connect,"(37.1134440042078, -3.61588572151959)",71.292768,946,503,874
4,rgxaz9,prova,Garmin Connect,"(41.6769059747458, 1.2808879930526)",53.193531,677,485,767


## Assigning country to each route

For this purpose we'll be using *Nominatim*.

In [39]:
#Initializing an instance.

app = Nominatim(user_agent="tutorial")

In [45]:
#Testing a function I found online:

def get_address_by_location(latitude, longitude, language="en"):
    """This function returns an address as raw from a location
    will repeat until success"""
    # build coordinates string to pass to reverse() function
    coordinates = f"{latitude}, {longitude}"
    # sleep for a second to respect Usage Policy
    time.sleep(1)
    try:
        return app.reverse(coordinates, language=language).raw
    except:
        return get_address_by_location(latitude, longitude)

In [46]:
#Testing the function.

address = get_address_by_location(41.6769059747458, 1.2808879930526)

In [91]:
#Accessing the country.

address['address']['country']

'Spain'

In [93]:
#Incorporating the function into our parser:

#Our parser function.

def parser(file, name):
    try:
        gpx_file = open(file, 'r') 
        gpx = gpxpy.parse(gpx_file) 
        coords = []
        for track in gpx.tracks:
            for segment in track.segments:        
                for point in segment.points:
                    coords.append(tuple([point.latitude, point.longitude]))
        country = get_address_by_location(coords[0][0], coords[0][1])['address']['country']
        parsed_file = {'name': name,
                       'original_name': track.name,
                        'source': gpx.creator,
                        'country': country,
                        'start': str(tuple([coords[0][0], coords[0][1]])),
                        'distance': track.length_3d()/1000, 
                        'climb': int(gpx.get_uphill_downhill()[0]),
                        'min_alt': int(gpx.get_elevation_extremes()[0]),
                        'max_alt': int(gpx.get_elevation_extremes()[1])}
        with open('generated_gpx/' + name + ".gpx", "w") as f:
            f.write(gpx.to_xml())
        return parsed_file
    except:
        gpx_file = open(file, 'r', encoding='utf-8') 
        gpx = gpxpy.parse(gpx_file) 
        coords = []
        for track in gpx.tracks:
            for segment in track.segments:        
                for point in segment.points:
                    coords.append(tuple([point.latitude, point.longitude]))
        country = get_address_by_location(coords[0][0], coords[0][1])['address']['country']
        parsed_file = {'name': name,
                       'original_name': track.name,
                        'source': gpx.creator,
                        'country': country,
                        'start': str(tuple([coords[0][0], coords[0][1]])),
                        'distance': track.length_3d()/1000, 
                        'climb': int(gpx.get_uphill_downhill()[0]),
                        'min_alt': int(gpx.get_elevation_extremes()[0]),
                        'max_alt': int(gpx.get_elevation_extremes()[1])}
        with open('generated_gpx/' + name + ".gpx", "w") as f:
            f.write(gpx.to_xml())
        return parsed_file

In [94]:
#Testing our new parser function:

start = time.time()

df = gpx_creator()

stop = time.time() #Stopping our timer.
duration = (stop - start) / 60

print('Minutes:', duration)

Minutes: 1.9374013145764668


In [97]:
df.head(5)

Unnamed: 0,name,original_name,source,country,start,distance,climb,min_alt,max_alt
0,54yu2w,besaide martxa 2011,Garmin Connect,Spain,"(43.0696788802743, -2.47713318094611)",35.655796,1298,193,631
1,00v5fe,Untitled,Garmin Connect,Spain,"(37.1465449780226, -3.56366956606507)",28.309959,433,680,1054
2,f4r9n2,Carretera costa Asterrika Baurdo,Garmin Connect,Spain,"(43.3455239608884, -2.48506112024188)",34.621333,869,6,226
3,lu6s5f,Circuito AlbuÃ±uelas,Garmin Connect,Spain,"(37.1134440042078, -3.61588572151959)",71.292768,946,503,874
4,rgxaz9,prova,Garmin Connect,Spain,"(41.6769059747458, 1.2808879930526)",53.193531,677,485,767


The function works fine but it takes a very long time to process each file. This is unacceptable, I will have to find a better (faster) way of determining a route's country of origin.