In [1]:
# Description

# TODO
# https://stackoverflow.com/questions/58257251/how-to-check-if-a-file-exists-in-another-folder

# Requirements
import os
import geopandas as gpd
import pandas as pd
import pathlib

# Parameters
# All folders are resolved relative to the notebook's current working directory.

# Folder holding the GeoPackage files with the administrative boundaries
data_foldername = 'data'
# Still a string that ends in a path separator (as before), but the separator now
# comes from os.path.join instead of a hard-coded '\\', so the path is also valid
# on platforms where the backslash is not a separator.
data_directory = os.path.join(str(pathlib.Path('.').absolute() / data_foldername), '')

# Folder holding the GPX tracks
gpx_foldername = 'gpxfiles'
gpx_directory = pathlib.Path('.').absolute() / gpx_foldername

#new_gpx_foldername = 'new_gpxfiles'
#gpx_directory = pathlib.Path('.').absolute() / new_gpx_foldername 

#processed_gpx_foldername = 'processed_gpxfiles'
#processed_gpx_directory = pathlib.Path('.').absolute() / processed_gpx_foldername 

# List of GeoPackage files to read.
# Only *.gpkg entries are kept so that stray files in the data folder are never
# handed to the GeoPackage reader; sorting makes the processing order reproducible.
gpkg_files = sorted(
    entry for entry in os.listdir(data_directory)
    if entry.lower().endswith('.gpkg')
)

In [4]:
## Process for admin regions 0 (country-level)
#
# Steps:
#   1. read layer 'ADM_ADM_0' from every GeoPackage and stack the results
#   2. for every GPX file, find the regions intersected by its "Running" tracks
#   3. per region, count the GPX files that touch it and track first/last run date
#   4. write one simplified feature per region to adm_0.geojson

# 1. Read the country layer of each GeoPackage file
gdf_list = []
for gpkg_file in gpkg_files:
    gdf_file = gpd.read_file(os.path.join(data_directory, gpkg_file), layer='ADM_ADM_0')
    gdf_list.append(gdf_file)

# ignore_index=True already yields a clean 0..n-1 index, so no extra reset is needed
gdf_adm0 = pd.concat(gdf_list, ignore_index=True)

# 2 + 3. Intersect every GPX file with the regions and collect the stats
gdf_adminregions_per_gpx = []   # one sjoin result per GPX file
stats_by_admin = {}             # admin_id -> stats record; avoids re-scanning a DataFrame per update

# sorted() makes the processing order (and with it the feature order of the output) reproducible
for filename in sorted(os.listdir(gpx_directory)):
    if not filename.endswith('.gpx'):
        continue

    gpx = gpd.read_file(os.path.join(gpx_directory, filename), layer='tracks')
    # na=False: a track without a name counts as "not a run" instead of producing an
    # NaN entry in the mask; .copy() so the column added below is not written into a
    # slice of `gpx` (SettingWithCopyWarning).
    gdf_gpxline = gpx[gpx['name'].str.contains("Running", na=False)].copy()

    # The first 10 characters of the file name are used as the run date.
    # NOTE(review): assumes file names start with an ISO date (YYYY-MM-DD) so that the
    # plain string comparison below orders dates correctly - confirm.
    run_date = filename[:10]
    gdf_gpxline['date'] = run_date

    # Regions intersected by at least one running track of this file
    gdf_join = gpd.sjoin(left_df=gdf_adm0, right_df=gdf_gpxline, how="inner", predicate="intersects")

    # A region is counted once per GPX file, however many tracks touch it
    for admin_id in gdf_join['GID_0'].unique():
        record = stats_by_admin.get(admin_id)
        if record is None:
            stats_by_admin[admin_id] = {
                'admin_id': admin_id,
                'count': 1,
                'first_date': run_date,
                'last_date': run_date,
            }
        else:
            record['count'] += 1
            record['first_date'] = min(record['first_date'], run_date)
            record['last_date'] = max(record['last_date'], run_date)

    gdf_adminregions_per_gpx.append(gdf_join)

if not gdf_adminregions_per_gpx:
    # pd.concat would fail on an empty list anyway; fail with a message that names the cause
    raise ValueError(f"No .gpx files found in '{gpx_directory}'.")

# Built in one go instead of row by row
admin_region_stats = pd.DataFrame(
    list(stats_by_admin.values()),
    columns=['admin_id', 'count', 'first_date', 'last_date'],
)

# Create gdf with information from all gpx-files
gdf_runs_adm0 = gpd.GeoDataFrame(pd.concat(gdf_adminregions_per_gpx, ignore_index=True))

# Merge stats back into the main GeoDataFrame
gdf_runs_adm0 = gdf_runs_adm0.merge(
    admin_region_stats,
    left_on='GID_0',
    right_on='admin_id',
    how='left'
)

# 4. Select the output columns and lower-case their names
col_list = ['COUNTRY', 'count', 'first_date', 'last_date', 'geometry']
gdf_runs_adm0 = gdf_runs_adm0[col_list]
gdf_runs_adm0.columns = gdf_runs_adm0.columns.str.lower()

# The join yields one row per (region, track) match; keep a single row per region geometry
gdf_runs_adm0 = gdf_runs_adm0.drop_duplicates(["geometry"])

# Simplify geometries with a tolerance (in units of the CRS)
# The higher the tolerance, the more simplified the geometry
tolerance = 0.001  # Adjust as needed
gdf_runs_adm0['geometry'] = gdf_runs_adm0['geometry'].simplify(tolerance)

# Write geojsonfile; an existing file is removed first so the result is always fresh
if os.path.isfile("adm_0.geojson"):
    os.remove("adm_0.geojson")
    print("File Deleted successfully")
else:
    print("File does not exist")

gdf_runs_adm0.to_file("adm_0.geojson", driver='GeoJSON')

# For a quick visual check: gdf_runs_adm0.explore() or gdf_runs_adm0.head()

File Deleted successfully


In [16]:
## Process for admin regions 1 (province-level)
#
# Steps:
#   1. read layer 'ADM_ADM_1' from every GeoPackage, stack them, drop water bodies
#   2. for every GPX file, find the regions intersected by its "Running" tracks
#   3. per region, count the GPX files that touch it and track first/last run date
#   4. write one simplified feature per region to adm_1.geojson

# 1. Read the province layer of each GeoPackage file
gdf_list = []
for gpkg_file in gpkg_files:
    gdf_file = gpd.read_file(os.path.join(data_directory, gpkg_file), layer='ADM_ADM_1')
    gdf_list.append(gdf_file)

gdf_adm1 = pd.concat(gdf_list, ignore_index=True)

# Drop regions typed as water bodies. na=False keeps regions whose ENGTYPE_1 is
# missing and guarantees a boolean mask, so the `~` cannot fail on NaN entries.
gdf_adm1 = gdf_adm1[~gdf_adm1['ENGTYPE_1'].str.contains("Water body", na=False)]

# Reset the index after filtering
gdf_adm1 = gdf_adm1.reset_index(drop=True)

# 2 + 3. Intersect every GPX file with the regions and collect the stats
gdf_adminregions_per_gpx = []   # one sjoin result per GPX file
stats_by_admin = {}             # admin_id -> stats record; avoids re-scanning a DataFrame per update

# sorted() makes the processing order (and with it the feature order of the output) reproducible
for filename in sorted(os.listdir(gpx_directory)):
    if not filename.endswith('.gpx'):
        continue

    gpx = gpd.read_file(os.path.join(gpx_directory, filename), layer='tracks')
    # na=False: a track without a name counts as "not a run" instead of producing an
    # NaN entry in the mask; .copy() so the column added below is not written into a
    # slice of `gpx` (SettingWithCopyWarning).
    gdf_gpxline = gpx[gpx['name'].str.contains("Running", na=False)].copy()

    # The first 10 characters of the file name are used as the run date.
    # NOTE(review): assumes file names start with an ISO date (YYYY-MM-DD) so that the
    # plain string comparison below orders dates correctly - confirm.
    run_date = filename[:10]
    gdf_gpxline['date'] = run_date

    # Regions intersected by at least one running track of this file
    gdf_join = gpd.sjoin(left_df=gdf_adm1, right_df=gdf_gpxline, how="inner", predicate="intersects")

    # A region is counted once per GPX file, however many tracks touch it
    for admin_id in gdf_join['GID_1'].unique():
        record = stats_by_admin.get(admin_id)
        if record is None:
            stats_by_admin[admin_id] = {
                'admin_id': admin_id,
                'count': 1,
                'first_date': run_date,
                'last_date': run_date,
            }
        else:
            record['count'] += 1
            record['first_date'] = min(record['first_date'], run_date)
            record['last_date'] = max(record['last_date'], run_date)

    gdf_adminregions_per_gpx.append(gdf_join)

if not gdf_adminregions_per_gpx:
    # pd.concat would fail on an empty list anyway; fail with a message that names the cause
    raise ValueError(f"No .gpx files found in '{gpx_directory}'.")

# Built in one go instead of row by row
admin_region_stats = pd.DataFrame(
    list(stats_by_admin.values()),
    columns=['admin_id', 'count', 'first_date', 'last_date'],
)

# Create gdf with information from all gpx-files
gdf_runs_adm1 = gpd.GeoDataFrame(pd.concat(gdf_adminregions_per_gpx, ignore_index=True))

# Merge stats back into the main GeoDataFrame
gdf_runs_adm1 = gdf_runs_adm1.merge(
    admin_region_stats,
    left_on='GID_1',
    right_on='admin_id',
    how='left'
)

# 4. Select the output columns, drop the level suffix and lower-case the names
col_list = ['COUNTRY', 'NAME_1', 'TYPE_1', 'count', 'first_date', 'last_date', 'geometry']
gdf_runs_adm1 = gdf_runs_adm1[col_list]
gdf_runs_adm1 = gdf_runs_adm1.rename(columns={"NAME_1": "NAME", "TYPE_1": "TYPE"})
gdf_runs_adm1.columns = gdf_runs_adm1.columns.str.lower()

# The join yields one row per (region, track) match; keep a single row per region geometry
gdf_runs_adm1 = gdf_runs_adm1.drop_duplicates(["geometry"])

# Simplify geometries with a tolerance (in units of the CRS)
# The higher the tolerance, the more simplified the geometry
tolerance = 0.001  # Adjust as needed
gdf_runs_adm1['geometry'] = gdf_runs_adm1['geometry'].simplify(tolerance)

# Write geojsonfile; an existing file is removed first so the result is always fresh
if os.path.isfile("adm_1.geojson"):
    os.remove("adm_1.geojson")
    print("File Deleted successfully")
else:
    print("File does not exist")

gdf_runs_adm1.to_file("adm_1.geojson", driver='GeoJSON')

File Deleted successfully


In [17]:
## Process for admin regions 2 (municipality-level)
#
# Steps:
#   1. read layer 'ADM_ADM_2' from every GeoPackage, stack them, drop water bodies
#   2. for every GPX file, find the regions intersected by its "Running" tracks
#   3. per region, count the GPX files that touch it and track first/last run date
#   4. write one simplified feature per region to adm_2.geojson

# 1. Read the municipality layer of each GeoPackage file
gdf_list = []
for gpkg_file in gpkg_files:
    gdf_file = gpd.read_file(os.path.join(data_directory, gpkg_file), layer='ADM_ADM_2')
    gdf_list.append(gdf_file)

gdf_adm2 = pd.concat(gdf_list, ignore_index=True)

# Drop regions typed as water bodies. na=False keeps regions whose ENGTYPE_2 is
# missing and guarantees a boolean mask, so the `~` cannot fail on NaN entries.
gdf_adm2 = gdf_adm2[~gdf_adm2['ENGTYPE_2'].str.contains("Water body", na=False)]

# Reset the index after filtering
gdf_adm2 = gdf_adm2.reset_index(drop=True)

# 2 + 3. Intersect every GPX file with the regions and collect the stats
gdf_adminregions_per_gpx = []   # one sjoin result per GPX file
stats_by_admin = {}             # admin_id -> stats record; avoids re-scanning a DataFrame per update

# sorted() makes the processing order (and with it the feature order of the output) reproducible
for filename in sorted(os.listdir(gpx_directory)):
    if not filename.endswith('.gpx'):
        continue

    gpx = gpd.read_file(os.path.join(gpx_directory, filename), layer='tracks')
    # na=False: a track without a name counts as "not a run" instead of producing an
    # NaN entry in the mask; .copy() so the column added below is not written into a
    # slice of `gpx` (SettingWithCopyWarning).
    gdf_gpxline = gpx[gpx['name'].str.contains("Running", na=False)].copy()

    # The first 10 characters of the file name are used as the run date.
    # NOTE(review): assumes file names start with an ISO date (YYYY-MM-DD) so that the
    # plain string comparison below orders dates correctly - confirm.
    run_date = filename[:10]
    gdf_gpxline['date'] = run_date

    # Regions intersected by at least one running track of this file
    gdf_join = gpd.sjoin(left_df=gdf_adm2, right_df=gdf_gpxline, how="inner", predicate="intersects")

    # A region is counted once per GPX file, however many tracks touch it
    for admin_id in gdf_join['GID_2'].unique():
        record = stats_by_admin.get(admin_id)
        if record is None:
            stats_by_admin[admin_id] = {
                'admin_id': admin_id,
                'count': 1,
                'first_date': run_date,
                'last_date': run_date,
            }
        else:
            record['count'] += 1
            record['first_date'] = min(record['first_date'], run_date)
            record['last_date'] = max(record['last_date'], run_date)

    gdf_adminregions_per_gpx.append(gdf_join)

if not gdf_adminregions_per_gpx:
    # pd.concat would fail on an empty list anyway; fail with a message that names the cause
    raise ValueError(f"No .gpx files found in '{gpx_directory}'.")

# Built in one go instead of row by row
admin_region_stats = pd.DataFrame(
    list(stats_by_admin.values()),
    columns=['admin_id', 'count', 'first_date', 'last_date'],
)

# Create gdf with information from all gpx-files
gdf_runs_adm2 = gpd.GeoDataFrame(pd.concat(gdf_adminregions_per_gpx, ignore_index=True))

# Merge stats back into the main GeoDataFrame
gdf_runs_adm2 = gdf_runs_adm2.merge(
    admin_region_stats,
    left_on='GID_2',
    right_on='admin_id',
    how='left'
)

# 4. Select the output columns, drop the level suffix and lower-case the names
col_list = ['COUNTRY', 'NAME_2', 'TYPE_2', 'count', 'first_date', 'last_date', 'geometry']
gdf_runs_adm2 = gdf_runs_adm2[col_list]
gdf_runs_adm2 = gdf_runs_adm2.rename(columns={"NAME_2": "NAME", "TYPE_2": "TYPE"})
gdf_runs_adm2.columns = gdf_runs_adm2.columns.str.lower()

# The join yields one row per (region, track) match; keep a single row per region geometry
gdf_runs_adm2 = gdf_runs_adm2.drop_duplicates(["geometry"])

# Simplify geometries with a tolerance (in units of the CRS)
# The higher the tolerance, the more simplified the geometry
tolerance = 0.001  # Adjust as needed
gdf_runs_adm2['geometry'] = gdf_runs_adm2['geometry'].simplify(tolerance)

# Write geojsonfile; an existing file is removed first so the result is always fresh
if os.path.isfile("adm_2.geojson"):
    os.remove("adm_2.geojson")
    print("File Deleted successfully")
else:
    print("File does not exist")

gdf_runs_adm2.to_file("adm_2.geojson", driver='GeoJSON')


File Deleted successfully


In [18]:
## Process for admin regions 3 (district-level)
#
# Steps:
#   1. read layer 'ADM_ADM_3' from every GeoPackage that has it, stack the results,
#      drop water bodies
#   2. for every GPX file, find the regions intersected by its "Running" tracks
#   3. per region, count the GPX files that touch it and track first/last run date
#   4. write one simplified feature per region to adm_3.geojson

# Not every GeoPackage contains this layer, so it is read defensively below
specific_layer = 'ADM_ADM_3'

# 1. Read the district layer of each GeoPackage file that provides it
gdf_list = []
for gpkg_file in gpkg_files:
    try:
        # Attempt to read the specific layer
        gdf_file = gpd.read_file(os.path.join(data_directory, gpkg_file), layer=specific_layer)
        gdf_list.append(gdf_file)
        print(f"Layer '{specific_layer}' read from '{gpkg_file}'.")
    except ValueError as e:
        # The recorded run reports a missing layer as ValueError ("Null layer: ...").
        # NOTE(review): other geopandas I/O engines may raise a different exception
        # type for a missing layer - confirm for the installed version.
        print(f"Layer '{specific_layer}' not found in '{gpkg_file}': {e}")

# Without any layer there is nothing to process. Stop here explicitly instead of
# continuing with an undefined (or stale, from an earlier run) gdf_adm3.
if not gdf_list:
    raise ValueError(f"No GeoPackages contained the layer '{specific_layer}'.")

gdf_adm3 = pd.concat(gdf_list, ignore_index=True)
print("Combined GeoDataFrame created.")

# Drop regions typed as water bodies. na=False keeps regions whose ENGTYPE_3 is
# missing and guarantees a boolean mask, so the `~` cannot fail on NaN entries.
gdf_adm3 = gdf_adm3[~gdf_adm3['ENGTYPE_3'].str.contains("Water body", na=False)]

# Reset the index after filtering
gdf_adm3 = gdf_adm3.reset_index(drop=True)

# 2 + 3. Intersect every GPX file with the regions and collect the stats
gdf_adminregions_per_gpx = []   # one sjoin result per GPX file
stats_by_admin = {}             # admin_id -> stats record; avoids re-scanning a DataFrame per update

# sorted() makes the processing order (and with it the feature order of the output) reproducible
for filename in sorted(os.listdir(gpx_directory)):
    if not filename.endswith('.gpx'):
        continue

    gpx = gpd.read_file(os.path.join(gpx_directory, filename), layer='tracks')
    # na=False: a track without a name counts as "not a run" instead of producing an
    # NaN entry in the mask; .copy() so the column added below is not written into a
    # slice of `gpx` (SettingWithCopyWarning).
    gdf_gpxline = gpx[gpx['name'].str.contains("Running", na=False)].copy()

    # The first 10 characters of the file name are used as the run date.
    # NOTE(review): assumes file names start with an ISO date (YYYY-MM-DD) so that the
    # plain string comparison below orders dates correctly - confirm.
    run_date = filename[:10]
    gdf_gpxline['date'] = run_date

    # Regions intersected by at least one running track of this file
    gdf_join = gpd.sjoin(left_df=gdf_adm3, right_df=gdf_gpxline, how="inner", predicate="intersects")

    # A region is counted once per GPX file, however many tracks touch it
    for admin_id in gdf_join['GID_3'].unique():
        record = stats_by_admin.get(admin_id)
        if record is None:
            stats_by_admin[admin_id] = {
                'admin_id': admin_id,
                'count': 1,
                'first_date': run_date,
                'last_date': run_date,
            }
        else:
            record['count'] += 1
            record['first_date'] = min(record['first_date'], run_date)
            record['last_date'] = max(record['last_date'], run_date)

    gdf_adminregions_per_gpx.append(gdf_join)

if not gdf_adminregions_per_gpx:
    # pd.concat would fail on an empty list anyway; fail with a message that names the cause
    raise ValueError(f"No .gpx files found in '{gpx_directory}'.")

# Built in one go instead of row by row
admin_region_stats = pd.DataFrame(
    list(stats_by_admin.values()),
    columns=['admin_id', 'count', 'first_date', 'last_date'],
)

# Create gdf with information from all gpx-files
gdf_runs_adm3 = gpd.GeoDataFrame(pd.concat(gdf_adminregions_per_gpx, ignore_index=True))

# Merge stats back into the main GeoDataFrame
gdf_runs_adm3 = gdf_runs_adm3.merge(
    admin_region_stats,
    left_on='GID_3',
    right_on='admin_id',
    how='left'
)

# 4. Select the output columns, drop the level suffix and lower-case the names
col_list = ['COUNTRY', 'NAME_3', 'TYPE_3', 'count', 'first_date', 'last_date', 'geometry']
gdf_runs_adm3 = gdf_runs_adm3[col_list]
gdf_runs_adm3 = gdf_runs_adm3.rename(columns={"NAME_3": "NAME", "TYPE_3": "TYPE"})
gdf_runs_adm3.columns = gdf_runs_adm3.columns.str.lower()

# The join yields one row per (region, track) match; keep a single row per region geometry
gdf_runs_adm3 = gdf_runs_adm3.drop_duplicates(["geometry"])

# Simplify geometries with a tolerance (in units of the CRS)
# The higher the tolerance, the more simplified the geometry
tolerance = 0.001  # Adjust as needed
gdf_runs_adm3['geometry'] = gdf_runs_adm3['geometry'].simplify(tolerance)

# Write geojsonfile; an existing file is removed first so the result is always fresh
if os.path.isfile("adm_3.geojson"):
    os.remove("adm_3.geojson")
    print("File Deleted successfully")
else:
    print("File does not exist")

gdf_runs_adm3.to_file("adm_3.geojson", driver='GeoJSON')

Layer 'ADM_ADM_3' read from 'gadm41_AUT.gpkg'.
Layer 'ADM_ADM_3' read from 'gadm41_BEL.gpkg'.
Layer 'ADM_ADM_3' read from 'gadm41_CHE.gpkg'.
Layer 'ADM_ADM_3' not found in 'gadm41_CZE.gpkg': Null layer: 'ADM_ADM_3'
Layer 'ADM_ADM_3' read from 'gadm41_DEU.gpkg'.
Layer 'ADM_ADM_3' not found in 'gadm41_DNK.gpkg': Null layer: 'ADM_ADM_3'
Layer 'ADM_ADM_3' read from 'gadm41_ESP.gpkg'.
Layer 'ADM_ADM_3' read from 'gadm41_FRA.gpkg'.
Layer 'ADM_ADM_3' read from 'gadm41_ITA.gpkg'.
Layer 'ADM_ADM_3' read from 'gadm41_LUX.gpkg'.
Layer 'ADM_ADM_3' read from 'gadm41_MWI.gpkg'.
Layer 'ADM_ADM_3' not found in 'gadm41_NLD.gpkg': Null layer: 'ADM_ADM_3'
Layer 'ADM_ADM_3' read from 'gadm41_POL.gpkg'.
Layer 'ADM_ADM_3' not found in 'gadm41_SVK.gpkg': Null layer: 'ADM_ADM_3'
Layer 'ADM_ADM_3' not found in 'gadm41_SVN.gpkg': Null layer: 'ADM_ADM_3'
Layer 'ADM_ADM_3' not found in 'gadm41_SWE.gpkg': Null layer: 'ADM_ADM_3'
Combined GeoDataFrame created.
File Deleted successfully


In [13]:
# Useful statements

# import fiona
# layers = fiona.listlayers(file)

In [14]:
# Disabled snippet: everything below is wrapped in a string literal, so none of it
# is executed; as the cell's only expression, the string is merely echoed as output.
# The wrapped code would move every file from gpx_directory to
# processed_gpx_directory via os.rename. processed_gpx_directory is only defined in
# a commented-out line of the parameters cell, so that line has to be enabled
# before this snippet can be turned back into live code.
'''
# Move data from new_gpxfiles dir to processed_gpxfiles dir
# gather all files in source folder
allfiles = os.listdir(gpx_directory)
 
# iterate on all files to move them to destination folder
for f in allfiles:
    src_path = os.path.join(gpx_directory, f)
    dst_path = os.path.join(processed_gpx_directory, f)
    os.rename(src_path, dst_path)
    '''

'\n# Move data from new_gpxfiles dir to processed_gpxfiles dir\n# gather all files in source folder\nallfiles = os.listdir(gpx_directory)\n \n# iterate on all files to move them to destination folder\nfor f in allfiles:\n    src_path = os.path.join(gpx_directory, f)\n    dst_path = os.path.join(processed_gpx_directory, f)\n    os.rename(src_path, dst_path)\n    '