In [127]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import xml
import os
from math import radians, sin, cos, sqrt, atan2
import warnings

# Tests

In [None]:
print(xml.dom.minidom.parseString(ET.tostring(ET.parse('kmls/20201230Τρέξιμο έξω.kml').getroot())).toprettyxml())

In [None]:
kml_dict = {}

dates = []
kml_texts = []

for kml_file in os.listdir('./kmls'):
    date = kml_file[:4]+'_'+kml_file[4:6]+'_'+kml_file[6:8]
    kml_text = xml.dom.minidom.parseString(ET.tostring(ET.parse('./kmls/'+kml_file).getroot())).toprettyxml()
    dates.append(date)
    kml_texts.append(kml_text)

kml_dict['Date'] = dates
kml_dict['KML Text'] = kml_texts

In [None]:
kml_df = pd.DataFrame(kml_dict)
kml_df

In [None]:
def parse_coordinates(coordinates_str):
    coordinates = coordinates_str.split(',')
    if len(coordinates) == 3:
        return float(coordinates[0]), float(coordinates[1]), float(coordinates[2])
    elif len(coordinates) == 2:
        return float(coordinates[0]), float(coordinates[1]), None
    else:
        return None, None, None


def parse_kml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    data = []

    ns = {'ns0': 'http://earth.google.com/kml/2.1'}
    for placemark in root.findall('.//ns0:Placemark', namespaces=ns):
            timespan = placemark.find('.//ns0:TimeSpan', namespaces=ns)
            if timespan is not None:
                # Extracting timestamp
                begin_time_str = timespan.find('.//ns0:begin', namespaces=ns).text
                # Parsing timestamp into date and time
                date, time = begin_time_str.split('T')
                time = time.split('.')[0]  # Remove milliseconds

                # Extracting coordinates and altitude
                coordinates_str = placemark.find('.//ns0:coordinates', namespaces=ns).text
                lng, lat, altitude = parse_coordinates(coordinates_str)

                data.append({
                    'Date': date,
                    'Time': time,
                    'Lat': lat,
                    'Lng': lng,
                    'Altitude': altitude
                }) 
    return pd.DataFrame(data)

In [None]:
df_dict = {}

for kml_file in os.listdir('./kmls/')[:3]:
    df_dict[kml_file[:8]] = parse_kml('./kmls/'+kml_file)

In [None]:
pd.concat([df_dict['20211226'],df_dict['20220110']],axis=0)

# Working

### Run to get data in final_df

In [97]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
from datetime import datetime

def parse_coordinates(coordinates_str):
    coordinates = coordinates_str.split(',')
    if len(coordinates) == 3:
        return float(coordinates[0]), float(coordinates[1]), float(coordinates[2])
    elif len(coordinates) == 2:
        return float(coordinates[0]), float(coordinates[1]), None
    else:
        return None, None, None


def parse_kml(file_path):
    tree = ET.parse('./kmls/'+file_path)
    root = tree.getroot()

    data = []

    ns = {'ns0': 'http://earth.google.com/kml/2.1'}
    for placemark in root.findall('.//ns0:Placemark', namespaces=ns):
            timespan = placemark.find('.//ns0:TimeSpan', namespaces=ns)
            if timespan is not None:
                # Extracting timestamp
                begin_time_str = timespan.find('.//ns0:begin', namespaces=ns).text
                # Parsing timestamp into date and time
                date = datetime.strptime(file_path[:8], '%Y%m%d').date()
                time = begin_time_str.split('T')[1]
                time = time.split('.')[0]  # Remove milliseconds

                # Extracting coordinates and altitude
                coordinates_str = placemark.find('.//ns0:coordinates', namespaces=ns).text
                lng, lat, altitude = parse_coordinates(coordinates_str)

                data.append({
                    'Date': date,
                    'Time': time,
                    'Lat': lat,
                    'Lng': lng,
                    'Altitude': altitude
                }) 
    return pd.DataFrame(data)

df_dict = {}

for kml_file in os.listdir('./kmls/'):
    df_dict[kml_file[:8]] = parse_kml(kml_file)


final_df = pd.concat([df_dict[key] for key in df_dict.keys()],axis=0)

  final_df = pd.concat([df_dict[key] for key in df_dict.keys()],axis=0)


In [98]:
final_df.reset_index(inplace=True,drop=True)

### Experiment

In [105]:
df_date_dict = {}
for date in final_df['Date'].unique():
    df_date_dict[str(date)] = final_df[final_df['Date']==date]

In [125]:
def make_aggregations(df):

    warnings.filterwarnings("ignore")

    # Appropriate transformations
    df['Date'] = pd.to_datetime(df['Date'])
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S')

    # Calculations preparing for aggregation
    df['Total Uphill'] = df['Altitude'].diff().clip(lower=0)
    df['Total Altitude Difference'] = df['Altitude'].diff()
    df['Highest Peak'] = df['Altitude']
    df['Lowest Trough'] = df['Altitude']

    # Distance Calculation 
    df.sort_values(by=['Date','Time'], inplace=True)

    # Function to calculate Haversine distance
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371  # Radius of the Earth in kilometers

        # Convert latitude and longitude from degrees to radians
        lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

        # Haversine formula
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        c = 2 * atan2(sqrt(a), sqrt(1 - a))
        distance = R * c

        return distance

    df['PrevLat'] = df['Lat'].shift(1)
    df['PrevLng'] = df['Lng'].shift(1)

    # Calculate distance between consecutive points
    df['Distance'] = df.apply(lambda row: haversine(row['Lat'], row['Lng'], row['PrevLat'], row['PrevLng']) if pd.notna(row['PrevLat']) else 0, axis=1)


    # Aggregation
    aggregated_df = df.groupby('Date').agg({
        'Time': lambda x: (x.max()-x.min()).seconds/60,
        'Total Uphill': 'sum',
        'Total Altitude Difference':'sum',
        'Highest Peak':'max',
        'Lowest Trough': 'min',
        'Distance':'sum',
    })
    aggregated_df.reset_index(inplace=True)

    return aggregated_df.values[0].tolist()


In [128]:
list_df = []
columns = ['Date', 'Time', 'Total Uphill', 'Total Altitude Difference','Highest Peak', 'Lowest Trough', 'Distance']
for date_df in df_date_dict.keys():
    list_df.append(make_aggregations(df_date_dict[date_df]))

aggregated_data = pd.DataFrame(np.array(list_df),columns = columns)
aggregated_data

Unnamed: 0,Date,Time,Total Uphill,Total Altitude Difference,Highest Peak,Lowest Trough,Distance
0,2021-12-26,28.633333,48.7,-1.1,74.7,52.8,4.80536
1,2024-01-03,22.616667,35.4,-0.9,98.6,86.7,4.034729
2,2022-01-10,22.683333,37.3,6.2,67.3,57.1,3.43584
3,2022-09-11,30.083333,70.0,2.7,140.5,122.2,4.568807
4,2022-09-05,42.216667,67.7,-3.3,82.5,31.6,6.995224
...,...,...,...,...,...,...,...
66,2022-10-04,27.233333,49.0,2.2,112.2,89.0,5.091855
67,2021-01-10,1436.666667,0.0,0.0,,,398.915318
68,2022-11-03,28.466667,47.7,0.7,112.1,88.3,5.65036
69,2022-02-07,24.416667,48.4,5.0,149.7,119.7,4.598013
