In [1]:
import elasticsearch
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

### The goal is to build an algorithm that serves as a baseline. Future models will have to outperform it.

In [2]:
# Elasticsearch client for the node at localhost:9200; it is reused by every query in this notebook.
hosts = [dict(host='localhost', port=9200)]
database = elasticsearch.Elasticsearch(hosts=hosts)

In [3]:
def get_datas(database, index="index_activity", size=1000):
    """Load activities and their segment efforts from Elasticsearch.

    Runs one ``match_all`` search and splits every hit's ``_source`` into an
    activity row plus one row per entry of its ``segment_efforts`` list.

    Parameters
    ----------
    database
        Client object exposing ``search(index=..., size=..., body=...)``.
    index : str, default "index_activity"
        Name of the index to read.
    size : int, default 1000
        Maximum number of hits requested. No pagination is done, so any
        activity beyond ``size`` is silently left out of the result.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(activities_df, segments_df)``. The ``py/object`` column is removed
        from both frames and ``segment_efforts`` from the activities frame.
    """
    query = {"query": {"match_all": {}}}
    result = database.search(index=index, size=size, body=query)

    activities_json = [hit.get("_source") for hit in result['hits']['hits']]

    # errors='ignore': an empty index (or documents lacking these keys) must
    # yield an empty/partial frame rather than a KeyError.
    activities_df = pd.DataFrame(activities_json).drop(
        columns=['py/object', 'segment_efforts'], errors='ignore'
    )

    # `or []`: activities stored without segment efforts (missing key or None)
    # contribute no segment rows instead of raising a TypeError.
    segments_json = [
        segment
        for activity in activities_json
        for segment in (activity.get("segment_efforts") or [])
    ]
    segments_df = pd.DataFrame(segments_json).drop(
        columns=['py/object'], errors='ignore'
    )

    return activities_df, segments_df

# Load both frames once from the Elasticsearch client; the rest of the notebook works on these two DataFrames.
activities_df, segments_df = get_datas(database)

### Add the average speed of each segment

In [4]:
# Per-segment speed = distance / elapsed_time. Note: a segment with elapsed_time == 0 would yield inf here.
segments_df['average_speed'] = segments_df['distance'].div(segments_df['elapsed_time'])

In [5]:
# Preview the first five activity rows to check the loaded columns.
activities_df.head(5)

Unnamed: 0,id,athlete_id,name,distance,moving_time,elapsed_time,total_elevation_gain,type,start_date_local,average_speed,average_cadence,average_watts,max_watts,suffer_score,calories,average_heart_rate,max_heart_rate
0,4831255939,10944546,Paris,13971.5,1414,1414,79.0,VirtualRide,2021-02-22T18:40:32Z,9.881,80.4,192.0,382.0,45.0,259.0,157.7,176.0
1,4831057316,10944546,Watopia,7086.5,1097,1097,135.0,VirtualRide,2021-02-22T18:10:34Z,6.46,79.1,177.4,230.0,20.0,186.0,145.6,165.0
2,4818094466,10944546,Sortie à vélo l'après-midi,44007.7,6084,6084,157.0,Ride,2021-02-20T14:31:34Z,7.233,,110.9,,275.0,1758.0,167.5,189.0
3,4801436758,10944546,Yorkshire,23068.3,2931,2931,435.0,VirtualRide,2021-02-17T13:32:49Z,7.87,77.3,179.8,440.0,112.0,502.0,162.9,193.0
4,4776486085,10944546,NYC,20694.8,2740,2740,377.0,VirtualRide,2021-02-12T18:36:15Z,7.553,77.6,171.0,335.0,128.0,447.0,169.0,194.0


In [6]:
# Preview the first five segment rows, including the derived average_speed column.
segments_df.head(5)

Unnamed: 0,id,activity_id,athlete_id,name,type,elapsed_time,moving_time,start_date_local,distance,average_cadence,average_watts,average_grade,maximum_grade,climb_category,average_heart_rate,max_heart_rate,average_speed
0,24674235,4831255939,10944546,Champs-Élysées,VirtualRide,653,653,2021-02-22T18:46:25Z,6623.1,82.3,202.8,0.0,4.6,0,161.2,170.0,10.142573
1,27141377,4831057316,10944546,Epic KOM - Start to Bypass Intersection (Zwift...,VirtualRide,424,424,2021-02-22T18:19:40Z,1882.5,77.3,201.2,3.8,47.8,0,157.9,164.0,4.439858
2,22501195,4818094466,10944546,Villenave-Technopole,Ride,789,789,2021-02-20T14:39:32Z,5100.5,,108.0,0.5,6.4,0,169.0,184.0,6.464512
3,3566434,4818094466,10944546,Col du Chateau d'eau,Ride,217,217,2021-02-20T14:40:47Z,1190.37,,141.8,2.1,7.3,0,170.7,183.0,5.485576
4,20598228,4818094466,10944546,Col du chateau d'eau court,Ride,151,151,2021-02-20T14:42:02Z,745.4,,165.4,2.8,5.5,0,177.5,183.0,4.936424


In [7]:
# (rows, columns) of each frame: activities first, then segments.
print(activities_df.shape)
print(segments_df.shape)

(214, 17)
(2020, 17)


In [8]:
# Column dtypes of the activities frame (start_date_local is loaded as object, not as a datetime).
activities_df.dtypes

id                        int64
athlete_id                int64
name                     object
distance                float64
moving_time               int64
elapsed_time              int64
total_elevation_gain    float64
type                     object
start_date_local         object
average_speed           float64
average_cadence         float64
average_watts           float64
max_watts               float64
suffer_score            float64
calories                float64
average_heart_rate      float64
max_heart_rate          float64
dtype: object

In [9]:
# Column dtypes of the segments frame (start_date_local is loaded as object, not as a datetime).
segments_df.dtypes

id                      int64
activity_id             int64
athlete_id              int64
name                   object
type                   object
elapsed_time            int64
moving_time             int64
start_date_local       object
distance              float64
average_cadence       float64
average_watts         float64
average_grade         float64
maximum_grade         float64
climb_category          int64
average_heart_rate    float64
max_heart_rate        float64
average_speed         float64
dtype: object

## Naive Algo

A baseline algorithm that predicts the travel time over a given distance from the mean of the average speeds of all segments in the training set.

In [10]:
# Keep only the target (elapsed_time) and the two columns the naive baseline needs.
columns = ['elapsed_time', 'distance', 'average_speed']
data = segments_df.loc[:, columns]
data.head()

Unnamed: 0,elapsed_time,distance,average_speed
0,653,6623.1,10.142573
1,424,1882.5,4.439858
2,789,5100.5,6.464512
3,217,1190.37,5.485576
4,151,745.4,4.936424


In [11]:
# Hold out 10% of the segments for evaluation; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(data, test_size=0.1, random_state=42)

# Target = elapsed_time. Note that average_speed, kept in the features, is derived from the target:
# it is only used here to compute the mean training speed.
X_train, y_train = train_set.drop(columns="elapsed_time"), train_set["elapsed_time"]
X_test, y_test = test_set.drop(columns="elapsed_time"), test_set["elapsed_time"]

for part in (y_train, X_train, y_test, X_test):
    print(part.shape)

(1818,)
(1818, 2)
(202,)
(202, 2)


### Compute the mean speed over the training segments

In [12]:
# The single "parameter" of the naive model: arithmetic mean of the per-segment speeds in the training split.
mean_speed_train = X_train.loc[:, 'average_speed'].mean()
mean_speed_train

5.940770310610693

### Prediction

In [13]:
# Naive prediction: time = distance / mean training speed, stored next to the test features.
X_test['elapsed_time_pred'] = X_test['distance'].div(mean_speed_train)
y_pred = X_test['elapsed_time_pred'].to_numpy()

In [14]:
# Baseline scores on the held-out segments; later models are compared against these values.
mae_naive = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mape_naive = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
mse_naive = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_naive = np.sqrt(mse_naive)  # RMSE is the square root of the MSE

print(
    f'naive_mae : {mae_naive}',
    f'naive_mape: {mape_naive}',
    f'naive_rmse: {rmse_naive}',
    sep='\n',
)

naive_mae : 202.2597019674092
naive_mape: 0.37233910966761735
naive_rmse: 464.70500544327587


### Prediction of the naive algorithm on the Alpe d'Huez

In [15]:
# Fetch the route document by its Elasticsearch _id and turn its "segmentation" list into a DataFrame.
query = {
    "query": {
        "match": {
            "_id": 2787335981548134218
        }
    }
}

# Only the first hit is used below, so a single hit is requested.
result = database.search(
    index="index_route",
    size=1,
    body=query
)

route_hits = result['hits']['hits']
if not route_hits:
    # Fail with an explicit message instead of an opaque "list index out of range".
    raise IndexError("No route with _id 2787335981548134218 found in index 'index_route'")

route_alpe = route_hits[0]['_source']
segments_alpe = route_alpe.get("segmentation")
segments_alpe_df = pd.DataFrame(segments_alpe)
segments_alpe_df

Unnamed: 0,distance,altitude_gain,average_grade,all_points
0,270.55,-0.8,-0.3,"[[45.05476, 6.031770000000001], [45.0547500000..."
1,422.22,4.33,1.03,"[[45.056370001328915, 6.034199995890939], [45...."
2,427.89,-5.51,-1.29,"[[45.059050000000006, 6.0379000000000005], [45..."
3,12615.51,1063.27,8.43,"[[45.062720000000006, 6.03701], [45.0628500000..."


In [16]:
# Apply the naive model to each route segment: predicted time = distance / mean training speed.
segments_alpe_df['elapsed_time_pred'] = segments_alpe_df['distance'].div(mean_speed_train)
segments_alpe_df

Unnamed: 0,distance,altitude_gain,average_grade,all_points,elapsed_time_pred
0,270.55,-0.8,-0.3,"[[45.05476, 6.031770000000001], [45.0547500000...",45.541232
1,422.22,4.33,1.03,"[[45.056370001328915, 6.034199995890939], [45....",71.071591
2,427.89,-5.51,-1.29,"[[45.059050000000006, 6.0379000000000005], [45...",72.026013
3,12615.51,1063.27,8.43,"[[45.062720000000006, 6.03701], [45.0628500000...",2123.547847


In [17]:
# Total predicted duration for the route: sum of the per-segment predictions.
segments_alpe_df['elapsed_time_pred'].sum()

2312.186683175765

In [18]:
# Multiply by 3.6 — presumably converts m/s to km/h (assumes distance in metres and elapsed_time in seconds; TODO confirm).
mean_speed_train*3.6

21.386773118198498

In [19]:
# Wrap the summed prediction (passed as seconds) in a relativedelta so it can be read back as hours/minutes/seconds.
elapsed_time_naive = relativedelta(seconds=segments_alpe_df['elapsed_time_pred'].sum())

In [20]:
# relativedelta normalises its fields: seconds overflow into minutes, minutes into hours and
# hours above 23 into days. Fold the days back in so predictions of 24 h or more are not truncated,
# and cast to int because the normalised fields can be floats when the input seconds are a float.
hour = int(elapsed_time_naive.days * 24 + elapsed_time_naive.hours)
minutes = int(elapsed_time_naive.minutes)
seconds = int(elapsed_time_naive.seconds)
print(f'Prediction {hour}h{minutes}m{seconds}sec')

Prediction 0h38m32sec


In [22]:
# Keep the total predicted duration (the value passed as seconds to relativedelta earlier) under a dedicated name.
elapsed_time_pred_naive_sec = segments_alpe_df['elapsed_time_pred'].sum()
elapsed_time_pred_naive_sec

2312.186683175765

### The prediction of our naive algorithm (38:32) looks just as wrong as the prediction we got from Strava (31:39). The goal will be to train a model that makes more realistic predictions.