In [1]:
import elasticsearch
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

### The goal is to build an algorithm that will serve as a baseline. Future models will have to outperform it.

In [2]:
# Client for the local Elasticsearch node queried by the cells below.
hosts = [dict(host='localhost', port=9200)]
database = elasticsearch.Elasticsearch(hosts=hosts)

In [3]:
def get_datas(database, index="index_activity", size=1000):
    """Load activities and their segment efforts from Elasticsearch.

    Parameters
    ----------
    database : object
        Client exposing ``search(index=..., size=..., body=...)`` whose result
        is shaped like ``{"hits": {"hits": [{"_source": {...}}, ...]}}``.
    index : str, optional
        Name of the index holding the activities.
    size : int, optional
        Maximum number of activities requested from the single search call;
        an index holding more documents than this is only partially loaded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``activities_df`` with one row per activity (without the
        ``py/object`` and ``segment_efforts`` columns) and ``segments_df``
        with one row per segment effort (without ``py/object``).
    """
    query = {"query": {"match_all": {}}}
    result = database.search(index=index, size=size, body=query)

    activities_json = [hit.get("_source") for hit in result['hits']['hits']]

    # errors='ignore' keeps an empty result set (or documents lacking these
    # bookkeeping fields) from raising KeyError.
    activities_df = pd.DataFrame(activities_json).drop(
        columns=['py/object', 'segment_efforts'], errors='ignore'
    )

    # An activity may carry no segment efforts (missing key or None); treat
    # that as an empty list instead of failing on iteration.
    segments_json = [
        segment
        for activity in activities_json
        for segment in (activity.get("segment_efforts") or [])
    ]
    segments_df = pd.DataFrame(segments_json).drop(
        columns=['py/object'], errors='ignore'
    )

    return activities_df, segments_df

# Materialise the activity and segment-effort DataFrames used by the cells below.
activities_df, segments_df = get_datas(database)

### Add the average speed of each segment

In [4]:
# Per-segment speed = distance / elapsed time.
# NOTE(review): presumably metres / seconds, i.e. m/s — confirm against the indexed data.
segments_df['average_speed'] = segments_df['distance'].div(segments_df['elapsed_time'])

In [5]:
# Preview the first activities to sanity-check the loaded columns.
activities_df.head(5)

Unnamed: 0,id,athlete_id,name,distance,moving_time,elapsed_time,total_elevation_gain,type,start_date_local,average_speed,average_cadence,average_watts,max_watts,suffer_score,calories,average_heart_rate,max_heart_rate
0,4637673624,10944546,Watopia,20920.4,2463,2463,199.0,VirtualRide,2021-01-17T18:05:42Z,8.494,75.4,169.7,614.0,82.0,399.0,157.9,189.0
1,4513379684,10944546,Villes-sur-Auzon - Monieux,22359.2,3353,3409,442.0,VirtualRide,2020-12-24T18:16:53Z,6.668,80.5,184.9,382.0,143.0,682.0,166.2,184.0
2,4801436758,10944546,Yorkshire,23068.3,2931,2931,435.0,VirtualRide,2021-02-17T13:32:49Z,7.87,77.3,179.8,440.0,112.0,502.0,162.9,193.0
3,4745819775,10944546,Richmond,21435.5,2456,2456,144.0,VirtualRide,2021-02-06T18:12:33Z,8.728,79.4,190.9,916.0,128.0,448.0,171.8,199.0
4,4713272644,10944546,Richmond,16823.5,1869,1869,136.0,VirtualRide,2021-01-31T18:41:40Z,9.001,79.0,179.5,829.0,85.0,320.0,168.2,196.0


In [6]:
# Preview the first segment efforts, including the derived average_speed column.
segments_df.head(5)

Unnamed: 0,id,activity_id,athlete_id,name,type,elapsed_time,moving_time,start_date_local,distance,average_cadence,average_watts,average_grade,maximum_grade,climb_category,average_heart_rate,max_heart_rate,average_speed
0,14270131,4637673624,10944546,Volcano KOM,VirtualRide,654,654,2021-01-17T18:28:18Z,3753.7,76.8,209.3,3.2,11.3,1,177.9,189.0,5.739602
1,19970445,4637673624,10944546,Volcano Descent,VirtualRide,281,281,2021-01-17T18:40:17Z,3507.7,64.5,111.1,-3.4,5.9,0,151.6,170.0,12.482918
2,22513982,4513379684,10944546,Full Ride,VirtualRide,3407,3351,2020-12-24T18:16:53Z,22343.3,80.2,183.4,1.6,7.5,0,165.2,184.0,6.558057
3,23240825,4513379684,10944546,"First 3,5 km",VirtualRide,763,707,2020-12-24T18:16:53Z,3664.3,78.2,179.8,3.3,5.0,1,151.0,171.0,4.80249
4,22745156,4513379684,10944546,rouby - les gorges de la nesque,VirtualRide,3183,3127,2020-12-24T18:16:53Z,19504.8,80.1,186.8,2.2,7.8,0,165.5,184.0,6.127804


In [7]:
# (rows, columns) of the activity frame, then of the segment frame.
print(activities_df.shape, segments_df.shape, sep='\n')

(215, 17)
(2027, 17)


In [8]:
# Column dtypes of the activity frame; start_date_local shows up as object,
# i.e. it has not been parsed to a datetime dtype.
activities_df.dtypes

id                        int64
athlete_id                int64
name                     object
distance                float64
moving_time               int64
elapsed_time              int64
total_elevation_gain    float64
type                     object
start_date_local         object
average_speed           float64
average_cadence         float64
average_watts           float64
max_watts               float64
suffer_score            float64
calories                float64
average_heart_rate      float64
max_heart_rate          float64
dtype: object

In [9]:
# Column dtypes of the segment frame; start_date_local shows up as object,
# i.e. it has not been parsed to a datetime dtype.
segments_df.dtypes

id                      int64
activity_id             int64
athlete_id              int64
name                   object
type                   object
elapsed_time            int64
moving_time             int64
start_date_local       object
distance              float64
average_cadence       float64
average_watts         float64
average_grade         float64
maximum_grade         float64
climb_category          int64
average_heart_rate    float64
max_heart_rate        float64
average_speed         float64
dtype: object

## Naive Algo

A naive algorithm that predicts the travel time over a given distance from the mean of the average speeds of all segments in the training set.

In [10]:
# Keep only the target (elapsed_time) and the two columns the baseline needs.
# NOTE: average_speed was computed as distance / elapsed_time, so it encodes the
# target; it is only safe to use it through an aggregate of the training rows.
columns = ['elapsed_time','distance','average_speed']
data = segments_df[columns]
data.head(5)

Unnamed: 0,elapsed_time,distance,average_speed
0,654,3753.7,5.739602
1,281,3507.7,12.482918
2,3407,22343.3,6.558057
3,763,3664.3,4.80249
4,3183,19504.8,6.127804


In [11]:
# Hold out 10% of the segments; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(data, test_size=0.1, random_state=42)

# Target is elapsed_time; the remaining columns form the feature frames.
X_train, y_train = train_set.drop(columns="elapsed_time"), train_set["elapsed_time"]
X_test, y_test = test_set.drop(columns="elapsed_time"), test_set["elapsed_time"]

print(*(part.shape for part in (y_train, X_train, y_test, X_test)), sep="\n")

(1824,)
(1824, 2)
(203,)
(203, 2)


### Compute the mean of the average speeds of the training segments

In [12]:
# Baseline speed: unweighted arithmetic mean of the per-segment average speeds
# of the training rows, so short and long segments count equally.
mean_speed_train = X_train['average_speed'].mean()
mean_speed_train

5.945320881522815

### Prediction

In [13]:
# Naive prediction: time = distance / mean training speed.
X_test['elapsed_time_pred'] = X_test['distance'].div(mean_speed_train)
y_pred = X_test['elapsed_time_pred'].to_numpy()

In [14]:
# Reference scores of the baseline on the held-out segments.
mae_naive = mean_absolute_error(y_test, y_pred)
mape_naive = mean_absolute_percentage_error(y_test, y_pred)
rmse_naive = np.sqrt(mean_squared_error(y_test, y_pred))

print(
    f'naive_mae : {mae_naive}',
    f'naive_mape: {mape_naive}',
    f'naive_rmse: {rmse_naive}',
    sep='\n',
)

naive_mae : 258.3308683024614
naive_mape: 0.36804839590606236
naive_rmse: 559.4760824114861


### Prediction of the naive algorithm on the Alpe d'Huez

In [15]:
# Identifier of the stored route document used as the worked example.
alpe_route_id = 2787335981548134218

query = {
    "query": {
        "match": {
            "_id": alpe_route_id
        }
    }
}

result = database.search(
    index="index_route",
    size=1000,
    body=query
)

# Fail with an explicit message (same exception type as the bare [0] lookup)
# when the route document is not present in the index.
route_hits = result['hits']['hits']
if not route_hits:
    raise IndexError(f"Route {alpe_route_id} not found in index 'index_route'")

route_alpe = route_hits[0]['_source']
# One row per entry of the route's "segmentation" list.
segments_alpe = route_alpe.get("segmentation")
segments_alpe_df = pd.DataFrame(segments_alpe)
segments_alpe_df

Unnamed: 0,distance,altitude_gain,average_grade,all_points
0,270.55,-0.8,-0.3,"[[45.05476, 6.031770000000001], [45.0547500000..."
1,422.22,4.33,1.03,"[[45.056370001328915, 6.034199995890939], [45...."
2,427.89,-5.51,-1.29,"[[45.059050000000006, 6.0379000000000005], [45..."
3,12615.51,1063.27,8.43,"[[45.062720000000006, 6.03701], [45.0628500000..."


In [16]:
# Apply the same baseline to every route segment: time = distance / mean training speed.
segments_alpe_df['elapsed_time_pred'] = segments_alpe_df['distance'].div(mean_speed_train)
segments_alpe_df

Unnamed: 0,distance,altitude_gain,average_grade,all_points,elapsed_time_pred
0,270.55,-0.8,-0.3,"[[45.05476, 6.031770000000001], [45.0547500000...",45.506375
1,422.22,4.33,1.03,"[[45.056370001328915, 6.034199995890939], [45....",71.017193
2,427.89,-5.51,-1.29,"[[45.059050000000006, 6.0379000000000005], [45...",71.970884
3,12615.51,1063.27,8.43,"[[45.062720000000006, 6.03701], [45.0628500000...",2121.922475


In [17]:
# Total predicted time for the whole route (presumably in seconds — confirm units of the indexed data).
segments_alpe_df['elapsed_time_pred'].sum()

2310.4169268121427

In [18]:
# 3.6 is the m/s -> km/h factor; assumes mean_speed_train is in m/s — TODO confirm.
mean_speed_train*3.6

21.403155173482137

In [19]:
# Wrap the summed prediction in a relativedelta so it can be read back as hours / minutes / seconds.
elapsed_time_naive = relativedelta(seconds=segments_alpe_df['elapsed_time_pred'].sum())

In [20]:
# relativedelta built from float seconds can hand back float fields, and it
# rolls hours past 23 into .days; cast every field to int and fold the days
# back into the hour count so the label stays correct for long predictions.
hour = int(elapsed_time_naive.days * 24 + elapsed_time_naive.hours)
minutes = int(elapsed_time_naive.minutes)
seconds = int(elapsed_time_naive.seconds)
print(f'Prediction {hour}h{minutes}m{seconds}sec')

Prediction 0h38m30sec


In [21]:
# Keep the total predicted route time under a dedicated name
# (presumably seconds, as the variable name suggests).
elapsed_time_pred_naive_sec = segments_alpe_df['elapsed_time_pred'].sum()
elapsed_time_pred_naive_sec

2310.4169268121427

### The prediction of our naive algorithm (38:30) looks just as wrong as the prediction we got from Strava (31:39). The goal will be to train a model that makes more realistic predictions.