In [1]:
import elasticsearch
import pandas as pd
from dateutil.relativedelta import relativedelta

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

### The goal is to build an algorithm that will serve as a baseline. Future models will have to outperform it.

In [2]:
# Connection settings for the Elasticsearch node queried by the cells below.
hosts = [dict(host='localhost', port=9200)]
database = elasticsearch.Elasticsearch(hosts=hosts)

In [3]:
def get_datas(database, index="index_activity", size=1000):
    """Fetch activities from Elasticsearch and return them as two DataFrames.

    Runs a ``match_all`` search and splits every hit's ``_source`` into an
    activity-level frame and a flattened frame of its ``segment_efforts``.

    Parameters
    ----------
    database :
        Client object exposing ``search(index=..., size=..., body=...)``.
    index : str, default "index_activity"
        Name of the index to query.
    size : int, default 1000
        Maximum number of hits requested. Anything beyond this number is
        not returned by the search, so raise it if the index grows.

    Returns
    -------
    (activities_df, segments_df) : tuple of pandas.DataFrame
        ``activities_df`` has one row per hit, without the ``py/object`` and
        ``segment_efforts`` columns. ``segments_df`` has one row per segment
        effort, without the ``py/object`` column. Both are empty frames when
        the search returns no hits.
    """
    query = {"query": {"match_all": {}}}
    result = database.search(index=index, size=size, body=query)

    activities_json = [hit.get("_source") for hit in result['hits']['hits']]

    # errors='ignore': an empty result (or hits lacking these keys) yields a
    # frame without the columns, and a plain drop would raise KeyError.
    activities_df = pd.DataFrame(activities_json).drop(
        columns=['py/object', 'segment_efforts'], errors='ignore'
    )

    # `or []`: an activity without segment efforts contributes no rows
    # instead of raising TypeError when iterating over None.
    segments_json = [
        segment
        for activity in activities_json
        for segment in (activity.get("segment_efforts") or [])
    ]
    segments_df = pd.DataFrame(segments_json).drop(
        columns=['py/object'], errors='ignore'
    )

    return activities_df, segments_df

# Load the indexed activities and their flattened segment efforts.
activities_df, segments_df = get_datas(database)

### Add the average speed of each segment

In [4]:
# Per-segment speed = distance / elapsed time, stored as a new column.
segments_df['average_speed'] = segments_df['distance'].div(segments_df['elapsed_time'])

In [5]:
# Preview the first five activities.
activities_df.head(5)

Unnamed: 0,id,athlete_id,name,distance,moving_time,elapsed_time,total_elevation_gain,type,start_date_local,average_speed,average_cadence,average_watts,max_watts,suffer_score,calories,average_heart_rate,max_heart_rate
0,4801436758,10944546,Yorkshire,23068.3,2931,2931,435.0,VirtualRide,2021-02-17T13:32:49Z,7.87,77.3,179.8,440.0,112.0,502.0,162.9,193.0
1,4776486085,10944546,NYC,20694.8,2740,2740,377.0,VirtualRide,2021-02-12T18:36:15Z,7.553,77.6,171.0,335.0,128.0,447.0,169.0,194.0
2,4745819775,10944546,Richmond,21435.5,2456,2456,144.0,VirtualRide,2021-02-06T18:12:33Z,8.728,79.4,190.9,916.0,128.0,448.0,171.8,199.0
3,4713272644,10944546,Richmond,16823.5,1869,1869,136.0,VirtualRide,2021-01-31T18:41:40Z,9.001,79.0,179.5,829.0,85.0,320.0,168.2,196.0
4,4759444106,10944546,Innsbruck,17570.4,2835,2835,428.0,VirtualRide,2021-02-09T12:27:56Z,6.198,72.2,169.1,318.0,134.0,457.0,168.8,187.0


In [6]:
# Preview the first five segment efforts (now including average_speed).
segments_df.head(5)

Unnamed: 0,id,activity_id,athlete_id,name,type,elapsed_time,moving_time,start_date_local,distance,average_cadence,average_watts,average_grade,maximum_grade,climb_category,average_heart_rate,max_heart_rate,average_speed
0,27180136,4801436758,10944546,Kent Rd Climb to Sprint Banner (Zwift Insider ...,VirtualRide,176,176,2021-02-17T13:36:00Z,1137.3,81.4,183.4,2.6,7.0,0,149.4,156.0,6.461932
1,21748007,4801436758,10944546,Yorkshire Sprint Reverse,VirtualRide,63,63,2021-02-17T13:37:53Z,394.3,79.9,182.9,2.1,5.7,0,153.4,156.0,6.25873
2,27207377,4801436758,10944546,Finishing Straightaway (Zwift Insider verified),VirtualRide,99,99,2021-02-17T13:40:47Z,611.1,78.4,192.3,3.3,10.2,0,156.7,162.0,6.172727
3,27180001,4801436758,10944546,Otley Rd Climb (Zwift Insider verified),VirtualRide,285,285,2021-02-17T13:43:02Z,1641.6,75.5,204.1,3.4,6.6,0,164.7,172.0,5.76
4,20837686,4801436758,10944546,Beckwithshaw to Penny Pot - Fulgaz,VirtualRide,256,256,2021-02-17T13:51:25Z,2323.4,75.5,189.3,-1.1,10.9,0,167.3,177.0,9.075781


In [7]:
# Row/column counts: activities first, then segment efforts.
for frame in (activities_df, segments_df):
    print(frame.shape)

(211, 17)
(1992, 17)


In [8]:
# Column dtypes of the activities frame.
activities_df.dtypes

id                        int64
athlete_id                int64
name                     object
distance                float64
moving_time               int64
elapsed_time              int64
total_elevation_gain    float64
type                     object
start_date_local         object
average_speed           float64
average_cadence         float64
average_watts           float64
max_watts               float64
suffer_score            float64
calories                float64
average_heart_rate      float64
max_heart_rate          float64
dtype: object

In [9]:
# Column dtypes of the segment-efforts frame.
segments_df.dtypes

id                      int64
activity_id             int64
athlete_id              int64
name                   object
type                   object
elapsed_time            int64
moving_time             int64
start_date_local       object
distance              float64
average_cadence       float64
average_watts         float64
average_grade         float64
maximum_grade         float64
climb_category          int64
average_heart_rate    float64
max_heart_rate        float64
average_speed         float64
dtype: object

## Naive Algo

A naive algorithm that predicts the travel time over a given distance from the mean speed of all segments in the training set.

In [10]:
# Keep only the target (elapsed_time) and the two columns the baseline relies on.
columns = ['elapsed_time', 'distance', 'average_speed']
data = segments_df.loc[:, columns]
data.head(5)

Unnamed: 0,elapsed_time,distance,average_speed
0,176,1137.3,6.461932
1,63,394.3,6.25873
2,99,611.1,6.172727
3,285,1641.6,5.76
4,256,2323.4,9.075781


In [11]:
# Hold out 10% of the segments for evaluation; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(data, test_size=0.1, random_state=42)

# Separate the target (elapsed_time) from the remaining columns in each split.
X_train, y_train = train_set.drop(columns="elapsed_time"), train_set["elapsed_time"]
X_test, y_test = test_set.drop(columns="elapsed_time"), test_set["elapsed_time"]

for part in (y_train, X_train, y_test, X_test):
    print(part.shape)

(1792,)
(1792, 2)
(200,)
(200, 2)


### Compute the mean speed of the training-set segments

In [12]:
# Arithmetic mean of the per-segment speeds in the training split;
# this single number is the whole baseline "model".
mean_speed_train = X_train.loc[:, 'average_speed'].mean()
mean_speed_train

5.941024411590056

### Prediction

In [13]:
# Predicted time = distance / mean training speed; also kept as a column on X_test.
y_pred = (X_test['distance'] / mean_speed_train).values
X_test['elapsed_time_pred'] = y_pred

In [14]:
# Score the baseline on the held-out segments with both error metrics.
naive_mae, naive_mape = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)
print(f'naive_mae : {naive_mae}')
print(f'naive_mape: {naive_mape}')

naive_mae : 257.7311365999788
naive_mape: 0.3743327293974932


### Prediction of the naive algorithm on the Alpe d'Huez climb

In [19]:
# Elasticsearch document id of the route fetched below from `index_route`.
ALPE_ROUTE_ID = 2787335981548134218

query = {
    "query": {
        "match": {
            "_id": ALPE_ROUTE_ID
        }
    }
}

result = database.search(
    index="index_route",
    size=1,  # only the first hit is read below
    body=query
)

# Fail with an explicit message instead of a bare "list index out of range"
# when the route has not been indexed.
hits = result['hits']['hits']
if not hits:
    raise IndexError(f"No route with _id={ALPE_ROUTE_ID} found in index_route")

# One row per entry of the route's "segmentation" list.
route_alpe = hits[0]['_source']
segments_alpe = route_alpe.get("segmentation")
segments_alpe_df = pd.DataFrame(segments_alpe)
segments_alpe_df

Unnamed: 0,distance,altitude_gain,vertical_drop,all_points
0,270.55,-0.8,-0.3,"[[45.05476, 6.031770000000001], [45.0547500000..."
1,422.22,4.33,1.03,"[[45.056370001328915, 6.034199995890939], [45...."
2,427.89,-5.51,-1.29,"[[45.059050000000006, 6.0379000000000005], [45..."
3,12615.51,1063.27,8.43,"[[45.062720000000006, 6.03701], [45.0628500000..."


In [20]:
# Same baseline as on the test set: predicted time = distance / mean training speed.
segments_alpe_df['elapsed_time_pred'] = segments_alpe_df['distance'].div(mean_speed_train)
segments_alpe_df

Unnamed: 0,distance,altitude_gain,vertical_drop,all_points,elapsed_time_pred
0,270.55,-0.8,-0.3,"[[45.05476, 6.031770000000001], [45.0547500000...",45.539284
1,422.22,4.33,1.03,"[[45.056370001328915, 6.034199995890939], [45....",71.068552
2,427.89,-5.51,-1.29,"[[45.059050000000006, 6.0379000000000005], [45...",72.022932
3,12615.51,1063.27,8.43,"[[45.062720000000006, 6.03701], [45.0628500000...",2123.457021


In [21]:
# Total predicted time for the route: sum of the per-segment predictions.
segments_alpe_df['elapsed_time_pred'].sum()

2312.087789641593

In [22]:
# Mean training speed scaled by 3.6 (m/s -> km/h, assuming distances are in
# metres and times in seconds -- TODO confirm against the data source).
3.6 * mean_speed_train

21.3876878817242

In [23]:
# Wrap the total predicted time so it can be read back as hours/minutes/seconds.
total_pred_seconds = segments_alpe_df['elapsed_time_pred'].sum()
elapsed_time = relativedelta(seconds=total_pred_seconds)

In [24]:
# relativedelta normalises a float `seconds` input into float minutes/hours and
# rolls anything past 23 hours into `days`. Fold the days back into the hour
# count and cast every field to int, so a prediction of one hour or more is not
# printed as "1.0h" and whole days are not silently dropped.
hour = int(elapsed_time.days * 24 + elapsed_time.hours)
minutes = int(elapsed_time.minutes)
seconds = int(elapsed_time.seconds)
print(f'Prediction {hour}h{minutes}m{seconds}sec')

Prediction 0h38m32sec


### The prediction of our naive algorithm looks just as wrong as the prediction we got from Strava... The goal will be to train a model making more realistic predictions.

![alpe](https://github.com/vlagache/cycling_travel_time/blob/model/notebooks_explanations/datas/images/alpe_pred.PNG?raw=true)