In [1]:
%%capture
%run naive_algorithm.ipynb
%run features_engineering.ipynb

In [2]:
import random 
import seaborn as sns
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta

import xgboost as xgb
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error


from sklearn.preprocessing import StandardScaler,MinMaxScaler

## Split Train/Test

Problème : certains segments sont compris dans d'autres segments. Pour éviter une fuite de données dans le cas où un
segment se retrouverait dans le train et une partie de celui-ci dans le test :

* Calcul du jour calendaire/année de chaque segment
* Sélection au hasard de certaines dates qui constitueront le test_set, tout le reste allant dans le train_set
* De cette manière, aucun segment réalisé à une même date ne pourra se retrouver à la fois dans le train et dans le test


In [3]:
# Preview the first rows (head() defaults to 5) of the engineered segment table.
segments_df.head()

Unnamed: 0,id,activity_id,athlete_id,name,elapsed_time,moving_time,distance,average_cadence,average_watts,average_grade,...,climb_category,average_heart_rate,max_heart_rate,start_time,start_date,time_activities_last_30d,type_virtualride,days_since_last_activity,average_climbcat_last_30d,average_speed_last_30d
0,24674235,4831255939,10944546,Champs-Élysées,653,653,6623.1,82.3,202.8,0.0,...,0,161.2,170.0,18:46:25,2021-02-22,315.25,1,0,0.06,7.76
1,27141377,4831057316,10944546,Epic KOM - Start to Bypass Intersection (Zwift...,424,424,1882.5,77.3,201.2,3.8,...,0,157.9,164.0,18:19:40,2021-02-22,315.25,1,2,0.06,7.76
2,22501195,4818094466,10944546,Villenave-Technopole,789,789,5100.5,,108.0,0.5,...,0,169.0,184.0,14:39:32,2021-02-20,262.78,0,3,0.15,7.69
3,3566434,4818094466,10944546,Col du Chateau d'eau,217,217,1190.37,,141.8,2.1,...,0,170.7,183.0,14:40:47,2021-02-20,262.78,0,3,0.15,7.69
4,20598228,4818094466,10944546,Col du chateau d'eau court,151,151,745.4,,165.4,2.8,...,0,177.5,183.0,14:42:02,2021-02-20,262.78,0,3,0.15,7.69


In [4]:
# Key every effort by day-of-year + year (strftime '%j%Y'); the train/test
# split below is done on these keys.
# The loop variable is intentionally NOT named `date`: that would rebind the
# `datetime.date` class imported at the top of the notebook, which later cells
# still call (`date.today()`).
calendar_days = [start_day.strftime('%j%Y') for start_day in segments_df['start_date']]

segments_df['calendar_day'] = calendar_days

In [5]:
# Distinct calendar-day keys, in order of first appearance.
dates = pd.unique(segments_df['calendar_day'])

In [6]:
# random.sample needs a sequence; list() works whether `dates` is still the
# array from the previous cell or already a list, so re-running this cell no
# longer fails with AttributeError on `.tolist()`.
dates = list(dates)

In [7]:
# Number of distinct calendar days available for the split.
len(dates)

200

In [8]:
# Despite its name this holds a count: 20% of the distinct days, i.e. how many
# days go to the test set (cast to int where it is used).
ratio_train_test = 0.2 * len(dates)
ratio_train_test

40.0

In [9]:
# Fixed seed so the same days are drawn for the test set on every run.
random.seed(42)
n_test_dates = int(ratio_train_test)
dates_test_set = random.sample(dates, n_test_dates)

In [10]:
# Every effort ridden on one of the sampled days goes to the test set.
is_test_day = segments_df['calendar_day'].isin(dates_test_set)
test_set = segments_df.loc[is_test_day]
test_set.shape

(379, 22)

In [None]:
# Row labels of the held-out efforts; the next cell drops them to build the training set.
test_set_index = test_set.index
test_set_index

In [None]:
# Training set = everything that was not sampled into the test set.
train_set = segments_df.drop(index=test_set_index)
train_set

In [None]:
# Share of rows that ended up in the training set, computed from the frames
# themselves so it stays correct when the data or the split changes.
len(train_set) / len(segments_df)

## Model

In [None]:
# Pairwise correlations between the target (elapsed_time) and the candidate
# features, computed on the training rows only.
columns = [
    'elapsed_time',
    'distance',
    'average_grade',
    'climb_category',
    'time_activities_last_30d',
    'type_virtualride',
    'days_since_last_activity',
    'average_speed_last_30d',
    'average_climbcat_last_30d',
]
corrmat = train_set.loc[:, columns].corr()
corrmat

In [None]:
# Target plus the features kept for modelling; both splits are reduced to the
# same columns in the same order.
columns = [
    'elapsed_time',
    'distance',
    'climb_category',
    'type_virtualride',
    'time_activities_last_30d',
    'days_since_last_activity',
    'average_climbcat_last_30d',
]
train_set, test_set = train_set[columns], test_set[columns]

In [None]:
# Separate the target column from the feature matrix for both splits.
X_train = train_set.drop(columns="elapsed_time")
y_train = train_set["elapsed_time"]
X_test = test_set.drop(columns="elapsed_time")
y_test = test_set["elapsed_time"]

# Sanity check: print the shapes in the order y_train, X_train, y_test, X_test.
for part in (y_train, X_train, y_test, X_test):
    print(part.shape)

In [None]:
# Quick look at the first rows (head() defaults to 5) of the feature matrix.
X_train.head()

In [None]:
# Distribution of the raw training target.
sns.displot(y_train)

In [None]:
# Same distribution after a natural-log transform of the target.
sns.displot(np.log(y_train))

In [None]:
# Log-transformed target used to fit the models; predictions are mapped back with np.exp.
y_train_log = np.log(y_train)

## Scaler ##

In [None]:
# min_max = MinMaxScaler()

In [None]:
# min_max.fit(X_train)

In [None]:
# X_train_scaled = min_max.transform(X_train)
# X_test_scaled = min_max.transform(X_test)

#### Algos

In [None]:
# Linear baseline fitted on the log target; predictions stay on the log scale.
reg = LinearRegression().fit(X_train, y_train_log)
y_pred_reg_log = reg.predict(X_test)

In [None]:
# Gradient-boosted trees with default hyper-parameters, fitted on the log target.
xgb_reg = xgb.XGBRegressor().fit(X_train, y_train_log)
y_pred_xgb_log = xgb_reg.predict(X_test)

In [None]:
# Random forest on the log target. The seed is pinned (same value as the
# train/test split) so the metrics and the Alpe d'Huez prediction are
# reproducible across Restart & Run All.
forrest_reg = RandomForestRegressor(random_state=42)
forrest_reg.fit(X_train, y_train_log)
y_pred_forrest_log = forrest_reg.predict(X_test)

In [None]:
# Back-transform the log-scale predictions once, then score them against y_test.
y_pred_reg = np.exp(y_pred_reg_log)
mae_reg = mean_absolute_error(y_test, y_pred_reg)
mape_reg = mean_absolute_percentage_error(y_test, y_pred_reg)
rmse_reg = np.sqrt(mean_squared_error(y_test, y_pred_reg))
print(f'mae_reg : {mae_reg}')
print(f'mape_reg : {mape_reg}')
print(f'rmse_reg: {rmse_reg}')

In [None]:
# Back-transform the log-scale predictions once, then score them against y_test.
y_pred_xgb = np.exp(y_pred_xgb_log)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f'mae_xgb : {mae_xgb}')
print(f'mape_xgb: {mape_xgb}')
print(f'rmse_xgb: {rmse_xgb}')

In [None]:
# Back-transform the log-scale predictions once, then score them against y_test.
y_pred_forrest = np.exp(y_pred_forrest_log)
mae_forrest = mean_absolute_error(y_test, y_pred_forrest)
mape_forrest = mean_absolute_percentage_error(y_test, y_pred_forrest)
rmse_forrest = np.sqrt(mean_squared_error(y_test, y_pred_forrest))
print(f'mae_forrest : {mae_forrest}')
print(f'mape_forrest: {mape_forrest}')
print(f'rmse_forrest: {rmse_forrest}')

In [None]:
# The figure size has to be configured BEFORE plotting: rcParams only applies
# to figures created afterwards, so setting it after plot_importance left the
# first rendering at the default size.
plt.rcParams['figure.figsize'] = [15, 5]
xgb.plot_importance(xgb_reg)
plt.show()

In [None]:
# One column per metric, one row per model (order: naive, linreg, xgbreg, forrest).
# The *_naive metrics are not defined in this notebook; presumably they come
# from naive_algorithm.ipynb run at the top — confirm.
data = dict(
    MAE=[mae_naive, mae_reg, mae_xgb, mae_forrest],
    MAPE=[mape_naive, mape_reg, mape_xgb, mape_forrest],
    RMSE=[rmse_naive, rmse_reg, rmse_xgb, rmse_forrest],
)

model_names = ['naive', 'linreg', 'xgbreg', 'forrest']
comparaison_df = pd.DataFrame(data, index=model_names)
# Lowest MAPE first.
comparaison_df.sort_values(by='MAPE')

### Prediction Alpe d'Huez

In [None]:
# Fetch the stored route by document id and turn its segmentation into a frame.
# `database` is not defined in this notebook; presumably a search client
# created by one of the notebooks run at the top — confirm.
query = {"query": {"match": {"_id": 2787335981548134218}}}

result = database.search(index="index_route", size=1000, body=query)

# Only the first hit is used; an empty result raises IndexError here.
first_hit = result['hits']['hits'][0]
route_alpe = first_hit['_source']
segments_alpe = route_alpe.get("segmentation")
segments_alpe_df = pd.DataFrame(segments_alpe)
segments_alpe_df

**Time Activities last 30d and Avg Speed**

In [None]:
# Look-back window for the "last 30d" features: it ends yesterday and starts
# 30 days before that.
one_day = timedelta(days=1)
today = date.today()
end_date = today - one_day
start_date = end_date - 30 * one_day

In [None]:
# Activities whose start_date falls inside the look-back window.
in_window = activities_df['start_date'].between(start_date, end_date)
activities_last_30d = activities_df[in_window]

# Summed elapsed_time divided by 60, rounded to 2 decimals.
total_elapsed = activities_last_30d['elapsed_time'].sum()
time_activities_last_30d = round(total_elapsed / 60, 2)

# Mean speed over the window; NaN (e.g. no activity in the window) becomes 0.
result = activities_last_30d['average_speed'].mean()
average_speed_last_30d = 0 if np.isnan(result) else result

In [None]:
# Both values are scalars, so every segment row of the route gets the same value.
segments_alpe_df['time_activities_last_30d'] = time_activities_last_30d
segments_alpe_df['average_speed_last_30d'] = average_speed_last_30d
segments_alpe_df

**Home Trainer**

In [None]:
# Flag every segment of the route as a virtual (home-trainer) ride.
segments_alpe_df['type_virtualride'] = 1
segments_alpe_df

**Climb Category**

In [None]:
# Derive a climb category per segment from the score average_grade * distance.
# Each branch is only reached when the previous upper bound was exceeded, so
# the explicit lower bounds of the original chain are implied.
# NOTE(review): if the score is NaN none of the branches match and `climb_cat`
# silently keeps the previous segment's value (NameError on the first row).
# NOTE(review): the numbering gives 4 to the lowest categorised band, 1 to the
# 64000-80000 band and 5 above 80000 — confirm it matches the encoding of
# `climb_category` in the training data.
climb_category = []
for average_grade, distance in zip(segments_alpe_df['average_grade'], segments_alpe_df['distance']):
    result = average_grade * distance
    if result <= 8000:
        climb_cat = 0
    elif result <= 16000:
        climb_cat = 4
    elif result <= 32000:
        climb_cat = 3
    elif result <= 64000:
        climb_cat = 2
    elif result <= 80000:
        climb_cat = 1
    elif result > 80000:
        climb_cat = 5
    climb_category.append(climb_cat)
climb_category

In [None]:
# Attach the per-segment categories computed in the previous cell.
segments_alpe_df['climb_category'] = climb_category
segments_alpe_df

**Days since last activity**

In [None]:
# Whole days between today and the activity stored under row label 0.
# NOTE(review): this assumes row 0 of activities_df is the most recent
# activity — confirm the frame is sorted newest-first.
last_activity_date = activities_df.loc[0, 'start_date']
result = date.today() - last_activity_date
segments_alpe_df['days_since_last_activity'] = result.days
segments_alpe_df

**Avg climb category last 30d**

In [None]:
# Mean climb category of the segments ridden inside the look-back window.
recent_segments = segments_df[segments_df['start_date'].between(start_date, end_date)]
average_climbcat_last_30d = recent_segments['climb_category'].mean()
# An empty window gives NaN, which would be passed to the models as a feature;
# fall back to 0, the same convention used for average_speed_last_30d.
if np.isnan(average_climbcat_last_30d):
    average_climbcat_last_30d = 0
segments_alpe_df['average_climbcat_last_30d'] = round(average_climbcat_last_30d, 2)

**Prediction**

In [None]:
# Feed the models exactly the features they were fitted on, in the same order,
# instead of maintaining a second hand-written copy of the list that can drift
# from the training columns.
cols_to_predict = list(X_train.columns)
datas = segments_alpe_df[cols_to_predict]
datas

In [None]:
# Per-segment log predictions -> exponentiate, sum over the route, and wrap the
# total (treated as seconds) in a relativedelta.
alpe_pred_reg = reg.predict(datas)
total_seconds_reg = np.exp(alpe_pred_reg).sum()
elapsed_time_reg = relativedelta(seconds=total_seconds_reg)
elapsed_time_reg

In [None]:
# Per-segment log predictions -> exponentiate, sum over the route, and wrap the
# total (treated as seconds) in a relativedelta.
alpe_pred_xgb = xgb_reg.predict(datas)
total_seconds_xgb = np.exp(alpe_pred_xgb).sum()
elapsed_time_xgb = relativedelta(seconds=total_seconds_xgb)
elapsed_time_xgb

In [None]:
# Per-segment log predictions -> exponentiate, sum over the route, and wrap the
# total (treated as seconds) in a relativedelta.
alpe_pred_forrest = forrest_reg.predict(datas)
total_seconds_forrest = np.exp(alpe_pred_forrest).sum()
elapsed_time_forrest = relativedelta(seconds=total_seconds_forrest)
elapsed_time_forrest

In [None]:
# Baseline duration for comparison; not defined in this notebook — presumably
# produced by naive_algorithm.ipynb run at the top (confirm).
elapsed_time_naive

In [None]:
def format_hms_relativedelta(relative_delta):
    """Format a duration as ``'<h>h<m>m<s>sec'``, print it and return it.

    Args:
        relative_delta: object exposing ``days``, ``hours``, ``minutes`` and
            ``seconds`` (e.g. a ``dateutil.relativedelta.relativedelta``).
            Fractional components are truncated towards zero.

    Returns:
        The formatted string, e.g. ``'0h31m39sec'``. The same text, prefixed
        with ``'Prediction '``, is printed as a side effect.
    """
    # relativedelta carries whole multiples of 24h over into `.days`, so fold
    # them back into the hour count; otherwise a duration of 24h or more would
    # be under-reported.
    hour = int(relative_delta.days) * 24 + int(relative_delta.hours)
    minutes = int(relative_delta.minutes)
    seconds = int(relative_delta.seconds)
    print(f'Prediction {hour}h{minutes}m{seconds}sec')
    return f'{hour}h{minutes}m{seconds}sec'

In [None]:
def compute_kmh(time, distance):
    """Return ``distance / time`` scaled by 3.6.

    With ``distance`` in metres and ``time`` in seconds this is a speed in
    km/h (units are not checked here).
    """
    metres_per_second = distance / time
    return metres_per_second * 3.6

In [None]:
def compute_pred_seconds(pred):
    """Exponentiate log-scale predictions, sum them, and round to 2 decimals."""
    per_segment = np.exp(pred)
    total = per_segment.sum()
    return round(total, 2)

In [None]:
# Total route length: sum of the per-segment distances.
# Presumably metres, given the x3.6 conversion to km/h used below — confirm.
distance = segments_alpe_df['distance'].sum()
distance

In [None]:
# Reference row labelled 'strava': 1899 s is 31 min 39 s, matching the
# hard-coded '0h31m39s' string.
strava_seconds = 1899

# Row order everywhere below: naive, linreg, xgbreg, forrest, strava.
predicted_durations = [
    elapsed_time_naive,
    elapsed_time_reg,
    elapsed_time_xgb,
    elapsed_time_forrest,
]
model_seconds = [
    compute_pred_seconds(log_pred)
    for log_pred in (alpe_pred_reg, alpe_pred_xgb, alpe_pred_forrest)
]
# elapsed_time_pred_naive_sec is not defined in this notebook; presumably it
# comes from naive_algorithm.ipynb run at the top — confirm.
all_seconds = [elapsed_time_pred_naive_sec, *model_seconds, strava_seconds]

data = {
    # format_hms_relativedelta also prints each formatted prediction.
    'Prediction': [format_hms_relativedelta(duration) for duration in predicted_durations] + ['0h31m39s'],
    'Mean_speed(km/h)': [compute_kmh(seconds, distance) for seconds in all_seconds],
}

comparaison_pred_df = pd.DataFrame(data, index=['naive', 'linreg', 'xgbreg', 'forrest', 'strava'])
comparaison_pred_df