# regression for speed
Author: Bujie Xu

speed = Distance/measured time

In [419]:
import numpy as np
import pandas as pd
import matplotlib
import sklearn
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr

%config InlineBackend.figure_format = 'png' 
%matplotlib inline

In [420]:
# learning curver
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, scoring="neg_mean_squared_error",cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [421]:
# validation curve
from sklearn.model_selection import validation_curve

def plot_validation_curve(estimator, title, X, y,cv=None, 
                          n_jobs = 1,param_range=np.logspace(-6, -1, 100)):
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name="alpha", param_range=param_range,
        cv=cv, scoring="neg_mean_squared_error", n_jobs = n_jobs )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title(title)
    plt.xlabel("alpha")
    plt.ylabel("Score")
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    return plt

In [513]:
# load data
train_csv = pd.read_csv(filepath_or_buffer="../data/data_train.csv",sep=";")
test_csv = pd.read_csv(filepath_or_buffer="../data/data_test.csv",sep=";")

### Speed: predict result

In [428]:
speed_test = test_csv.loc[test_csv.Speed.isnull()]
speed_test.Speed = speed_test.Distance/speed_test.Measured_time

In [429]:
speed_result = speed_test[['Id','Speed']]

In [430]:
speed_result.head()

Unnamed: 0,Id,Speed
0,1,10.134
1,2,6.063448
2,3,12.037037
3,4,5.485714
4,5,6.646154


## Part2 :Regression for Candence

## Data Preprocessing (Cadence)

In [514]:
all_data = pd.concat((train_csv, test_csv))
all_data.Speed = all_data.Speed.fillna(all_data.Distance/all_data.Measured_time)

In [515]:
all_data.Date = pd.to_datetime(all_data.Date)
all_data.Time = pd.to_datetime(all_data.Time)
all_data['Year'] = all_data.Date.apply(lambda date:date.year)
all_data['Month'] = all_data.Date.apply(lambda date:date.month)
all_data['Hour'] = all_data.Time.apply(lambda time:time.hour)

In [576]:
# remove useless columns
cadence = all_data.drop(['Index','Date','Time'], axis=1)

In [577]:
# change categorical data to dummys
cadence.RiderID = cadence.RiderID.astype('category')
cadence.Month = cadence.Month.astype('category')
cadence.Hour = cadence.Hour.astype('category')
cadence.Year = cadence.Year.astype('category')
cadence = pd.get_dummies(cadence)

In [578]:
# add new feature speed/dis
cadence['1/Measured_time'] = 1/cadence.Measured_time

In [579]:
cadence.columns

Index([u'Average_Gradient', u'Average_heart_rate', u'Cadence', u'Distance',
       u'Highest_point', u'Id', u'Lowest_point', u'Max_Gradient',
       u'Max_heart_rate', u'Measured_time', u'Moving_time', u'Power', u'Speed',
       u'RiderID_1', u'RiderID_2', u'RiderID_3', u'RiderID_4', u'RiderID_5',
       u'RiderID_6', u'RiderID_7', u'RiderID_8', u'RiderID_9', u'RiderID_10',
       u'RiderID_11', u'RiderID_12', u'RiderID_13', u'RiderID_14',
       u'RiderID_15', u'Year_2008', u'Year_2009', u'Year_2010', u'Year_2011',
       u'Year_2012', u'Year_2013', u'Year_2014', u'Year_2015', u'Year_2016',
       u'Month_1', u'Month_2', u'Month_3', u'Month_4', u'Month_5', u'Month_6',
       u'Month_7', u'Month_8', u'Month_9', u'Month_10', u'Month_11',
       u'Month_12', u'Hour_0', u'Hour_1', u'Hour_2', u'Hour_3', u'Hour_4',
       u'Hour_5', u'Hour_6', u'Hour_7', u'Hour_8', u'Hour_9', u'Hour_10',
       u'Hour_11', u'Hour_12', u'Hour_13', u'Hour_14', u'Hour_15', u'Hour_16',
       u'Hour_17', u'Hour

In [580]:
cadence_train = cadence.dropna(subset=['Cadence','Power'],how = 'any')
cadence_test = cadence.loc[cadence['Cadence'].isnull()]

In [581]:
dropList = ['Cadence','Id']
cadence_X_train, cadence_y = cadence_train.drop(dropList, axis=1), cadence_train['Cadence']
cadence_columns = cadence_X_train.columns
cadence_X_test = cadence_test.drop(dropList, axis=1)

In [582]:
cadence_X_train.shape

(120000, 72)

In [307]:
# try log transform the target
# numeric_feats = cadence_train.dtypes[cadence_train.dtypes != "object"].index

#skewed_feats = cadence_train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
#skewed_feats = skewed_feats[skewed_feats > 1]
#skewed_feats = skewed_feats.index
#skewed_feats = np.hstack((skewed_feats[1],skewed_feats[-1]))
#cadence_train[skewed_feats] = np.log1p(cadence_train[skewed_feats])
#cadence_train[['Distance','/Dist']] = np.log1p(cadence_train[['Distance','1/Dist']])

In [472]:
# from sklearn.preprocessing import MinMaxScaler
# min_max = MinMaxScaler()
# X_train = min_max.fit_transform(X_train)
# X_test = min_max.transform(X_test)

In [406]:
# from sklearn.preprocessing import StandardScaler
# x_scaler = StandardScaler()
# X_train = x_scaler.fit_transform(X_)

In [407]:
# X_train = np.hstack((X_train, riderID))

## Cadence: Models 

In [408]:
# from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, Lasso
# from sklearn.model_selection import cross_val_score

# def rmse_cv(model):
#     rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring = "neg_mean_squared_error", cv = 5))
#     return(rmse)

## RandomForestRegressor

In [583]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rfr = RandomForestRegressor()
param_grid = {'n_estimators': [500,1000], 'max_features': [0.2, 9, 12], 'min_samples_leaf':[50]}
cadence_model_randomForestRegressor = GridSearchCV(estimator=rfr, param_grid=param_grid, 
                                                   n_jobs=-1, cv=5, verbose=20, scoring="neg_mean_squared_error")
cadence_model_randomForestRegressor.fit(cadence_X_train,cadence_y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_features=0.2, n_estimators=500, min_samples_leaf=50 .........
[CV] max_features=0.2, n_estimators=500, min_samples_leaf=50 .........
[CV] max_features=0.2, n_estimators=500, min_samples_leaf=50 .........
[CV] max_features=0.2, n_estimators=500, min_samples_leaf=50 .........
[CV]  max_features=0.2, n_estimators=500, min_samples_leaf=50, score=-89.530951, total= 4.0min
[CV] max_features=0.2, n_estimators=500, min_samples_leaf=50 .........


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  4.2min


[CV]  max_features=0.2, n_estimators=500, min_samples_leaf=50, score=-212.542765, total= 4.0min
[CV] max_features=0.2, n_estimators=1000, min_samples_leaf=50 ........


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  4.2min


[CV]  max_features=0.2, n_estimators=500, min_samples_leaf=50, score=-114.006973, total= 4.1min
[CV] max_features=0.2, n_estimators=1000, min_samples_leaf=50 ........


[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  4.2min


[CV]  max_features=0.2, n_estimators=500, min_samples_leaf=50, score=-126.551440, total= 4.1min
[CV] max_features=0.2, n_estimators=1000, min_samples_leaf=50 ........


[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  4.2min


[CV]  max_features=0.2, n_estimators=500, min_samples_leaf=50, score=-129.771622, total= 4.0min
[CV] max_features=0.2, n_estimators=1000, min_samples_leaf=50 ........


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  8.3min


[CV]  max_features=0.2, n_estimators=1000, min_samples_leaf=50, score=-211.656828, total= 7.9min
[CV] max_features=0.2, n_estimators=1000, min_samples_leaf=50 ........


[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 12.4min


[CV]  max_features=0.2, n_estimators=1000, min_samples_leaf=50, score=-113.891933, total= 8.0min
[CV] max_features=9, n_estimators=500, min_samples_leaf=50 ...........


[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 12.5min


[CV]  max_features=0.2, n_estimators=1000, min_samples_leaf=50, score=-126.640830, total= 8.0min
[CV] max_features=9, n_estimators=500, min_samples_leaf=50 ...........


[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 12.5min


[CV]  max_features=9, n_estimators=500, min_samples_leaf=50, score=-112.298628, total= 2.8min
[CV] max_features=9, n_estimators=500, min_samples_leaf=50 ...........


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 15.4min


[CV]  max_features=9, n_estimators=500, min_samples_leaf=50, score=-133.753604, total= 2.8min
[CV] max_features=9, n_estimators=500, min_samples_leaf=50 ...........


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 15.4min


[CV]  max_features=0.2, n_estimators=1000, min_samples_leaf=50, score=-89.427095, total= 7.8min
[CV] max_features=9, n_estimators=500, min_samples_leaf=50 ...........


[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 16.4min


[CV]  max_features=9, n_estimators=500, min_samples_leaf=50, score=-219.572948, total= 2.7min
[CV] max_features=9, n_estimators=1000, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 18.3min


[CV]  max_features=9, n_estimators=500, min_samples_leaf=50, score=-92.124738, total= 2.8min
[CV] max_features=9, n_estimators=1000, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 18.3min


[CV]  max_features=9, n_estimators=500, min_samples_leaf=50, score=-130.917176, total= 2.7min
[CV] max_features=9, n_estimators=1000, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 19.3min


[CV]  max_features=0.2, n_estimators=1000, min_samples_leaf=50, score=-129.612506, total=53.6min
[CV] max_features=9, n_estimators=1000, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed: 66.4min


[CV]  max_features=9, n_estimators=1000, min_samples_leaf=50, score=-134.012944, total=51.5min
[CV] max_features=9, n_estimators=1000, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 70.1min


[CV]  max_features=9, n_estimators=1000, min_samples_leaf=50, score=-112.463060, total=51.4min
[CV] max_features=12, n_estimators=500, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 70.1min


[CV]  max_features=9, n_estimators=1000, min_samples_leaf=50, score=-220.104186, total=51.4min
[CV] max_features=12, n_estimators=500, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 71.1min


[CV]  max_features=9, n_estimators=1000, min_samples_leaf=50, score=-92.128625, total= 5.8min
[CV] max_features=12, n_estimators=500, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed: 72.4min


[CV]  max_features=12, n_estimators=500, min_samples_leaf=50, score=-128.388579, total= 3.5min
[CV] max_features=12, n_estimators=500, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed: 73.8min


[CV]  max_features=12, n_estimators=500, min_samples_leaf=50, score=-113.446601, total= 3.5min
[CV] max_features=12, n_estimators=500, min_samples_leaf=50 ..........


[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed: 74.7min


[CV]  max_features=9, n_estimators=1000, min_samples_leaf=50, score=-131.115660, total= 5.6min
[CV] max_features=12, n_estimators=1000, min_samples_leaf=50 .........


[Parallel(n_jobs=-1)]: Done  22 tasks      | elapsed: 76.0min


[CV]  max_features=12, n_estimators=500, min_samples_leaf=50, score=-214.900433, total= 3.4min
[CV] max_features=12, n_estimators=1000, min_samples_leaf=50 .........


[Parallel(n_jobs=-1)]: Done  23 tasks      | elapsed: 76.0min


[CV]  max_features=12, n_estimators=500, min_samples_leaf=50, score=-90.350147, total= 3.4min
[CV] max_features=12, n_estimators=1000, min_samples_leaf=50 .........
[CV]  max_features=12, n_estimators=500, min_samples_leaf=50, score=-130.351771, total= 3.5min
[CV] max_features=12, n_estimators=1000, min_samples_leaf=50 .........


[Parallel(n_jobs=-1)]: Done  25 out of  30 | elapsed: 78.4min remaining: 15.7min


[CV]  max_features=12, n_estimators=1000, min_samples_leaf=50, score=-113.471876, total=17.6min
[CV] max_features=12, n_estimators=1000, min_samples_leaf=50 .........
[CV]  max_features=12, n_estimators=1000, min_samples_leaf=50, score=-128.270946, total=17.6min


[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed: 94.0min remaining: 10.4min


[CV]  max_features=12, n_estimators=1000, min_samples_leaf=50, score=-214.393493, total=17.3min
[CV]  max_features=12, n_estimators=1000, min_samples_leaf=50, score=-90.120433, total=17.0min
[CV]  max_features=12, n_estimators=1000, min_samples_leaf=50, score=-130.172292, total= 3.8min


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 97.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [500, 1000], 'max_features': [0.2, 9, 12], 'min_samples_leaf': [50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=20)

In [584]:
cadence_model_randomForestRegressor.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [585]:
cadence_model_randomForestRegressor.best_score_

-134.24583833621844

In [586]:
cadence_model_randomForestRegressor.best_params_

{'max_features': 0.2, 'min_samples_leaf': 50, 'n_estimators': 1000}

## Cadence: Adding an xgboost model:

In [555]:
import xgboost as xgboost



### Ridge

In [409]:
# model_ridge = Ridge()
# # alphas = [1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 50, 75]
# cv_ridge = [rmse_cv(Ridge(alpha = alpha)).mean() 
#             for alpha in alphas]

In [543]:
# cv_ridge = pd.Series(cv_ridge, index = alphas)
# cv_ridge.plot(title = "Validation - Just Do It")
# plt.xlabel("alpha")
# plt.ylabel("rmse")

In [545]:
# cv_ridge.min()

In [413]:
# model_ridge = RidgeCV(alphas=[1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 50, 75], normalize=True).fit(X_train,y)

In [544]:
# rmse_cv(mode_ridge).mean()

In [546]:
# # validation curve
# title = "Validation Curve (Ridge)"
# cv = 5 
# plot_validation_curve(Ridge(), title, X_train, y, cv = 5, n_jobs= 4,
#                       param_range=[1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20, 50, 75])

In [547]:
# learning curve
# title = "Learning Curve (Ridge)"
# cv = ShuffleSplit(n_splits=10, test_size = 0.2, random_state=0)
# plot_learning_curve(model_lasso,title, X_train,y, cv = cv, n_jobs=4)

In [548]:
# coef = pd.Series(model_ridge.coef_, index=columns)
# coef


### Lasso
Let's try out the Lasso model

In [380]:
# model_lasso = LassoCV(alphas=[10,5, 1, 0.1, 0.001, 0.0005],normalize=True).fit(X_train, y)

In [549]:
# rmse_cv(model_lasso).mean()

In [550]:
# coef = pd.Series(model_lasso.coef_, index=columns)

In [551]:
# print("Lasso picked " + str(sum(coef !=0)) + " variables and eliminated the other " + 
# str(sum(coef == 0)) + " variables")

In [552]:
# coef

In [553]:
# # take a look directly at what the most important coefficients are:
# imp_coef = pd.concat([coef.sort_values().head(8),coef.sort_values().tail(8)])
# matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
# imp_coef.plot(kind = "barh")
# plt.title("Coefficients in the Lasso Model")

In [554]:
# # look residuals 
# matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
# preds = pd.DataFrame({"preds":model.predict(X_train), "true":y})
# preds["residuals"] = preds["true"] - preds["preds"]
# preds.plot(x = "preds", y = "true", kind = "scatter")
# # preds.plot(x = "preds", y = "residuals", kind = "scatter")

### Predict (Cadence)

In [487]:
X_test

array([[ 0.43695652,  0.57674051,  0.0031211 , ...,  0.        ,
         0.        ,  0.18672426],
       [ 0.36956522,  0.60601266,  0.00202475, ...,  0.        ,
         0.        ,  0.57132757],
       [ 0.36521739,  0.60917722,  0.00692547, ...,  0.        ,
         0.        ,  0.17074478],
       ..., 
       [ 0.35652174,  0.58781646,  0.00473989, ...,  0.        ,
         0.        ,  0.21959634],
       [ 0.37173913,  0.56447785,  0.00751158, ...,  0.        ,
         0.        ,  0.15131518],
       [ 0.45434783,  0.59493671,  0.00420828, ...,  0.        ,
         0.        ,  0.17371835]])

In [588]:
cadence_test.Cadence = cadence_model_randomForestRegressor.predict(cadence_X_test)
cadence_result = cadence_test[['Id','Cadence']]
cadence_result.head()

Unnamed: 0,Id,Cadence
30000,30001.0,73.165053
30001,30002.0,96.268277
30002,30003.0,91.290159
30003,30004.0,88.206047
30004,30005.0,69.46857


In [492]:
cadence_result.to_csv(path_or_buf="../data/cadence_result(randomforest).csv", index=False, sep= ';')

# Part3: Power: Data Preprocessing

In [563]:
power = cadence

In [564]:
power.head()

Unnamed: 0,Average_Gradient,Average_heart_rate,Cadence,Distance,Highest_point,Id,Lowest_point,Max_Gradient,Max_heart_rate,Measured_time,...,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,1/Measured_time
0,0.0,105.8,85.3,9980.4,128.4,,124.6,5.0,114,1375,...,0,0,0,0,0,0,0,0,0,0.000727
1,3.3,157.5,88.3,1666.01,204.6,,147.9,16.0,170,252,...,0,0,0,0,0,0,0,0,0,0.003968
2,0.0,158.9,103.4,345.6,59.0,,59.0,0.0,160,36,...,0,0,0,0,0,0,0,0,0,0.027778
3,-8.3,99.1,66.4,1572.9,617.8,,487.2,-0.5,104,123,...,0,0,0,0,0,0,0,0,0,0.00813
4,-7.6,100.3,60.3,435.352,54.4,,21.4,-4.8,111,59,...,0,0,0,0,0,0,0,0,0,0.016949


In [565]:
power_train = power.dropna(subset=['Cadence','Power'],how = 'any')
power_test = power.loc[cadence['Power'].isnull()]

In [566]:
dropList = ['Power','Id']
power_X_train, power_y = power_train.drop(dropList, axis=1), power_train['Power']
columns = power_X_train.columns
power_X_test = power_test.drop(dropList, axis=1)

In [561]:
# from sklearn.preprocessing import MinMaxScaler
# min_max = MinMaxScaler()
# X_train = min_max.fit_transform(X_train)
# X_test = min_max.transform(X_test)

In [562]:
power_X_train.shape

(120000, 72)

## Power: Models

## Power: RandomForestRegressor

In [567]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rfr = RandomForestRegressor()
param_grid = {'n_estimators': [500,1000], 'max_features': [0.2, 9, 12]}
model_randomForest = GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=4, cv=5, verbose=20, scoring="neg_mean_squared_error")
model_randomForest.fit(power_X_train, power_y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_features=0.2, n_estimators=500 ..............................
[CV] max_features=0.2, n_estimators=500 ..............................
[CV] max_features=0.2, n_estimators=500 ..............................
[CV] max_features=0.2, n_estimators=500 ..............................
[CV]  max_features=0.2, n_estimators=500, score=-1942.240303, total= 6.6min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  7.6min


[CV] max_features=0.2, n_estimators=500 ..............................
[CV]  max_features=0.2, n_estimators=500, score=-2583.795866, total= 6.7min


[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:  7.7min


[CV] max_features=0.2, n_estimators=1000 .............................
[CV]  max_features=0.2, n_estimators=500, score=-2266.025260, total= 6.7min


[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:  7.7min


[CV] max_features=0.2, n_estimators=1000 .............................
[CV]  max_features=0.2, n_estimators=500, score=-3227.246768, total= 7.1min


[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:  8.0min


[CV] max_features=0.2, n_estimators=1000 .............................
[CV]  max_features=0.2, n_estimators=500, score=-2147.185858, total=77.2min


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed: 85.4min


[CV] max_features=0.2, n_estimators=1000 .............................
[CV]  max_features=0.2, n_estimators=1000, score=-1937.219476, total=84.1min


[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed: 95.1min


[CV] max_features=0.2, n_estimators=1000 .............................
[CV]  max_features=0.2, n_estimators=1000, score=-2577.241003, total=84.7min


[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed: 95.9min


[CV] max_features=9, n_estimators=500 ................................
[CV]  max_features=0.2, n_estimators=1000, score=-3238.250975, total=85.4min


[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed: 96.7min


[CV] max_features=9, n_estimators=500 ................................
[CV]  max_features=9, n_estimators=500, score=-2008.969261, total= 6.0min


[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed: 103.0min


[CV] max_features=9, n_estimators=500 ................................
[CV]  max_features=0.2, n_estimators=1000, score=-2262.455583, total=15.9min


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed: 103.4min


[CV] max_features=9, n_estimators=500 ................................
[CV]  max_features=9, n_estimators=500, score=-3388.631239, total= 5.9min


[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed: 103.6min


[CV] max_features=9, n_estimators=500 ................................
[CV]  max_features=9, n_estimators=500, score=-2689.753793, total= 5.0min


[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed: 108.7min


[CV] max_features=9, n_estimators=1000 ...............................
[CV]  max_features=9, n_estimators=500, score=-2315.947914, total= 5.1min


[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed: 109.1min


[CV] max_features=9, n_estimators=1000 ...............................
[CV]  max_features=9, n_estimators=500, score=-2221.531676, total= 5.1min


[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed: 109.2min


[CV] max_features=9, n_estimators=1000 ...............................
[CV]  max_features=0.2, n_estimators=1000, score=-2138.286188, total=15.6min


[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed: 111.7min


[CV] max_features=9, n_estimators=1000 ...............................
[CV]  max_features=9, n_estimators=1000, score=-2007.268792, total=10.6min


[Parallel(n_jobs=4)]: Done  16 tasks      | elapsed: 122.6min


[CV] max_features=9, n_estimators=1000 ...............................
[CV]  max_features=9, n_estimators=1000, score=-2687.044098, total=11.3min


[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed: 123.9min


[CV] max_features=12, n_estimators=500 ...............................
[CV]  max_features=9, n_estimators=1000, score=-3379.005170, total=11.7min


[Parallel(n_jobs=4)]: Done  18 tasks      | elapsed: 124.2min


[CV] max_features=12, n_estimators=500 ...............................
[CV]  max_features=9, n_estimators=1000, score=-2316.783829, total=12.5min


[Parallel(n_jobs=4)]: Done  19 tasks      | elapsed: 125.7min


[CV] max_features=12, n_estimators=500 ...............................
[CV]  max_features=12, n_estimators=500, score=-1962.710125, total= 6.0min


[Parallel(n_jobs=4)]: Done  20 tasks      | elapsed: 130.7min


[CV] max_features=12, n_estimators=500 ...............................
[CV]  max_features=12, n_estimators=500, score=-3278.031296, total= 6.3min


[Parallel(n_jobs=4)]: Done  21 tasks      | elapsed: 131.2min


[CV] max_features=12, n_estimators=500 ...............................
[CV]  max_features=12, n_estimators=500, score=-2607.099277, total= 6.2min


[Parallel(n_jobs=4)]: Done  22 tasks      | elapsed: 132.5min


[CV] max_features=12, n_estimators=1000 ..............................
[CV]  max_features=9, n_estimators=1000, score=-2224.598556, total=10.6min


[Parallel(n_jobs=4)]: Done  23 tasks      | elapsed: 134.2min


[CV] max_features=12, n_estimators=1000 ..............................
[CV]  max_features=12, n_estimators=500, score=-2276.843581, total= 5.7min
[CV] max_features=12, n_estimators=1000 ..............................
[CV]  max_features=12, n_estimators=500, score=-2170.550547, total= 5.8min


[Parallel(n_jobs=4)]: Done  25 out of  30 | elapsed: 137.5min remaining: 27.5min


[CV] max_features=12, n_estimators=1000 ..............................
[CV]  max_features=12, n_estimators=1000, score=-1952.551499, total=11.8min
[CV] max_features=12, n_estimators=1000 ..............................
[CV]  max_features=12, n_estimators=1000, score=-3274.335706, total=16.1min


[Parallel(n_jobs=4)]: Done  27 out of  30 | elapsed: 152.0min remaining: 16.9min


[CV]  max_features=12, n_estimators=1000, score=-2608.122988, total=16.2min
[CV]  max_features=12, n_estimators=1000, score=-2276.296852, total=16.1min
[CV]  max_features=12, n_estimators=1000, score=-2162.756212, total=12.8min


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 159.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [500, 1000], 'max_features': [0.2, 9, 12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=20)

In [568]:
model_randomForest.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.2, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [569]:
model_randomForest.best_score_

-2430.6906451010364

In [570]:
model_randomForest.best_params_

{'max_features': 0.2, 'n_estimators': 1000}

### Power: Predict

In [573]:
power_X_test.head()

Unnamed: 0,Average_Gradient,Average_heart_rate,Cadence,Distance,Highest_point,Lowest_point,Max_Gradient,Max_heart_rate,Measured_time,Moving_time,...,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,1/Measured_time
15000,-0.4,114.8,83.1,9824.8,71.4,25.6,6.1,136,1906,1223,...,0,0,0,0,0,0,0,0,0,0.000525
15001,0.0,145.8,93.2,351.68,32.0,32.0,0.0,150,38,38,...,0,0,0,0,0,0,0,0,0,0.026316
15002,-3.7,117.0,89.6,1099.72,85.9,44.7,-0.6,137,136,136,...,0,0,0,0,0,0,0,0,0,0.007353
15003,1.7,121.4,93.0,574.7,22.0,8.1,9.7,126,75,75,...,0,0,0,0,0,0,0,0,0,0.013333
15004,0.0,112.9,93.5,1035.6,129.4,127.0,14.4,115,133,133,...,0,0,0,0,0,0,0,0,0,0.007519


In [574]:
def power_predict(model):
    power_test.Power = model.predict(power_X_test)
    power_result = power_test[['Id','Power']]
    return power_result
power_result = power_predict(model_randomForest)

In [575]:
power_result.head()

Unnamed: 0,Id,Power
15000,15001.0,123.0967
15001,15002.0,163.793
15002,15003.0,82.9308
15003,15004.0,143.4607
15004,15005.0,105.1403


# Final Result

In [589]:
speed_result = speed_result.rename(columns={'Speed': 'Predictor'})
cadence_result = cadence_result.rename(columns = {'Cadence': 'Predictor'})
power_result = power_result.rename(columns={'Power': 'Predictor'})

In [590]:
result = pd.concat((speed_result,cadence_result,power_result))

In [591]:
result.Id = result.Id.astype("int")

In [592]:
result.to_csv(path_or_buf="../submit/submit3.csv", sep=";",index=False)