# Linear models

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, ElasticNetCV, ElasticNet
from sklearn.pipeline import make_pipeline
import datetime

from helpers import per_station_models_cross_val_mean, load_all_processed_data, ManualFeatureSelector

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [78]:
import glob
import os

# The training frame comes from the project helper.
df = load_all_processed_data()

# Alternative considered (disabled): when bikes_3h_ago is missing, assume the count is unchanged.
# df['bikes_3h_ago'] = df['bikes_3h_ago'].fillna(df['bikes'])

# Fill gaps in the lagged/profile features with the mean of the same (station, weekhour) group.
for column in ('bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes'):
    df[column] = (
        df.groupby(['station', 'weekhour'])[column]
        .transform(lambda group: group.fillna(group.mean()))
    )

In [35]:
def single_model_regr(lin_model=None):
    """Build the pipeline used for the single (all-stations) model.

    Parameters
    ----------
    lin_model : estimator, optional
        Final regression step of the pipeline (e.g. ``Lasso(alpha=0.01)``).
        When omitted, a fresh ``LinearRegression()`` is created for this call.

    Returns
    -------
    The result of ``make_pipeline(ManualFeatureSelector(features), lin_model)``.
    """
    # The argument used to be ignored in favour of a hard-coded LinearRegression, so
    # Lasso/Ridge models passed by callers never reached the pipeline.
    if lin_model is None:
        # Created per call so that pipelines never share one estimator instance.
        lin_model = LinearRegression()
    # Columns handed to ManualFeatureSelector (presumably the only ones the model sees — verify in helpers).
    features = ['bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes', 'weekhour', 'temperature.C', 'windMeanSpeed.m.s', 'relHumidity.HR', 'airPressure.mb', 'hour', 'day', 'windMaxSpeed.m.s']
    return make_pipeline(ManualFeatureSelector(features), lin_model)

def per_station_model_regr(lin_model=None):
    """Build the pipeline used for each per-station model.

    Parameters
    ----------
    lin_model : estimator, optional
        Final regression step of the pipeline. When omitted, a fresh
        ``LinearRegression()`` is created for this call.

    Returns
    -------
    The result of ``make_pipeline(ManualFeatureSelector(features), lin_model)``.
    """
    if lin_model is None:
        # A default of ``LinearRegression()`` in the signature is evaluated only once, so
        # every default pipeline would wrap (and refit) the same estimator object.
        lin_model = LinearRegression()
    # Columns handed to ManualFeatureSelector (presumably the only ones the model sees — verify in helpers).
    features = ['bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes', 'weekhour', 'temperature.C', 'windMeanSpeed.m.s', 'relHumidity.HR', 'airPressure.mb', 'hour', 'day', 'windMaxSpeed.m.s']
    return make_pipeline(ManualFeatureSelector(features), lin_model)

def single_model_cv_score(df, lin_model, n_splits=5, random_state=None):
    """Cross-validate the single (all-stations) pipeline and print the scores.

    Parameters
    ----------
    df : DataFrame
        Training data; the ``bikes`` column is the target, everything else is passed as X.
    lin_model : estimator
        Regression step forwarded to ``single_model_regr``.
    n_splits : int, default 5
        Number of shuffled K-fold splits.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous unseeded behaviour;
        pass an int to make the scores reproducible between runs.

    Returns
    -------
    The per-fold ``neg_mean_absolute_error`` scores returned by ``cross_val_score``
    (also printed, followed by their mean).
    """
    X = df.drop(columns=['bikes'])
    y = df['bikes']

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = cross_val_score(single_model_regr(lin_model), X, y, cv=folds, scoring='neg_mean_absolute_error')

    print(scores)
    print(scores.mean())
    return scores

def per_station_models_cv_score(df, lin_model):
    """Score the per-station pipeline built around ``lin_model`` and print the result.

    Wraps ``lin_model`` with ``per_station_model_regr`` and passes the pipeline and ``df`` to
    ``per_station_models_cross_val_mean`` (presumably a mean CV score over one model per
    station — verify in helpers). The value is printed and returned unchanged.
    """
    pipeline = per_station_model_regr(lin_model)
    mean_score = per_station_models_cross_val_mean(pipeline, df)
    print(mean_score)
    return mean_score

## Default LinearRegression

In [79]:
# Baseline: shuffled 5-fold CV (negative MAE) of the single all-stations pipeline with
# ordinary least squares. Uses single_model_cv_score instead of repeating the
# cross_val_score boilerplate; the helper prints the per-fold scores and their mean.
scores = single_model_cv_score(df, LinearRegression())

[-2.43335519 -2.43297062 -2.41752298 -2.43890457 -2.41412534]
-2.4273757388233688


In [80]:
# Per-station counterpart of the baseline; the helper prints the score it returns.
score = per_station_models_cv_score(df, LinearRegression())

-2.336942713207018


In [56]:
# Grid-search the `normalize` flag of the single-model LinearRegression pipeline
# (shuffled 5-fold CV, negative MAE), then print the CV table, best score, best
# parameters and best pipeline, one per line.
X, y = df.drop(columns=['bikes']), df['bikes']

parameters = {'linearregression__normalize': [True, False]}

search = GridSearchCV(
    single_model_regr(LinearRegression()),
    param_grid=parameters,
    cv=KFold(n_splits=5, shuffle=True),
    scoring='neg_mean_absolute_error',
)
search.fit(X, y)

print(search.cv_results_, search.best_score_, search.best_params_, search.best_estimator_, sep='\n')

{'mean_fit_time': array([0.03203363, 0.02397718]), 'std_fit_time': array([0.01541407, 0.00275771]), 'mean_score_time': array([0.00546017, 0.00459309]), 'std_score_time': array([0.00204339, 0.00096646]), 'param_linearregression__normalize': masked_array(data=[True, False],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'linearregression__normalize': True}, {'linearregression__normalize': False}], 'split0_test_score': array([-2.41291342, -2.41291342]), 'split1_test_score': array([-2.43768742, -2.43768742]), 'split2_test_score': array([-2.42807828, -2.42807828]), 'split3_test_score': array([-2.4159575, -2.4159575]), 'split4_test_score': array([-2.39189334, -2.39189334]), 'mean_test_score': array([-2.41730599, -2.41730599]), 'std_test_score': array([0.01548135, 0.01548135]), 'rank_test_score': array([1, 2], dtype=int32)}
-2.4173059920145703
{'linearregression__normalize': True}
Pipeline(steps=[('manualfeatureselector',
                 <help

In [94]:
# One grid search per station (ids 201..275) over the LinearRegression `normalize` flag;
# the fitted searches are kept by station id and summarised in the final expression.
parameters = {'linearregression__normalize': [True, False]}
searches = {}
for station in np.arange(201, 276):
    stationdf = df.loc[df['station'] == station]
    X, y = stationdf.drop(columns=['bikes']), stationdf['bikes']

    search = GridSearchCV(
        per_station_model_regr(LinearRegression()),
        param_grid=parameters,
        cv=KFold(n_splits=5, shuffle=True),
        scoring='neg_mean_absolute_error',
    )
    # fit() returns the search object itself, so this stores the fitted search.
    searches[station] = search.fit(X, y)

[
    (station, (fitted.best_params_, fitted.best_score_), fitted.cv_results_['mean_test_score'])
    for station, fitted in searches.items()
]

[(201,
  ({'linearregression__normalize': False}, -2.489021220428003),
  array([-2.48902122, -2.48902122])),
 (202,
  ({'linearregression__normalize': True}, -1.972826651576731),
  array([-1.97282665, -1.97282665])),
 (203,
  ({'linearregression__normalize': True}, -2.422635535280463),
  array([-2.42263554, -2.42263554])),
 (204,
  ({'linearregression__normalize': True}, -2.9992656910699895),
  array([-2.99926569, -2.99926569])),
 (205,
  ({'linearregression__normalize': True}, -2.252071983838278),
  array([-2.25207198, -2.25207198])),
 (206,
  ({'linearregression__normalize': False}, -2.698560427068171),
  array([-2.69856043, -2.69856043])),
 (207,
  ({'linearregression__normalize': True}, -2.0809409336928772),
  array([-2.08094093, -2.08094093])),
 (208,
  ({'linearregression__normalize': False}, -3.131838683431905),
  array([-3.13183868, -3.13183868])),
 (209,
  ({'linearregression__normalize': False}, -2.3454444195811197),
  array([-2.34544442, -2.34544442])),
 (210,
  ({'linearreg

## Lasso regression

In [19]:
# Shuffled 5-fold CV (negative MAE) of the single all-stations pipeline with Lasso(alpha=0.01).
# Uses single_model_cv_score instead of repeating the cross_val_score boilerplate; the
# helper prints the per-fold scores and their mean.
scores = single_model_cv_score(df, Lasso(alpha=0.01))

[-2.41515197 -2.40386371 -2.39888559 -2.39689721 -2.47096498]
-2.4171526916579973


In [20]:
# Per-station score with Lasso(alpha=0.01); the helper prints the score it returns.
score = per_station_models_cv_score(df, Lasso(alpha=0.01))

-2.32435340269371


In [54]:
# Grid-search the Lasso regularisation strength for the single-model pipeline
# (shuffled 5-fold CV, negative MAE), then print the CV table, best score, best
# parameters and best pipeline, one per line.
X, y = df.drop(columns=['bikes']), df['bikes']

parameters = {'lasso__alpha': [0.001, 0.003, 0.01, 0.03, 0.1, 0.5, 1.0]}

search = GridSearchCV(
    single_model_regr(Lasso()),
    param_grid=parameters,
    cv=KFold(n_splits=5, shuffle=True),
    scoring='neg_mean_absolute_error',
)
search.fit(X, y)

print(search.cv_results_, search.best_score_, search.best_params_, search.best_estimator_, sep='\n')

{'mean_fit_time': array([0.04465775, 0.03657246, 0.0259182 , 0.02039552, 0.02119589,
       0.02045512, 0.01943316]), 'std_fit_time': array([0.00500651, 0.0053983 , 0.00518419, 0.00144854, 0.00080928,
       0.00094218, 0.00178935]), 'mean_score_time': array([0.00476336, 0.00540304, 0.00547543, 0.00501003, 0.00539527,
       0.00581942, 0.00434175]), 'std_score_time': array([0.00061104, 0.00025156, 0.00039752, 0.00094059, 0.00075015,
       0.00035537, 0.00042646]), 'param_lasso__alpha': masked_array(data=[0.001, 0.003, 0.01, 0.03, 0.1, 0.5, 1.0],
             mask=[False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'lasso__alpha': 0.001}, {'lasso__alpha': 0.003}, {'lasso__alpha': 0.01}, {'lasso__alpha': 0.03}, {'lasso__alpha': 0.1}, {'lasso__alpha': 0.5}, {'lasso__alpha': 1.0}], 'split0_test_score': array([-2.39970273, -2.3997532 , -2.39990078, -2.40062037, -2.40380044,
       -2.42452229, -2.4549259 ]), 'split1_test_score':

In [89]:
# One grid search per station (ids 201..275) over Lasso `normalize` x `alpha`;
# the fitted searches are kept by station id and summarised in the final expression.
parameters = {'lasso__normalize': [True, False], 'lasso__alpha': [0.001, 0.1, 0.5, 1.0]}
searches = {}
for station in np.arange(201, 276):
    stationdf = df.loc[df['station'] == station]
    X, y = stationdf.drop(columns=['bikes']), stationdf['bikes']

    search = GridSearchCV(
        per_station_model_regr(Lasso()),
        param_grid=parameters,
        cv=KFold(n_splits=5, shuffle=True),
        scoring='neg_mean_absolute_error',
    )
    # fit() returns the search object itself, so this stores the fitted search.
    searches[station] = search.fit(X, y)

[
    (station, (fitted.best_params_, fitted.best_score_), fitted.cv_results_['mean_test_score'])
    for station, fitted in searches.items()
]

[(201,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.4933259126470233),
  array([-2.49332591, -2.49504106, -3.46654826, -2.49381545, -3.47520391,
         -2.49829214, -3.47520391, -2.51925833])),
 (202,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -1.95970658884831),
  array([-1.95970659, -1.96670218, -3.15363503, -1.96512868, -3.15363503,
         -1.97308238, -3.15363503, -2.01181815])),
 (203,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.3968794337695796),
  array([-2.39687943, -2.41640595, -3.38511138, -2.40476134, -4.03146077,
         -2.39991603, -4.03146077, -2.42138222])),
 (204,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.9865216999315125),
  array([-2.9865217 , -3.00538306, -4.27487033, -2.99387231, -6.0625337 ,
         -2.98995493, -6.0625337 , -3.00043747])),
 (205,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.220772629300091),
  array([-2.22077263, -2.22437428, -3.21293135, -2.22268126, -3.21293135,
         -2.24

In [90]:
# One grid search per station (ids 201..275) over the Lasso `alpha` only;
# the fitted searches are kept by station id and summarised in the final expression.
parameters = {'lasso__alpha': [0.001, 0.1, 0.5, 1.0]}
searches = {}
for station in np.arange(201, 276):
    stationdf = df.loc[df['station'] == station]
    X, y = stationdf.drop(columns=['bikes']), stationdf['bikes']

    search = GridSearchCV(
        per_station_model_regr(Lasso()),
        param_grid=parameters,
        cv=KFold(n_splits=5, shuffle=True),
        scoring='neg_mean_absolute_error',
    )
    # fit() returns the search object itself, so this stores the fitted search.
    searches[station] = search.fit(X, y)

[
    (station, (fitted.best_params_, fitted.best_score_), fitted.cv_results_['mean_test_score'])
    for station, fitted in searches.items()
]

[(201,
  ({'lasso__alpha': 0.1}, -2.478700997106789),
  array([-2.48372294, -2.478701  , -2.48801448, -2.50994854])),
 (202,
  ({'lasso__alpha': 0.1}, -1.9665661986956935),
  array([-1.96780939, -1.9665662 , -1.97998236, -2.01640997])),
 (203,
  ({'lasso__alpha': 0.1}, -2.388222944415845),
  array([-2.39561503, -2.38822294, -2.39273317, -2.41601396])),
 (204,
  ({'lasso__alpha': 0.5}, -3.0031885386579487),
  array([-3.0082838 , -3.00438366, -3.00318854, -3.00918664])),
 (205,
  ({'lasso__alpha': 0.001}, -2.223971336478347),
  array([-2.22397134, -2.22497123, -2.24308474, -2.28554371])),
 (206,
  ({'lasso__alpha': 0.001}, -2.7244182730554742),
  array([-2.72441827, -2.72879201, -2.74875375, -2.77543037])),
 (207,
  ({'lasso__alpha': 0.1}, -2.053448421566981),
  array([-2.06163711, -2.05344842, -2.05677077, -2.07694411])),
 (208,
  ({'lasso__alpha': 0.5}, -3.11396534801913),
  array([-3.1287442 , -3.11819014, -3.11396535, -3.11931825])),
 (209,
  ({'lasso__alpha': 0.1}, -2.34471210275864

In [92]:
# One grid search per station (ids 201..275) over Lasso `normalize` at a fixed alpha of 0.001;
# the fitted searches are kept by station id and summarised in the final expression.
parameters = {'lasso__normalize': [True, False], 'lasso__alpha': [0.001]}
searches = {}
for station in np.arange(201, 276):
    stationdf = df.loc[df['station'] == station]
    X, y = stationdf.drop(columns=['bikes']), stationdf['bikes']

    search = GridSearchCV(
        per_station_model_regr(Lasso()),
        param_grid=parameters,
        cv=KFold(n_splits=5, shuffle=True),
        scoring='neg_mean_absolute_error',
    )
    # fit() returns the search object itself, so this stores the fitted search.
    searches[station] = search.fit(X, y)

[
    (station, (fitted.best_params_, fitted.best_score_), fitted.cv_results_['mean_test_score'])
    for station, fitted in searches.items()
]

[(201,
  ({'lasso__alpha': 0.001, 'lasso__normalize': False}, -2.4855195228538323),
  array([-2.48642566, -2.48551952])),
 (202,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -1.9616915564373614),
  array([-1.96169156, -1.9693826 ])),
 (203,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.4021379024965697),
  array([-2.4021379 , -2.41887106])),
 (204,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.9855313278638316),
  array([-2.98553133, -3.00443653])),
 (205,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.229654824739949),
  array([-2.22965482, -2.23502675])),
 (206,
  ({'lasso__alpha': 0.001, 'lasso__normalize': False}, -2.700522523131986),
  array([-2.70379384, -2.70052252])),
 (207,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -2.0747680164010696),
  array([-2.07476802, -2.08219459])),
 (208,
  ({'lasso__alpha': 0.001, 'lasso__normalize': True}, -3.1131672750624313),
  array([-3.11316728, -3.12880809])),
 (209,
  ({'lasso__alpha': 0.001

### Normalization only helps when regularization is weak, and even then the CV score barely changes

## Ridge regression

In [25]:
# Shuffled 5-fold CV (negative MAE) of the single all-stations pipeline with Ridge(alpha=1.0).
# Uses single_model_cv_score instead of repeating the cross_val_score boilerplate; the
# helper prints the per-fold scores and their mean.
scores = single_model_cv_score(df, Ridge(alpha=1.0))

[-2.45740156 -2.38552856 -2.42225442 -2.40877384 -2.41175036]
-2.4171417479861765


In [31]:
# Per-station score with Ridge(alpha=10); the helper prints the score it returns.
score = per_station_models_cv_score(df, Ridge(alpha=10))

-2.32453362881079


In [59]:
# Grid-search Ridge `normalize` x `alpha` for the single-model pipeline
# (shuffled 5-fold CV, negative MAE), then print the CV table, best score, best
# parameters and best pipeline, one per line.
X, y = df.drop(columns=['bikes']), df['bikes']

parameters = {'ridge__normalize': [True, False], 'ridge__alpha': [0.001, 0.003, 0.01, 0.03, 0.1, 0.5, 1.0]}

search = GridSearchCV(
    single_model_regr(Ridge()),
    param_grid=parameters,
    cv=KFold(n_splits=5, shuffle=True),
    scoring='neg_mean_absolute_error',
)
search.fit(X, y)

print(search.cv_results_, search.best_score_, search.best_params_, search.best_estimator_, sep='\n')

{'mean_fit_time': array([0.02421355, 0.02090359, 0.0227541 , 0.02311668, 0.02376261,
       0.02217765, 0.02465906, 0.02134199, 0.02366214, 0.02264829,
       0.02474718, 0.02074552, 0.02331409, 0.02199955]), 'std_fit_time': array([0.00256751, 0.00266658, 0.00208855, 0.00126306, 0.00134789,
       0.00180261, 0.00087475, 0.00124641, 0.00043195, 0.00186652,
       0.00105898, 0.00148889, 0.00168908, 0.00169786]), 'mean_score_time': array([0.00473709, 0.00449462, 0.00517917, 0.00492377, 0.00502081,
       0.00472555, 0.00540304, 0.00456939, 0.00497251, 0.00503812,
       0.00512018, 0.00423212, 0.00473614, 0.00461392]), 'std_score_time': array([0.00098368, 0.00067206, 0.00121417, 0.00040382, 0.00092221,
       0.00072844, 0.00055938, 0.00052894, 0.00053537, 0.00032306,
       0.00058239, 0.00014248, 0.00072334, 0.00067837]), 'param_ridge__alpha': masked_array(data=[0.001, 0.001, 0.003, 0.003, 0.01, 0.01, 0.03, 0.03,
                   0.1, 0.1, 0.5, 0.5, 1.0, 1.0],
             mask=[Fal

In [93]:
# One grid search per station (ids 201..275) over the Ridge `alpha`;
# the fitted searches are kept by station id and summarised in the final expression.
parameters = {'ridge__alpha': [0.001, 0.1, 0.5, 1.0]}
searches = {}
for station in np.arange(201, 276):
    stationdf = df.loc[df['station'] == station]
    X, y = stationdf.drop(columns=['bikes']), stationdf['bikes']

    search = GridSearchCV(
        per_station_model_regr(Ridge()),
        param_grid=parameters,
        cv=KFold(n_splits=5, shuffle=True),
        scoring='neg_mean_absolute_error',
    )
    # fit() returns the search object itself, so this stores the fitted search.
    searches[station] = search.fit(X, y)

[
    (station, (fitted.best_params_, fitted.best_score_), fitted.cv_results_['mean_test_score'])
    for station, fitted in searches.items()
]

[(201,
  ({'ridge__alpha': 1.0}, -2.520952721244874),
  array([-2.52098562, -2.52098236, -2.52096918, -2.52095272])),
 (202,
  ({'ridge__alpha': 1.0}, -1.9658456974672132),
  array([-1.9658558 , -1.96585479, -1.96585074, -1.9658457 ])),
 (203,
  ({'ridge__alpha': 0.001}, -2.414442304191269),
  array([-2.4144423 , -2.41444332, -2.41444744, -2.41445259])),
 (204,
  ({'ridge__alpha': 1.0}, -2.9989481796791337),
  array([-2.9989544 , -2.99895378, -2.99895129, -2.99894818])),
 (205,
  ({'ridge__alpha': 0.001}, -2.2392755963020634),
  array([-2.2392756 , -2.23927704, -2.2392829 , -2.23929021])),
 (206,
  ({'ridge__alpha': 0.001}, -2.6918911293872703),
  array([-2.69189113, -2.69189204, -2.69189572, -2.69190032])),
 (207,
  ({'ridge__alpha': 1.0}, -2.1069808094596825),
  array([-2.10698093, -2.10698092, -2.10698087, -2.10698081])),
 (208,
  ({'ridge__alpha': 1.0}, -3.161053132723411),
  array([-3.16108237, -3.16107947, -3.16106776, -3.16105313])),
 (209,
  ({'ridge__alpha': 0.001}, -2.3356212

## Test submissions

In [42]:
def single_model_submission(model, df, filename):
    """Fit ``model`` on ``df`` and write bounded test-set predictions to ``filename``.

    The test rows are read from ``test.csv`` in the working directory. ``model`` is fitted
    on every column of ``df`` except ``bikes`` (the target) and then predicts on the test
    frame. Predictions are floored at 0 and capped at each row's ``numDocks`` before the
    ``Id`` and ``bikes`` columns are written as CSV without the index.
    """
    test_frame = pd.read_csv('test.csv')

    predictors = df.drop(columns=['bikes'])
    target = df['bikes']
    model.fit(predictors, target)

    # Lower bound: a station cannot hold a negative number of bikes.
    non_negative = model.predict(test_frame).clip(min=0)
    test_frame['bikes'] = non_negative
    # Upper bound: the row-wise minimum of the prediction and numDocks.
    test_frame['bikes'] = test_frame[['bikes', 'numDocks']].min(axis=1)

    submission = test_frame[['Id', 'bikes']]
    submission.to_csv(filename, index=False)

In [162]:
def per_station_model_submission(model, df, filename):
    """Fit ``model`` separately per station and write bounded test predictions to ``filename``.

    The test rows are read from ``test.csv`` in the working directory. For every station
    present in the test set, ``model`` is refitted on that station's rows of ``df`` (target:
    ``bikes``) and predicts that station's test rows. Predictions are floored at 0 and
    capped at the row's ``numDocks``. The per-station results are concatenated in ascending
    station order and the ``Id`` and ``bikes`` columns are written as CSV without the index.

    Raises
    ------
    ValueError
        If a station in the test set has no training rows in ``df``.
    """
    dft = pd.read_csv('test.csv')

    predictions_dfs = []

    # Take the stations from the test set instead of a hard-coded 201..275 range: the
    # fixed range silently omitted test rows of any other station and tried to predict
    # on an empty frame when one of the 75 stations was absent from the test set.
    for station in np.sort(dft['station'].dropna().unique()):
        stationdf = df[df['station'] == station]
        if stationdf.empty:
            raise ValueError(f'No training rows for station {station}; cannot fit a model for it')
        # Copy so the per-station slice can be given a new column without touching dft.
        test_stationdf = dft[dft['station'] == station].copy()

        X = stationdf.drop(columns=['bikes'])
        y = stationdf['bikes']

        # The same estimator object is refitted for every station.
        model = model.fit(X, y)
        test_stationdf['bikes'] = model.predict(test_stationdf).clip(min=0)
        test_stationdf['bikes'] = test_stationdf[['bikes', 'numDocks']].min(axis=1)
        predictions_dfs.append(test_stationdf[['Id', 'bikes']])

    predictions_df = pd.concat(predictions_dfs)

    predictions_df.to_csv(filename, index=False)

In [157]:
# Write one single-model Lasso submission per regularisation strength.
for alpha in [0.01, 1.0]:
    # `datetime` is imported as a module (`import datetime`), so the class is
    # datetime.datetime; the bare `datetime.now()` raised AttributeError.
    # The day/hour/minute/second stamp keeps repeated runs from overwriting each other.
    time = datetime.datetime.now().strftime("%d%H%M%S")
    filename = f'Predictions/single_model_submission_lasso_{alpha}_clipped_{time}.csv'
    single_model_submission(single_model_regr(Lasso(alpha=alpha)), df, filename)

In [158]:
# Write one single-model Ridge submission per regularisation strength.
for alpha in [0.01, 0.1, 1.0]:
    # `datetime` is imported as a module (`import datetime`), so the class is
    # datetime.datetime; the bare `datetime.now()` raised AttributeError.
    # The day/hour/minute/second stamp keeps repeated runs from overwriting each other.
    time = datetime.datetime.now().strftime("%d%H%M%S")
    filename = f'Predictions/single_model_submission_ridge_{alpha}_clipped_{time}.csv'
    single_model_submission(single_model_regr(Ridge(alpha=alpha)), df, filename)

In [163]:
# Single-model submission with LinearRegression(normalize=True).
# `datetime` is imported as a module (`import datetime`), so the class is
# datetime.datetime; the bare `datetime.now()` raised AttributeError.
time = datetime.datetime.now().strftime("%d%H%M%S")
filename = f'Predictions/single_model_submission_linreg_normalized_clipped_{time}.csv'
single_model_submission(single_model_regr(LinearRegression(normalize=True)), df, filename)

In [81]:
# Disabled loader, kept for reference.
# NOTE(review): presumably this is where `df3` (used by later cells) came from — confirm.
# from helpers import load_all_phase3_data
# df3 = load_all_phase3_data()
# Single-model submission with LinearRegression(normalize=False); the
# day/hour/minute/second stamp keeps repeated runs from overwriting each other.
time = datetime.datetime.now().strftime("%d%H%M%S")
filename = f'Predictions/single_model_submission_linreg_unnormalized_clipped_{time}.csv'
single_model_submission(single_model_regr(LinearRegression(normalize=False)), df, filename)

In [154]:
# Write one per-station Lasso submission per regularisation strength.
for alpha in [0.01, 1.0]:
    # `datetime` is imported as a module (`import datetime`), so the class is
    # datetime.datetime; the bare `datetime.now()` raised AttributeError.
    # The day/hour/minute/second stamp keeps repeated runs from overwriting each other.
    time = datetime.datetime.now().strftime("%d%H%M%S")
    filename = f'Predictions/per_station_models_submission_lasso_{alpha}_clipped_{time}.csv'
    per_station_model_submission(per_station_model_regr(Lasso(alpha=alpha)), df, filename)

In [155]:
# Write one per-station Ridge submission per regularisation strength.
for alpha in [0.01, 0.1, 1.0]:
    # `datetime` is imported as a module (`import datetime`), so the class is
    # datetime.datetime; the bare `datetime.now()` raised AttributeError.
    # The day/hour/minute/second stamp keeps repeated runs from overwriting each other.
    time = datetime.datetime.now().strftime("%d%H%M%S")
    filename = f'Predictions/per_station_models_submission_ridge_{alpha}_clipped_{time}.csv'
    per_station_model_submission(per_station_model_regr(Ridge(alpha=alpha)), df, filename)

In [165]:
# Per-station submission with LinearRegression(normalize=True).
# `datetime` is imported as a module (`import datetime`), so the class is
# datetime.datetime; the bare `datetime.now()` raised AttributeError.
time = datetime.datetime.now().strftime("%d%H%M%S")
filename = f'Predictions/per_station_models_submission_linreg_normalized_clipped_{time}.csv'
per_station_model_submission(per_station_model_regr(LinearRegression(normalize=True)), df, filename)

In [166]:
# Per-station submission with LinearRegression(normalize=False).
# `datetime` is imported as a module (`import datetime`), so the class is
# datetime.datetime; the bare `datetime.now()` raised AttributeError.
time = datetime.datetime.now().strftime("%d%H%M%S")
filename = f'Predictions/per_station_models_submission_linreg_unnormalized_clipped_{time}.csv'
per_station_model_submission(per_station_model_regr(LinearRegression(normalize=False)), df, filename)

In [63]:
# Load a previously written single-model submission; the file name carries the stamp of one specific run.
another = pd.read_csv('Predictions/single_model_submission_linreg_unnormalized_clipped_11133937.csv')

In [82]:
# Load a second run of the same submission type, compared against `another` below.
original = pd.read_csv('Predictions/single_model_submission_linreg_unnormalized_clipped_11134726.csv')

In [84]:
# Largest absolute difference in predicted bikes between the two files (rows aligned by index).
(another['bikes'] - original['bikes']).abs().max()

1.2434497875801753e-13

In [55]:
# Load a third submission file.
# NOTE(review): the name suggests it was produced from the phase-3 data — confirm.
anotherp3 = pd.read_csv('Predictions/single_model_submission_linreg_unnormalized_clipped_11133615.csv')

In [64]:
# Largest absolute difference in predicted bikes between the two files (rows aligned by index).
(another['bikes'] - anotherp3['bikes']).abs().max()

8.526512829121202e-14

In [57]:
# Same column list as the one inside single_model_regr / per_station_model_regr,
# repeated at module level so the display cells below can select these columns.
features = ['bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes', 'weekhour', 'temperature.C', 'windMeanSpeed.m.s', 'relHumidity.HR', 'airPressure.mb', 'hour', 'day', 'windMaxSpeed.m.s']


In [59]:
# Sort in place by station, then timestamp, so df3 and df can be compared row by row.
# NOTE(review): no active code visible in this notebook defines `df3` (its loader is
# commented out in an earlier cell); on a fresh kernel this raises NameError — confirm its origin.
df3.sort_values(['station', 'timestamp'], inplace=True)

In [60]:
# Sort the training frame in place by station, then timestamp. This mutates `df`, so any
# cell run afterwards sees the reordered rows.
df.sort_values(['station', 'timestamp'], inplace=True)

In [76]:
# Display the key columns plus the model features of the training frame.
df[['station', 'timestamp']+features]

Unnamed: 0,station,timestamp,bikes_3h_ago,full_profile_3h_diff_bikes,full_profile_bikes,weekhour,temperature.C,windMeanSpeed.m.s,relHumidity.HR,airPressure.mb,hour,day,windMaxSpeed.m.s
48360,201,1.412114e+09,1.0,4.222222,3.229167,49,21.3,3.2,85.0,855.3,0,1,11.3
48361,201,1.412118e+09,0.0,-0.222222,1.666667,50,21.1,0.0,86.0,1000.6,1,1,1.6
48362,201,1.412122e+09,0.0,-2.277778,1.604167,51,20.9,0.0,86.0,880.6,2,1,1.6
48363,201,1.412125e+09,1.0,-1.833333,1.395833,52,20.4,0.0,88.0,859.8,3,1,0.0
48364,201,1.412129e+09,0.0,-0.541667,1.125000,53,20.3,3.2,87.0,898.1,4,1,6.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8179,275,1.414778e+09,15.0,-0.500000,7.000000,116,20.2,4.8,80.0,1023.5,19,31,9.7
8180,275,1.414782e+09,15.0,1.750000,8.750000,117,20.2,1.6,82.0,1023.9,20,31,6.4
8181,275,1.414786e+09,15.0,3.250000,10.500000,118,20.0,0.0,83.0,1024.0,21,31,3.2
8182,275,1.414789e+09,15.0,3.000000,10.000000,119,19.3,0.0,84.0,1024.4,22,31,0.0


In [77]:
# Display the same columns of df3 for a side-by-side comparison with df.
df3[['station', 'timestamp']+features]

Unnamed: 0,station,timestamp,bikes_3h_ago,full_profile_3h_diff_bikes,full_profile_bikes,weekhour,temperature.C,windMeanSpeed.m.s,relHumidity.HR,airPressure.mb,hour,day,windMaxSpeed.m.s
10416,201,1.412114e+09,2.00,4.222222,3.229167,49,21.3,3.2,85.0,855.3,0,1,11.3
10417,201,1.412118e+09,4.50,-0.222222,1.666667,50,21.1,0.0,86.0,1000.6,1,1,1.6
10418,201,1.412122e+09,4.75,-2.277778,1.604167,51,20.9,0.0,86.0,880.6,2,1,1.6
10419,201,1.412125e+09,1.00,-1.833333,1.395833,52,20.4,0.0,88.0,859.8,3,1,0.0
10420,201,1.412129e+09,0.00,-0.541667,1.125000,53,20.3,3.2,87.0,898.1,4,1,6.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34963,275,1.414778e+09,15.00,-0.500000,7.000000,116,20.2,4.8,80.0,1023.5,19,31,9.7
34964,275,1.414782e+09,15.00,1.750000,8.750000,117,20.2,1.6,82.0,1023.9,20,31,6.4
34965,275,1.414786e+09,15.00,3.250000,10.500000,118,20.0,0.0,83.0,1024.0,21,31,3.2
34966,275,1.414789e+09,15.00,3.000000,10.000000,119,19.3,0.0,84.0,1024.4,22,31,0.0
