In [583]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt

from scipy.stats import uniform, randint
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler

In [584]:
df_for_cv_interaction_and_dummies = pd.read_pickle("../data/df_for_cv_interaction_and_dummies.pkl")
all_var_x = [each for each in list(df_for_cv_interaction_and_dummies) if each != "count" and each != "datetime"]
X_train = df_for_cv_interaction_and_dummies[all_var_x]
Y_train = df_for_cv_interaction_and_dummies[["count"]]

In [585]:
all_numeric_to_scale = [name_var for name_var in list(X_train) if name_var[0:1] == 'N'] 
all_other_vars = [name_var for name_var in list(X_train) if name_var[0:1] != 'N'] 

In [586]:
X_train.head()

Unnamed: 0,NO_temp,NO_atemp,NO_humidity,NO_windspeed,NN_soleil,NN_tendance_1,NN_tendance_2,CN_school_0,CN_school_1,"CN_soleilQ_(-27.9,1.22]",...,CO_hour_+_CO_year_5_2011,CO_hour_+_CO_year_5_2012,CO_hour_+_CO_year_6_2011,CO_hour_+_CO_year_6_2012,CO_hour_+_CO_year_7_2011,CO_hour_+_CO_year_7_2012,CO_hour_+_CO_year_8_2011,CO_hour_+_CO_year_8_2012,CO_hour_+_CO_year_9_2011,CO_hour_+_CO_year_9_2012
0,9.84,14.395,81,0.0,-68.036186,0.418093,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9.02,13.635,80,0.0,-73.948688,0.418093,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9.02,13.635,80,0.0,-71.066026,0.418093,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.84,14.395,75,0.0,-62.016308,0.418093,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9.84,14.395,75,0.0,-50.973158,0.418093,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [587]:
all_numeric_to_scale

['NO_temp',
 'NO_atemp',
 'NO_humidity',
 'NO_windspeed',
 'NN_soleil',
 'NN_tendance_1',
 'NN_tendance_2']

In [588]:
# On scale les variables 
scaler = StandardScaler()
X_train_scaled = pd.DataFrame.from_records(scaler.fit_transform(X_train[all_numeric_to_scale]))

In [589]:
X_train_scaled.shape

(10886, 7)

In [590]:
X_train_scaled.columns = all_numeric_to_scale

In [591]:
X_train_scaled.head()

Unnamed: 0,NO_temp,NO_atemp,NO_humidity,NO_windspeed,NN_soleil,NN_tendance_1,NN_tendance_2
0,-1.333661,-1.092737,0.993213,-1.567754,-1.873567,-1.161023,-1.709123
1,-1.438907,-1.182421,0.941249,-1.567754,-2.035115,-1.161023,-1.709123
2,-1.438907,-1.182421,0.941249,-1.567754,-1.956352,-1.161023,-1.709123
3,-1.333661,-1.092737,0.68143,-1.567754,-1.709086,-1.161023,-1.709123
4,-1.333661,-1.092737,0.68143,-1.567754,-1.407354,-1.161023,-1.709123


In [592]:
X_train = pd.concat([X_train_scaled,X_train[all_other_vars]], axis = 1)

In [593]:
X_train.head()

Unnamed: 0,NO_temp,NO_atemp,NO_humidity,NO_windspeed,NN_soleil,NN_tendance_1,NN_tendance_2,CN_school_0,CN_school_1,"CN_soleilQ_(-27.9,1.22]",...,CO_hour_+_CO_year_5_2011,CO_hour_+_CO_year_5_2012,CO_hour_+_CO_year_6_2011,CO_hour_+_CO_year_6_2012,CO_hour_+_CO_year_7_2011,CO_hour_+_CO_year_7_2012,CO_hour_+_CO_year_8_2011,CO_hour_+_CO_year_8_2012,CO_hour_+_CO_year_9_2011,CO_hour_+_CO_year_9_2012
0,-1.333661,-1.092737,0.993213,-1.567754,-1.873567,-1.161023,-1.709123,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.438907,-1.182421,0.941249,-1.567754,-2.035115,-1.161023,-1.709123,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.438907,-1.182421,0.941249,-1.567754,-1.956352,-1.161023,-1.709123,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.333661,-1.092737,0.68143,-1.567754,-1.709086,-1.161023,-1.709123,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1.333661,-1.092737,0.68143,-1.567754,-1.407354,-1.161023,-1.709123,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [594]:
# Calcul des Y = log(Y + 1)
Y_train['log_count'] = Y_train['count'].map(lambda x: np.log1p(x))
Y_train = Y_train.drop("count", axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [595]:
# Hyperparamètre de régularisation
# On fait un "random avec : alpha = 0.01 to 100 
params = {"alpha": randint(0, 100)}

In [488]:
# def MSE(a,b):
    # On a eu un pb avec "axis = 0"
    # resultat = np.mean((a - b)**2, axis=0)
    # On a remplacé par :
    # resultat = np.sum((a - b)**2)/(a.shape[0])
    # Et ca marchait pas
    # CA A FONCTIONNE AVEC
    # resultat = np.mean((a - b)**2)
    # return resultat
# mse = make_scorer(MSE, greater_is_better=False)

In [596]:
lasso_model = Lasso()

In [490]:
# search_custom = RandomizedSearchCV(lasso_model,
                            # scoring = mse,
                            # param_distributions=params,
                            # random_state=42,
                            # n_iter=1,
                            # cv=2)

In [597]:
search_official = RandomizedSearchCV(lasso_model,
                            scoring="neg_mean_squared_error",
                            n_jobs = 2,
                            param_distributions=params,
                            random_state=42,
                            n_iter=20,
                            cv=5,
                            return_train_score=True)

In [578]:
search_official = RandomizedSearchCV(lasso_model,
                            scoring="neg_mean_squared_error",
                            n_jobs = 2,
                            param_distributions=params,
                            random_state=42,
                            n_iter=30,
                            cv=5,
                            return_train_score=True)

In [598]:
%%time
search_official.fit(X_train, Y_train)

CPU times: user 4 s, sys: 1.48 s, total: 5.48 s
Wall time: 3min 34s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                                   max_iter=1000, normalize=False,
                                   positive=False, precompute=False,
                                   random_state=None, selection='cyclic',
                                   tol=0.0001, warm_start=False),
                   iid='warn', n_iter=20, n_jobs=2,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f37a0ea4ef0>},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=True, scoring='neg_mean_squared_error',
                   verbose=0)

In [498]:
# Meilleur modèle avec "alpha" = 1
pd.DataFrame(search_official.cv_results_).sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
16,3.853251,0.532873,0.180556,0.0146,1,{'alpha': 1},-1.021454,-1.046168,-1.040043,-1.122841,...,-1.042223,0.046358,1,-1.02428,-1.00335,-1.00468,-0.987557,-1.021652,-1.008304,0.013428
13,3.365272,0.041652,0.174371,0.006572,2,{'alpha': 2},-1.030761,-1.046295,-1.048756,-1.128727,...,-1.048821,0.045226,2,-1.033209,-1.01182,-1.013057,-0.996485,-1.029812,-1.016876,0.013342
2,3.048855,0.065099,0.172315,0.006587,14,{'alpha': 14},-1.409847,-1.173847,-1.227547,-1.265455,...,-1.248632,0.088411,3,-1.215241,-1.18306,-1.18526,-1.168245,-1.198376,-1.190037,0.015819
5,3.040158,0.050824,0.189684,0.030378,20,{'alpha': 20},-1.725501,-1.309592,-1.399541,-1.432442,...,-1.437953,0.151003,4,-1.380038,-1.338358,-1.345019,-1.332324,-1.354002,-1.349948,0.01668
14,3.128148,0.093154,0.203957,0.057234,21,{'alpha': 21},-1.786352,-1.337284,-1.434264,-1.467468,...,-1.475812,0.162727,5,-1.413159,-1.36957,-1.377128,-1.365301,-1.38528,-1.382088,0.016963


In [600]:
pd.DataFrame(search_official.cv_results_).sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2.927109,0.132553,0.175332,0.004847,51,{'alpha': 51},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
17,2.884309,0.085892,0.172442,0.018139,87,{'alpha': 87},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
16,3.049204,0.333846,0.187869,0.017466,1,{'alpha': 1},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
15,2.866067,0.070361,0.184392,0.01882,52,{'alpha': 52},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
14,2.85752,0.057673,0.17698,0.002962,21,{'alpha': 21},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
13,3.485851,0.224634,0.200128,0.046206,2,{'alpha': 2},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
12,3.426348,0.218358,0.18914,0.013999,23,{'alpha': 23},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
11,3.352258,0.489416,0.233386,0.07933,99,{'alpha': 99},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
10,3.270917,0.372853,0.178715,0.004153,87,{'alpha': 87},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
9,3.679385,0.381559,0.235672,0.062311,74,{'alpha': 74},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951


In [505]:
# C'est le RMLSE sur "train" de la cross validation
result_model = pd.DataFrame(search_official.cv_results_)
result_model[result_model["rank_test_score"] == 1][["mean_train_score"]].values[0][0]*(-1)

1.0083038855715174

In [506]:
search_official.best_score_ * (-1)

1.04222268809796

In [580]:
result_model = pd.DataFrame(search_official.cv_results_)
result_model[result_model["rank_test_score"] == 1][["mean_train_score"]].values[0][0]*(-1)

2.0048355249329206

In [581]:
search_official.best_score_ * (-1)

2.1031157265641283

In [582]:
pd.DataFrame(search_official.cv_results_).sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2.87248,0.076601,0.174857,0.009843,51,{'alpha': 51},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
27,3.644769,0.180307,0.199428,0.013893,21,{'alpha': 21},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
26,3.894675,0.56703,0.188059,0.02299,57,{'alpha': 57},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
25,3.222509,0.361513,0.217752,0.056619,75,{'alpha': 75},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951
24,2.895415,0.066737,0.170471,0.002261,32,{'alpha': 32},-2.522337,-1.731911,-1.993927,-2.190885,...,-2.103116,0.258431,1,-1.920642,-2.086011,-2.023788,-1.982916,-2.010821,-2.004836,0.053951


In [18]:
# %%time
# model = Lasso()
# parameters = {'alpha':[0.1,0.5,1,5]}
# grid = GridSearchCV(model,parameters, cv=5)
# grid.fit(X_train, Y_train)

1027.274558544159


In [26]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,3.266942,0.399025,0.179292,0.00788,51,{'alpha': 51},-0.353322,-0.048172,-0.043802,-0.092302,...,-0.128257,0.114995,3,0.0,0.060751,0.048074,0.023277,0.006526,0.027726,0.023421
1,3.623007,0.198858,0.223998,0.044502,92,{'alpha': 92},-0.353322,-0.00857,-0.041142,-0.12653,...,-0.131021,0.120464,4,0.0,0.002555,0.0,0.0,0.0,0.000511,0.001022
2,3.739464,0.412501,0.216594,0.051209,14,{'alpha': 14},0.243568,0.316976,0.359028,0.349316,...,0.327326,0.045276,1,0.367274,0.43286,0.414336,0.410845,0.404036,0.40587,0.021531
3,2.989652,0.082957,0.188252,0.006649,71,{'alpha': 71},-0.353322,-0.025187,-0.042293,-0.12653,...,-0.134574,0.117044,10,0.0,0.036532,0.026125,0.0,0.0,0.012531,0.015697
4,3.016401,0.092772,0.190433,0.011403,60,{'alpha': 60},-0.353322,-0.036964,-0.043092,-0.108292,...,-0.133442,0.115353,7,0.0,0.050835,0.039088,0.012848,0.0,0.020554,0.020807
5,3.121423,0.097027,0.187988,0.015944,20,{'alpha': 20},0.074208,0.237991,0.269221,0.263453,...,0.225587,0.077105,2,0.28147,0.358413,0.335395,0.328099,0.326642,0.326004,0.025006
6,3.00121,0.078922,0.195664,0.012466,82,{'alpha': 82},-0.353322,-0.015522,-0.041571,-0.12653,...,-0.132497,0.119009,6,0.0,0.019826,0.010986,0.0,0.0,0.006162,0.008049
7,2.946474,0.128316,0.18524,0.009578,86,{'alpha': 86},-0.353322,-0.012532,-0.041327,-0.12653,...,-0.13185,0.119638,5,0.0,0.013156,0.004941,0.0,0.0,0.003619,0.005138
8,2.958719,0.129473,0.185191,0.005095,74,{'alpha': 74},-0.353322,-0.022341,-0.042089,-0.12653,...,-0.133964,0.117612,8,0.0,0.032214,0.022212,0.0,0.0,0.010885,0.013702
9,3.161443,0.189297,0.188131,0.019013,74,{'alpha': 74},-0.353322,-0.022341,-0.042089,-0.12653,...,-0.133964,0.117612,8,0.0,0.032214,0.022212,0.0,0.0,0.010885,0.013702


In [27]:
# C'est le RMLSE sur "train" de la cross validation
result_model = pd.DataFrame(search.cv_results_)
result_model[result_model["rank_test_score"] == 1][["mean_train_score"]]

Unnamed: 0,mean_train_score
2,0.40587


In [29]:
search.best_score_ 

0.3273255831633457

In [30]:
pd.DataFrame(grid_ridge.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,50.719912,3.121417,0.165485,0.030068,0.1,{'alpha': 0.1},-0.034452,-0.014439,-0.017033,-0.011893,-0.0127,-0.018105,0.008363,4
1,50.297228,1.488643,0.157237,0.01119,0.5,{'alpha': 0.5},-0.028627,-0.012281,-0.015171,-0.010058,-0.009637,-0.015156,0.007018,3
2,47.793118,0.527379,0.148534,0.007421,1.0,{'alpha': 1},-0.02573,-0.010999,-0.014187,-0.00948,-0.008395,-0.013759,0.006297,2
3,48.133455,0.410139,0.14843,0.004248,5.0,{'alpha': 5},-0.020167,-0.008149,-0.01163,-0.008803,-0.006242,-0.010999,0.0049,1


In [52]:
# Avec : parameters = {'alpha':[1, 5, 10, 15]}
pd.DataFrame(grid_ridge.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,54.198833,2.107559,0.167053,0.031623,1,{'alpha': 1},-0.64511,-0.432378,-0.481538,-0.431428,...,-0.481857,0.08442,4,-0.22486,-0.23994,-0.235906,-0.240907,-0.241911,-0.236705,0.006264
1,54.704508,2.829823,0.160618,0.020526,5,{'alpha': 5},-0.54358,-0.383305,-0.434257,-0.407561,...,-0.423166,0.066708,3,-0.229373,-0.244364,-0.240696,-0.245573,-0.246859,-0.241373,0.006343
2,57.379488,3.427663,0.163011,0.006985,10,{'alpha': 10},-0.51691,-0.366144,-0.416298,-0.402085,...,-0.406013,0.063261,2,-0.2339,-0.248817,-0.245492,-0.250243,-0.251559,-0.246002,0.00638
3,58.637168,4.03178,0.157926,0.002475,15,{'alpha': 15},-0.506536,-0.358829,-0.408961,-0.400449,...,-0.399773,0.061479,1,-0.238247,-0.253132,-0.249924,-0.254637,-0.255914,-0.25037,0.006383


In [54]:
pd.DataFrame(grid_ridge.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,104.071278,4.602311,0.199423,0.043426,15,{'alpha': 15},-0.506536,-0.358829,-0.408961,-0.400449,...,-0.399773,0.061479,2,-0.238247,-0.253132,-0.249924,-0.254637,-0.255914,-0.25037,0.006383
1,103.017336,4.135794,0.193576,0.009865,25,{'alpha': 25},-0.49835,-0.354368,-0.404909,-0.401642,...,-0.396964,0.058801,1,-0.246784,-0.261598,-0.258293,-0.263095,-0.264232,-0.2588,0.006331
2,109.90165,3.348438,0.180359,0.003729,50,{'alpha': 50},-0.495119,-0.360822,-0.411739,-0.413347,...,-0.404196,0.05371,3,-0.266877,-0.281327,-0.277213,-0.282553,-0.283415,-0.278277,0.006084
3,114.419825,4.654415,0.207904,0.053241,100,{'alpha': 100},-0.501128,-0.384872,-0.434737,-0.441195,...,-0.426063,0.046837,4,-0.300092,-0.313435,-0.30764,-0.314142,-0.314998,-0.310061,0.005615


On soumet le modèle avec alpha = 25 qui semble etre le meilleur modèle

In [60]:
# C'est le RMLSE sur "train" de la cross validation
result_model = pd.DataFrame(grid_ridge.cv_results_)
result_model[result_model["rank_test_score"] == 1][["mean_train_score"]]

Unnamed: 0,mean_train_score
1,-0.2588


In [61]:
# C'est le RMLSE sur "test" de la cross validation
grid_ridge.best_score_ 
# Avec la métrique interne à Scikit learn, on avait : -0.010999027326119631, soit ~ 0.011 pour alpha = 5
# Cette fois avec alpha = 25: -0.39696411418593996

-0.39696411418593996

In [66]:
df_to_predict = pd.read_pickle("../data/df_test_kaggle_interaction_and_dummies.pkl") 
df_to_predict.shape
df_to_predict.head()

df_temp = df_to_predict.drop("datetime", axis = 1)

df_to_predict["count"] = list((np.exp(grid_ridge.predict(df_temp)) - 1)[:,0])

In [68]:
df_to_predict[["count"]].head()

Unnamed: 0,count
0,10.987431
1,5.739557
2,3.279754
3,2.806729
4,3.335188


In [69]:
df_to_predict[["datetime", "count"]].to_csv("../data/submission_to_send/pred_linear_ridge_regression_correct_cv.csv", index= False)

In [None]:
# ROOT MEAN SQUARED LOGARITHMIC ERROR : privilégier des prédictions au-dessous de la vérité

Linear regression + New : ON N'A PAS FAIT les features polynomiaux sur les var numériques