In [1]:
import inspect
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

**Формулировка на простом языке:** 
> Необходимо сделать предсказание количества совершаемых поездок в зависимости от погодных условий.

**Формулировка на математическом языке:** 
> Ставится задача регрессии для количества совершаемых поездок в зависимости от погодных условий.

In [2]:
# Relative path of the directory that get_data() reads trips.csv and weather.csv from.
DATA_DIR = "../data/processed/"

In [3]:
def _read_csv_skipping_bad_lines(path, **read_csv_kwargs):
    """Read a CSV with pandas, dropping malformed rows instead of raising.

    pandas 1.3 introduced ``on_bad_lines="skip"`` as the replacement for
    ``error_bad_lines=False`` and pandas 2.0 removed the old keyword, so the
    keyword is chosen according to what the installed ``read_csv`` accepts.
    """
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        return pd.read_csv(path, on_bad_lines="skip", **read_csv_kwargs)
    return pd.read_csv(path, error_bad_lines=False, **read_csv_kwargs)


def get_data(DATA_DIR):
    """Build the modelling table: number of trips per day joined with weather.

    Parameters
    ----------
    DATA_DIR : str or os.PathLike
        Directory containing ``trips.csv`` and ``weather.csv``. A trailing
        path separator is optional.

    Returns
    -------
    pandas.DataFrame
        One row per date present in both files. The first column is
        ``trips_counter`` (trips started on that date); the remaining columns
        are the weather columns without ``Date`` and ``Max_Gust_Speed_MPH``.
        Missing ``Events`` values become ``'Nothing'``; missing values in all
        other weather columns become ``0``.
    """
    # os.path.join keeps the path valid whether or not DATA_DIR ends with a separator.
    trips = _read_csv_skipping_bad_lines(os.path.join(DATA_DIR, "trips.csv"),
                                         index_col=0)
    weather = pd.read_csv(os.path.join(DATA_DIR, "weather.csv"))

    trips['starttime'] = pd.to_datetime(trips['starttime'])
    trips['stoptime'] = pd.to_datetime(trips['stoptime'])
    # Truncate the start timestamp to its calendar day so it matches weather['Date'].
    trips['Date'] = pd.to_datetime(trips['starttime'].dt.date)
    weather['Date'] = pd.to_datetime(weather['Date'])

    num_trips_per_day = (trips.groupby('Date')
                              .size()
                              .reset_index(name='trips_counter'))
    # Inner join: only dates present in both tables survive.
    data = num_trips_per_day.merge(weather, on='Date')

    data['Events'] = data['Events'].fillna('Nothing')
    # Max_Gust_Speed_MPH is dropped because it has many missing values.
    data = data.drop(columns=['Date', 'Max_Gust_Speed_MPH'])

    # Zero-fill every remaining feature column (column 0 is the target).
    # 'Events' is skipped because it was already filled above.
    zero_fill = {col: 0 for col in data.columns[1:] if col != 'Events'}
    data = data.fillna(zero_fill)
    return data

Выберем RMSE в качестве метрики (имеет ту же размерность, что и исходные данные, в отличие от MSE).

In [4]:
def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    This is the square root of ``mean_squared_error`` applied to the same
    arguments, so the result is expressed in the units of the target.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [5]:
# Load the daily trip counts joined with weather, then preview the first five rows.
data = get_data(DATA_DIR)
data.head()

Unnamed: 0,trips_counter,Max_Temperature_F,Mean_Temperature_F,Min_TemperatureF,Max_Dew_Point_F,MeanDew_Point_F,Min_Dewpoint_F,Max_Humidity,Mean_Humidity,Min_Humidity,Max_Sea_Level_Pressure_In,Mean_Sea_Level_Pressure_In,Min_Sea_Level_Pressure_In,Max_Visibility_Miles,Mean_Visibility_Miles,Min_Visibility_Miles,Max_Wind_Speed_MPH,Mean_Wind_Speed_MPH,Precipitation_In,Events
0,409,71,62.0,54,55,51,46,87,68,46,30.03,29.79,29.65,10,10,4,13,4,0.0,Rain
1,491,63,59.0,55,52,51,50,88,78,63,29.84,29.75,29.54,10,9,3,10,5,0.11,Rain
2,313,62,58.0,54,53,50,46,87,77,67,29.98,29.71,29.51,10,9,3,18,7,0.45,Rain
3,395,71,61.0,52,49,46,42,83,61,36,30.03,29.95,29.81,10,10,10,9,4,0.0,Rain
4,294,64,60.0,57,55,51,41,87,72,46,29.83,29.78,29.73,10,10,6,8,3,0.14,Rain


In [6]:
# One-hot encode the categorical `Events` column. It is selected by name rather
# than by position, and the empty prefix keeps the dummy columns named after the
# event values themselves. The guard makes the cell safe to re-run: once
# `Events` has been encoded, executing the cell again leaves `data` unchanged
# instead of one-hot encoding whichever column happens to be last.
if 'Events' in data.columns:
    data = pd.get_dummies(data, columns=['Events'], prefix='', prefix_sep='')

Попробуем применить линейную регрессию, но не простую, а Elastic Net.

In [7]:
# Column 0 is the regression target; every other column is a feature.
# 20% of the rows are held out for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, 1:],
    data.iloc[:, 0],
    random_state=44,
    test_size=0.2,
)

Посмотрим на размер тренировочной/тестовой выборки.

In [8]:
# (rows, columns) of the training and test feature matrices, in that order.
tuple(part.shape for part in (X_train, X_test))

((551, 28), (138, 28))

In [9]:
# Standardise the features, then fit an Elastic Net on the scaled values.
# The step names ('scaler', 'model') are the prefixes used to address
# hyper-parameters inside the pipeline, e.g. 'model__alpha'.
model = ElasticNet(max_iter=5000)
steps = [
    ('scaler', StandardScaler()),
    ('model', model),
]
pipeline = Pipeline(steps=steps)

In [10]:
# Hyper-parameter grid for the pipeline's 'model' step:
# 100 alpha values evenly spaced on [1e-5, 2] crossed with
# 50 l1_ratio values evenly spaced on [0, 1] (5000 combinations).
parameters_grid_elastic = dict(
    model__alpha=np.linspace(1e-5, 2, 100),
    model__l1_ratio=np.linspace(0, 1, 50),
)

In [11]:
# Splitter producing 5 random shuffles, each holding out 25% of the rows;
# the fixed random_state makes the splits repeatable.
ss = ShuffleSplit(
    random_state=44,
    test_size=0.25,
    n_splits=5,
)

In [12]:
# Exhaustive search over the Elastic Net grid. The score is negated MSE
# (scikit-learn maximises scores), evaluated on the splits produced by `ss`,
# with n_jobs=-1 for parallel fitting and verbose=2 for progress logging.
gs = GridSearchCV(
    pipeline,
    parameters_grid_elastic,
    cv=ss,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1,
)

In [13]:
# Run the grid search on the training split only; the test split stays unseen.
gs.fit(X_train, y=y_train)

Fitting 5 folds for each of 5000 candidates, totalling 25000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 1556 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 6084 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 11924 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 19044 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 25000 out of 25000 | elapsed:  2.5min finished


GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=44, test_size=0.25, train_size=None),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('model',
                                        ElasticNet(alpha=1.0, copy_X=True,
                                                   fit_intercept=True,
                                                   l1_ratio=0.5, max_iter=5000,
                                                   normalize=False,
                                                   positive=False,
                                                   precompute=False,
                                                   random...
       0.51020408, 0.53061224, 0.5

In [14]:
# RMSE of the best pipeline found by the grid search, measured on the held-out test split.
root_mean_squared_error(
    y_true=y_test,
    y_pred=gs.best_estimator_.predict(X_test),
)

75.65636238907558

Получили результат, близкий к тем, что мы получали у CatBoost/RandomForest, но всё-таки хуже, хотя мы и применяли подбор параметров.