In [9]:
from datetime import datetime

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score,root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

Считаем данные. Для ускорения обучения возьмём только первые 500 000 записей.

In [10]:
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
N_ROWS = 500_000  # number of leading rows kept to speed up the experiments below

data_X = pd.read_pickle('data_X.pkl.gz')[:N_ROWS]
data_y = pd.read_pickle('data_y.pkl.gz')[:N_ROWS]

# Features and targets are paired by position, so their lengths must agree.
assert len(data_X) == len(data_y), "data_X and data_y have different numbers of rows"
data_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       500000 non-null  float32
 1   1       500000 non-null  float32
 2   2       500000 non-null  float32
 3   3       500000 non-null  float32
 4   4       500000 non-null  float32
 5   5       500000 non-null  float32
 6   6       500000 non-null  float32
 7   10      500000 non-null  float32
 8   11      500000 non-null  float32
 9   13      500000 non-null  float32
 10  16      500000 non-null  float32
 11  20      500000 non-null  float32
 12  21      500000 non-null  float32
 13  25      500000 non-null  float32
 14  26      500000 non-null  float32
dtypes: float32(15)
memory usage: 28.6 MB


Далее разобьём выборку на обучающую и тестовую части в соотношении 80 % / 20 %.

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_y,
    test_size=0.2,
    random_state=42,
)

Первым из алгоритмов мы взглянем на RandomForestRegressor

In [28]:
# Random forest with 100 trees; n_jobs=-1 uses all available CPU cores.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, verbose=1,n_jobs=-1)

# Time only the fit call.
start_time = datetime.now()
# Flatten the target to 1-D so scikit-learn does not emit its column-vector
# DataConversionWarning. Assumes y_train has a single target column — TODO confirm.
rf_regressor.fit(X_train, np.ravel(y_train))
end_time = datetime.now()

# Predict on the held-out test split
y_pred = rf_regressor.predict(X_test)

# Evaluate the model (the metric is RMSE, so the variable is named accordingly)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Duration: {}'.format(end_time - start_time))
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")


  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.3s


Duration: 0:01:57.836109
Root Mean Squared Error: 687.70
R^2 Score: 0.66


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.8s finished


Следующий на очереди CatBoostRegressor

In [29]:
# CatBoost with 100 boosting iterations and trees of depth 10; verbose=0 silences training logs.
model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=10, verbose=0)

# Time only the fit call.
start_time = datetime.now()
model.fit(X_train, y_train)
end_time = datetime.now()

# Predict on the held-out test split
y_pred = model.predict(X_test)

# Evaluate the model (the metric is RMSE, so the variable is named accordingly)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Duration: {}'.format(end_time - start_time))
print(f" Root Mean Squared Error: {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Duration: 0:00:05.476436
 Root Mean Squared Error: 675.27
R^2 Score: 0.67


Далее мы посмотрим на XGBRegressor

In [30]:
# XGBoost with 100 shallow trees (max_depth=3); XGBRegressor is imported in the
# notebook's top import cell.
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Time only the fit call.
start_time = datetime.now()
model.fit(X_train, y_train)
end_time = datetime.now()

# Predict on the held-out test split
y_pred = model.predict(X_test)

# Evaluate the model (the metric is RMSE, so the variable is named accordingly)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Duration: {}'.format(end_time - start_time))
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Duration: 0:00:00.789867
Root Mean Squared Error: 676.87
R^2 Score: 0.67


В итоге в пайплайн попадает XGBRegressor: он обучается заметно быстрее (около 0,8 с против 5,5 с у CatBoost), а его R^2 (0.67) не отличается от показателя CatBoost.

In [31]:
def pipeline(X_path, y_path, X_path_val, y_path_val, *,
             n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42):
    """Train an XGBRegressor on one dataset and score it on a validation dataset.

    Args:
        X_path: Path to the pickled training features.
        y_path: Path to the pickled training targets.
        X_path_val: Path to the pickled validation features.
        y_path_val: Path to the pickled validation targets.
        n_estimators, learning_rate, max_depth, random_state: Keyword-only
            hyperparameters forwarded to XGBRegressor; the defaults reproduce
            the previously hard-coded configuration.

    Returns:
        A tuple ``(model, [{'R2': r2, 'RMSE': rmse}])`` with the fitted model
        and its metrics on the validation data.

    Raises:
        ValueError: If both feature sets have a ``columns`` attribute and the
            validation features lack a column present in the training features.
    """
    # NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
    # only pass paths to files that come from a trusted source.
    data_X = pd.read_pickle(X_path)
    data_y = pd.read_pickle(y_path)
    data_X_val = pd.read_pickle(X_path_val)
    data_y_val = pd.read_pickle(y_path_val)

    # Align validation columns with the training columns so the model sees the
    # features in the order it was trained on; fail early with a clear message
    # if a training column is absent from the validation data.
    if hasattr(data_X, "columns") and hasattr(data_X_val, "columns"):
        missing = [col for col in data_X.columns if col not in data_X_val.columns]
        if missing:
            raise ValueError(
                f"Validation features are missing training columns: {missing}"
            )
        data_X_val = data_X_val.loc[:, data_X.columns]

    model = XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(data_X, data_y)
    y_pred = model.predict(data_X_val)

    # Evaluate the model on the validation data
    rmse = root_mean_squared_error(data_y_val, y_pred)
    r2 = r2_score(data_y_val, y_pred)

    return model,[{'R2': r2, 'RMSE': rmse}]
    

Запустим наш пайплайн

In [32]:
# Train on the full training files and score on the separate validation files.
result = pipeline(
    X_path='data_X.pkl.gz',
    y_path='data_y.pkl.gz',
    X_path_val='data_X_val.pkl.gz',
    y_path_val='data_y_val.pkl.gz',
)
result

(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...),
 [{'R2': -0.6415066719055176, 'RMSE': 1513.1239013671875}])

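Увы, итоговая точность оставляет желать лучшего: на валидационной выборке R^2 отрицателен (−0.64), то есть модель предсказывает хуже, чем константа, равная среднему значению целевой переменной, а RMSE (1513) более чем вдвое выше, чем на тестовой части (677).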