In [57]:
from sqlalchemy import create_engine
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv

def create_connection():

    load_dotenv()
    host = os.environ.get('DB_DESTINATION_HOST')
    port = os.environ.get('DB_DESTINATION_PORT')
    db = os.environ.get('DB_DESTINATION_NAME')
    username = os.environ.get('DB_DESTINATION_USER')
    password = os.environ.get('DB_DESTINATION_PASSWORD')

    print(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    return conn

# устанавливаем соединение с базой
conn = create_connection()

postgresql://mle_20250529_e59a5780ac_freetrack:da358a544c2540e9a3115cd6c97d06de@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20250529_e59a5780ac


In [58]:
data = pd.read_sql('select * from flats_buildings_clean', conn)
data.head()

Unnamed: 0,id,floor,is_apartment,kitchen_area,living_area,rooms,studio,total_area,price,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
0,86105,17,False,11.0,23.0,1,False,49.0,12700000,2015,2,55.840538,37.493614,3.0,169,26,True
1,86106,10,False,10.0,18.0,1,False,34.200001,6250000,2002,4,55.663036,37.77943,2.75,48,10,True
2,86107,15,False,13.0,44.0,3,False,75.0,11000000,2013,4,55.700695,37.92239,2.74,320,17,True
3,86108,9,False,6.0,37.0,3,False,50.0,9300000,1967,4,55.859295,37.495724,2.64,210,9,True
4,86109,8,False,9.0,38.0,2,False,52.099998,11900000,1970,1,55.679722,37.548771,2.7,112,14,True


In [59]:
data.drop(columns=['id'], inplace=True)
data.head()

Unnamed: 0,floor,is_apartment,kitchen_area,living_area,rooms,studio,total_area,price,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
0,17,False,11.0,23.0,1,False,49.0,12700000,2015,2,55.840538,37.493614,3.0,169,26,True
1,10,False,10.0,18.0,1,False,34.200001,6250000,2002,4,55.663036,37.77943,2.75,48,10,True
2,15,False,13.0,44.0,3,False,75.0,11000000,2013,4,55.700695,37.92239,2.74,320,17,True
3,9,False,6.0,37.0,3,False,50.0,9300000,1967,4,55.859295,37.495724,2.64,210,9,True
4,8,False,9.0,38.0,2,False,52.099998,11900000,1970,1,55.679722,37.548771,2.7,112,14,True


In [64]:
# Разделение данных
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders import CatBoostEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

x = data.drop(columns='price')
y = data['price']

# Цена - непрерывный числовой показатель, нужно произвести бинирование - 
# разбиение цен на несколько групп с примерно одинаковым количеством объектов
# в нашем случае выделяем 10 групп и предотвращаем их повторение - duplicates='drop'
price_bins = pd.qcut(y, q=10)

x_tr, x_val, y_tr, y_val = train_test_split(
    x, y,
    test_size=0.2,
    stratify=price_bins,
    random_state=42
)

# Тренировочная выборка
num_features_tr = x_tr.select_dtypes(include=['float', 'int']).drop(columns=['building_type_int'])
cat_features_tr = x_tr[['building_type_int']]
binary_cat_features_tr = x_tr.select_dtypes(include='bool')

# Валидационная выборка
num_features_val = x_val.select_dtypes(include=['float', 'int']).drop(columns=['building_type_int'])
cat_features_val = x_val[['building_type_int']]
binary_cat_features_val = x_val.select_dtypes(include='bool')

binary_cols = binary_cat_features_tr.columns.tolist()
non_binary_cat_cols = cat_features_tr.columns.tolist()
num_cols = num_features_tr.columns.tolist()

# Обертка для CatBoostEncoder
class CatBoostEncoderWrapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = CatBoostEncoder()
        self.feature_names = None

    def fit(self, X, y=None):
        self.encoder.fit(X, y)
        self.feature_names = X.columns.tolist()
        return self

    def transform(self, X):
        return self.encoder.transform(X)

    def get_feature_names_out(self, input_features=None):
        return self.feature_names

# определите список трансформаций в рамках ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', OneHotEncoder(drop='if_binary', sparse_output=False), binary_cols),
        ('non_binary', CatBoostEncoderWrapper(), non_binary_cat_cols),
        ('scaler', StandardScaler(), num_cols)
    ],
    verbose_feature_names_out=False
)

# трансформируйте исходные данные data с помощью созданного preprocessor
x_tr_transformed = preprocessor.fit_transform(x_tr, y_tr)
x_val_transformed = preprocessor.transform(x_val)


model = CatBoostRegressor(
    verbose=100,
    random_seed=42
)

# Пайплайн
pipeline = Pipeline(
    [
        ('preprocessing', preprocessor),
        ('model', model)
    ]
)

# Обучение пайплайна
pipeline.fit(x_tr, y_tr)

# Предсказания
y_pred = pipeline.predict(x_val)


print('MAE:', mean_absolute_error(y_val, y_pred))
print('RMSE:', mean_squared_error(y_val, y_pred, squared=False)) 
print('R²:', r2_score(y_val, y_pred))



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  elif isinstance(cols, tuple):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  elif isinstance(cols, tuple):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype

Learning rate set to 0.086421
0:	learn: 70376413.8320816	total: 14.3ms	remaining: 14.3s
100:	learn: 60373985.1486394	total: 1.5s	remaining: 13.3s
200:	learn: 56016886.8724888	total: 2.95s	remaining: 11.7s
300:	learn: 51189571.7584705	total: 4.44s	remaining: 10.3s
400:	learn: 48140823.1825326	total: 7.26s	remaining: 10.8s
500:	learn: 45358612.3870840	total: 10.3s	remaining: 10.2s
600:	learn: 41725163.9371096	total: 11.8s	remaining: 7.85s
700:	learn: 39840485.6531369	total: 13.7s	remaining: 5.84s
800:	learn: 38344490.6209987	total: 15.7s	remaining: 3.9s
900:	learn: 37252916.6446718	total: 17.2s	remaining: 1.89s
999:	learn: 36238536.1468143	total: 18.8s	remaining: 0us
MAE: 4645647.365609936
RMSE: 29680043.58369892
R²: 0.43085404572438135


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
