In [17]:
from sqlalchemy import create_engine
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv

def create_connection():

    load_dotenv()
    host = os.environ.get('DB_DESTINATION_HOST')
    port = os.environ.get('DB_DESTINATION_PORT')
    db = os.environ.get('DB_DESTINATION_NAME')
    username = os.environ.get('DB_DESTINATION_USER')
    password = os.environ.get('DB_DESTINATION_PASSWORD')

    conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}')
    return conn

# устанавливаем соединение с базой
conn = create_connection()

In [18]:
data = pd.read_sql('select * from flats_buildings_clean', conn)
data.head()

Unnamed: 0,id,floor,is_apartment,kitchen_area,living_area,rooms,studio,total_area,price,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
0,130436,10,False,7.5,19.1,1,False,36.599998,6300000,1988,4,55.876099,37.511383,2.64,406,17,True
1,104100,13,False,14.9,72.099998,3,False,131.300003,35200000,2009,2,55.785206,37.73579,3.0,213,17,True
2,130437,8,False,8.0,45.0,3,False,76.0,16000000,2004,4,55.869358,37.527641,2.74,256,17,True
3,130438,22,False,17.799999,15.5,1,False,42.299999,12200000,2022,2,55.835766,37.491871,2.8,264,25,True
4,130439,9,False,6.18,29.34,2,False,44.52,10560000,1974,4,55.870991,37.617527,2.64,357,9,True


In [19]:
data.drop(columns=['id'], inplace=True)
data.head()

Unnamed: 0,floor,is_apartment,kitchen_area,living_area,rooms,studio,total_area,price,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
0,10,False,7.5,19.1,1,False,36.599998,6300000,1988,4,55.876099,37.511383,2.64,406,17,True
1,13,False,14.9,72.099998,3,False,131.300003,35200000,2009,2,55.785206,37.73579,3.0,213,17,True
2,8,False,8.0,45.0,3,False,76.0,16000000,2004,4,55.869358,37.527641,2.74,256,17,True
3,22,False,17.799999,15.5,1,False,42.299999,12200000,2022,2,55.835766,37.491871,2.8,264,25,True
4,9,False,6.18,29.34,2,False,44.52,10560000,1974,4,55.870991,37.617527,2.64,357,9,True


In [20]:
# 📦 Импорты
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders import CatBoostEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from geopy.distance import geodesic

# 🧠 Feature Engineering
data = data.copy()

# Новые признаки
data['relative_floor'] = data['floor'] / data['floors_total']
data['is_top_floor'] = data['floor'] == data['floors_total']
data['building_age'] = 2025 - data['build_year']
data['kitchen_ratio'] = data['kitchen_area'] / data['total_area']
data['flats_per_floor'] = data['flats_count'] / data['floors_total']

# Расстояние до центра Москвы
moscow_center = (55.7558, 37.6173)
data['dist_to_center'] = data.apply(
    lambda row: geodesic((row['latitude'], row['longitude']), moscow_center).km, axis=1
)

# 🎯 Целевая переменная
data['price_log'] = np.log1p(data['price'])

# 📤 Разделение данных
x = data.drop(columns=['price', 'price_log'])
y = data['price_log']
price_bins = pd.qcut(data['price'], q=10)

x_tr, x_val, y_tr, y_val = train_test_split(
    x, y,
    test_size=0.2,
    stratify=price_bins,
    random_state=42
)

original_y_val = data.loc[x_val.index, 'price']

# 📊 Признаки по типам
num_features_tr = x_tr.select_dtypes(include=['float', 'int']).drop(columns=['building_type_int'])
cat_features_tr = x_tr[['building_type_int']]
binary_cat_features_tr = x_tr.select_dtypes(include='bool')

num_cols = num_features_tr.columns.tolist()
non_binary_cat_cols = cat_features_tr.columns.tolist()
binary_cols = binary_cat_features_tr.columns.tolist()

# 🧱 Обёртка для CatBoostEncoder
class CatBoostEncoderWrapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = CatBoostEncoder()
        self.feature_names = None

    def fit(self, X, y=None):
        self.encoder.fit(X, y)
        self.feature_names = X.columns.tolist()
        return self

    def transform(self, X):
        return self.encoder.transform(X)

    def get_feature_names_out(self, input_features=None):
        return self.feature_names

# 🔧 Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', OneHotEncoder(drop='if_binary', sparse_output=False), binary_cols),
        ('non_binary', CatBoostEncoderWrapper(), non_binary_cat_cols),
        ('scaler', StandardScaler(), num_cols)
    ],
    verbose_feature_names_out=False
)

# 🧠 Модель
model = CatBoostRegressor(
    verbose=100,
    random_seed=42
)

# 🔁 Пайплайн
pipeline = Pipeline(
    [
        ('preprocessing', preprocessor),
        ('model', model)
    ]
)

# 🚀 Обучение
pipeline.fit(x_tr, y_tr)

# 🔮 Предсказание
y_pred = pipeline.predict(x_val)
y_pred = np.expm1(y_pred)

# 📏 Метрики
print('MAE:', mean_absolute_error(original_y_val, y_pred))
print('RMSE:', mean_squared_error(original_y_val, y_pred, squared=False)) 
print('R²:', r2_score(original_y_val, y_pred))


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  return pd.api.types.is_categorical_dtype(dtype)
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Learning rate set to 0.086421
0:	learn: 0.6480243	total: 64.5ms	remaining: 1m 4s
100:	learn: 0.3169834	total: 4.06s	remaining: 36.1s
200:	learn: 0.3089035	total: 7.89s	remaining: 31.4s
300:	learn: 0.3027066	total: 9.76s	remaining: 22.7s
400:	learn: 0.2979636	total: 11.5s	remaining: 17.1s
500:	learn: 0.2930979	total: 13.2s	remaining: 13.2s
600:	learn: 0.2888899	total: 15.1s	remaining: 10s
700:	learn: 0.2852962	total: 16.9s	remaining: 7.19s
800:	learn: 0.2819678	total: 18.7s	remaining: 4.66s
900:	learn: 0.2788300	total: 20.5s	remaining: 2.25s
999:	learn: 0.2757244	total: 22.4s	remaining: 0us
MAE: 4169416.0230503236
RMSE: 45799876.39714464
R²: 0.371257432189982


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
