In [16]:
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
import yaml

def create_connection():
    """Create a SQLAlchemy engine for the destination PostgreSQL database.

    Connection settings are taken from the ``DB_DESTINATION_HOST``,
    ``DB_DESTINATION_PORT``, ``DB_DESTINATION_NAME``, ``DB_DESTINATION_USER``
    and ``DB_DESTINATION_PASSWORD`` environment variables; ``load_dotenv()``
    is called first so the values may also come from a local ``.env`` file.

    Returns:
        The engine returned by ``create_engine`` (SSL is required via
        ``connect_args={'sslmode': 'require'}``).

    Raises:
        RuntimeError: if any of the required environment variables is unset.
    """
    load_dotenv()

    required = (
        'DB_DESTINATION_HOST',
        'DB_DESTINATION_PORT',
        'DB_DESTINATION_NAME',
        'DB_DESTINATION_USER',
        'DB_DESTINATION_PASSWORD',
    )
    # Fail early with a readable message instead of building a
    # 'postgresql://None:None@None:None/None' URL.
    missing = [name for name in required if os.environ.get(name) is None]
    if missing:
        raise RuntimeError(
            'Missing required environment variables: ' + ', '.join(missing)
        )

    host = os.environ.get('DB_DESTINATION_HOST')
    port = os.environ.get('DB_DESTINATION_PORT')
    db = os.environ.get('DB_DESTINATION_NAME')
    username = os.environ.get('DB_DESTINATION_USER')
    password = os.environ.get('DB_DESTINATION_PASSWORD')

    # Never print the real password: notebook outputs are saved with the file
    # and get shared/committed together with it.
    print(f'postgresql://{username}:***@{host}:{port}/{db}')
    conn = create_engine(
        f'postgresql://{username}:{password}@{host}:{port}/{db}',
        connect_args={'sslmode': 'require'},
    )
    return conn


def get_data():
    """Read the ``clean_flat_data`` table into a DataFrame.

    An engine is obtained from ``create_connection()``, the whole table is
    fetched with ``pd.read_sql`` using ``flat_id`` as the index column, and
    the engine is disposed afterwards.

    Returns:
        pandas.DataFrame: result of ``select * from clean_flat_data`` indexed
        by ``flat_id``.
    """
    # Disabled: reading hyperparameters from params.yaml
    #with open('params.yaml', 'r') as fd:
        #params = yaml.safe_load(fd)

    conn = create_connection()
    try:
        data = pd.read_sql('select * from clean_flat_data', conn, index_col='flat_id')
    finally:
        # Dispose the engine even when the query fails, so a failed cell does
        # not leave pooled connections open in the long-lived kernel.
        conn.dispose()

    # Disabled: persisting the loaded frame to data/initial_data.csv
    #os.makedirs('data', exist_ok=True)
    #data.to_csv('data/initial_data.csv', index=None)
    return data


In [17]:
# Load the table returned by get_data() into the notebook-level `data` frame
data = get_data()
# Show the (rows, columns) shape of the loaded frame
data.shape

postgresql://mle_20240528_f5d3323004:***@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20240528_f5d3323004


(95560, 16)

In [7]:
# Columns treated as categorical features
cat_cols = [
    'building_type_int',
    'has_elevator',
    'is_apartment',
]
# Select only those columns and preview the first rows
cat_features = data.loc[:, cat_cols]
cat_features.head()

Unnamed: 0_level_0,building_type_int,has_elevator,is_apartment
flat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,6,1,0
1.0,2,1,0
2.0,4,1,0
4.0,1,1,0
5.0,4,1,0


In [6]:
# Number of distinct values in each column of `data`
data.nunique()

build_year             103
building_type_int        6
latitude             14225
longitude            13887
ceiling_height          34
flats_count            588
floors_total            29
has_elevator             2
floor                   20
kitchen_area           521
living_area           1339
rooms                    5
is_apartment             2
studio                   1
total_area            1742
price                 3774
dtype: int64

In [19]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostRegressor
#import yaml
#import os
#import joblib

# Model training

# Read the hyperparameter file params.yaml
#with open('params.yaml', 'r') as fd:
    #params = yaml.safe_load(fd)
    
# Load the result of the previous step: initial_data.csv
#data = pd.read_csv('data/initial_data.csv')

# Implement the main logic of the step using the hyperparameters
y = data['price']

# Build the feature frame as a NEW object instead of dropping in place:
# `data` keeps its 'price' and 'studio' columns, so this cell can be re-run
# without a KeyError and earlier views of `data` stay valid.
features = data.drop(columns=['studio', 'price'])
cat_cols = ['building_type_int', 'has_elevator', 'is_apartment']
cat_features = features[cat_cols]
# Boolean Series: True for categorical columns with exactly two distinct values
potential_binary_features = cat_features.nunique() == 2

binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = features.drop(cat_cols, axis=1)

# Columns are selected by name and everything else is dropped
# (remainder='drop'), so columns not listed here never reach the model.
preprocessor = ColumnTransformer(
    [
    ('binary', OneHotEncoder(drop='if_binary'), binary_cat_features.columns.tolist()),
    ('nonbinary', CatBoostEncoder(), other_cat_features.columns.tolist()),
    ('num', StandardScaler(), num_features.columns.tolist())
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

model = CatBoostRegressor(loss_function='RMSE')

pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)
pipeline.fit(features, y)

# Save the fitted model to models/fitted_model.pkl
#os.makedirs('models', exist_ok=True) # create the directory if it does not exist yet
#with open('models/fitted_model.pkl', 'wb') as fd:
    #joblib.dump(pipeline, fd) 



Learning rate set to 0.084152
0:	learn: 4304861.4045879	total: 63.4ms	remaining: 1m 3s
1:	learn: 4124160.9835376	total: 78ms	remaining: 38.9s
2:	learn: 3971313.2216420	total: 91.6ms	remaining: 30.4s
3:	learn: 3835133.8163829	total: 104ms	remaining: 26s
4:	learn: 3718393.9856965	total: 119ms	remaining: 23.6s
5:	learn: 3609870.4669150	total: 135ms	remaining: 22.3s
6:	learn: 3518653.7128901	total: 165ms	remaining: 23.4s
7:	learn: 3438710.1291356	total: 192ms	remaining: 23.9s
8:	learn: 3370481.2662232	total: 207ms	remaining: 22.7s
9:	learn: 3307593.0265829	total: 220ms	remaining: 21.8s
10:	learn: 3248597.3533418	total: 234ms	remaining: 21s
11:	learn: 3190744.1000250	total: 247ms	remaining: 20.3s
12:	learn: 3146897.3113967	total: 262ms	remaining: 19.9s
13:	learn: 3095186.9048724	total: 276ms	remaining: 19.4s
14:	learn: 3057132.9158235	total: 289ms	remaining: 19s
15:	learn: 3022218.3971062	total: 302ms	remaining: 18.6s
16:	learn: 2985254.3169270	total: 315ms	remaining: 18.2s
17:	learn: 29538

In [18]:
# List the column labels of `data`
data.columns

Index(['build_year', 'building_type_int', 'latitude', 'longitude',
       'ceiling_height', 'flats_count', 'floors_total', 'has_elevator',
       'floor', 'kitchen_area', 'living_area', 'rooms', 'is_apartment',
       'studio', 'total_area', 'price'],
      dtype='object')

In [20]:
# Display the CatBoostRegressor object that is the 'model' step of `pipeline`
model

<catboost.core.CatBoostRegressor at 0x7f2637749330>

In [22]:
import pandas as pd
from sklearn.model_selection import KFold, cross_validate

    
# Implement the main logic of the step using the loaded hyperparameters
cv_strategy = KFold(n_splits=5)

# Run cross-validation and collapse every returned array (fit time, score
# time, test scores) to its mean rounded to 3 decimals.
cv_res = {
    metric: round(values.mean(), 3)
    for metric, values in cross_validate(
        pipeline,
        data,
        y,
        cv=cv_strategy,
        n_jobs=-1,
        scoring=['neg_mean_absolute_percentage_error'],
    ).items()
}

# Save the cross-validation result to cv_res.json
#os.makedirs('cv_results', exist_ok=True)
#with open('cv_results/cv_res.json', 'w') as fd:
    #json.dump(cv_res, fd) 

Learning rate set to 0.081237
Learning rate set to 0.081237
0:	learn: 4346913.4154452	total: 67.8ms	remaining: 1m 7s
1:	learn: 4179030.2251748	total: 79.3ms	remaining: 39.6s
2:	learn: 4026735.0443927	total: 91ms	remaining: 30.2s
3:	learn: 3889368.2700684	total: 102ms	remaining: 25.5s
4:	learn: 3765781.7814772	total: 120ms	remaining: 23.8s
0:	learn: 4365424.4330052	total: 76ms	remaining: 1m 15s
5:	learn: 3660966.9920058	total: 146ms	remaining: 24.2s
1:	learn: 4192895.1813251	total: 105ms	remaining: 52.5s
6:	learn: 3569015.7291029	total: 173ms	remaining: 24.5s
2:	learn: 4043520.8494245	total: 135ms	remaining: 44.7s
7:	learn: 3487746.7185465	total: 201ms	remaining: 24.9s
3:	learn: 3901281.3940641	total: 161ms	remaining: 40.1s
8:	learn: 3413950.7759776	total: 230ms	remaining: 25.3s
4:	learn: 3780926.1443950	total: 189ms	remaining: 37.6s
9:	learn: 3349630.4242995	total: 258ms	remaining: 25.5s
5:	learn: 3671554.1334038	total: 219ms	remaining: 36.3s
10:	learn: 3290551.7968813	total: 286ms	rem

In [23]:
cv_res

{'fit_time': 24.186,
 'score_time': 0.166,
 'test_neg_mean_absolute_percentage_error': -18.835}