In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Подгружаем готовый датасет  

df = pd.read_csv('../realises3/team_A_data.csv')
df.columns

Index(['ad_id', 'rooms_count', 'area_m2', 'ceiling_height_m', 'price_rub',
       'metro_distance_min', 'floors', 'passenger_elevator', 'cargo_elevator',
       'bathroom_combined', 'bathroom_separate', 'balcony', 'loggia',
       'price_per_metr', 'apartment_density', 'near_metro', 'parking_encoded',
       'type_house_encoded', 'renovation_Дизайнерский',
       'renovation_Евроремонт', 'renovation_Косметический', 'windows_На улицу',
       'windows_На улицу и двор', 'garbage_chute_Нет', 'district_encoded',
       'children_allowed_Нет', 'pets_allowed_Нет', 'premium_apartment'],
      dtype='object')

In [3]:
# Инфо

df.price_per_metr.describe()

count    19733.000000
mean      1222.757310
std        617.196287
min          0.000000
25%        875.000000
50%       1041.000000
75%       1360.000000
max       9493.000000
Name: price_per_metr, dtype: float64

### Нормализация

In [4]:
# Нормализируем через скейлер, выбираем признаки для нормализации

features = ['area_m2', 'metro_distance_min', 
            'cargo_elevator', 'passenger_elevator']

scaler = StandardScaler()

normalized = scaler.fit_transform(df[features])

In [5]:
# Сливаем в один датасет убираем лишнее

normalized_df = pd.DataFrame(normalized, columns=features, index=df.index)
df_normalized = df.drop(columns=features).join(normalized_df)

In [6]:
df_normalized.head(5)

Unnamed: 0,ad_id,rooms_count,ceiling_height_m,price_rub,floors,bathroom_combined,bathroom_separate,balcony,loggia,price_per_metr,...,windows_На улицу и двор,garbage_chute_Нет,district_encoded,children_allowed_Нет,pets_allowed_Нет,premium_apartment,area_m2,metro_distance_min,cargo_elevator,passenger_elevator
0,271271157,4,3.0,500000,16,0,0,0,0,2500,...,False,False,6,False,False,0,2.918437,-0.034852,1.059761,3.033034
1,271634126,4,3.5,500000,16,2,1,0,0,2525,...,True,True,6,False,True,0,2.875717,-0.192915,1.059761,-0.242576
2,271173086,4,3.2,500000,16,3,0,0,0,2500,...,True,True,6,False,True,0,2.918437,-0.350979,-0.581961,-0.242576
3,272197456,4,3.2,400000,6,3,0,0,0,2352,...,True,True,6,True,False,0,2.277639,-0.983233,-0.581961,-0.242576
4,273614615,2,3.9,225000,26,2,0,0,0,3879,...,True,False,6,True,True,0,-0.114677,-0.350979,1.059761,-0.242576


### Обучение

In [7]:
df_normalized.drop(columns=['ad_id', 'price_per_metr', 'apartment_density'], inplace=True)

In [8]:
# X - признаки  Y- целевая переменная

x = df_normalized.drop(columns='price_rub')
y = df_normalized['price_rub']

In [9]:
# Разделяем датасет

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [10]:
# Создание модели обучение и предсказание

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [11]:
# Анализ нейронки

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.2f}")
print(f"R²: {r2:.4f}")

MSE: 3351589644.85
R²: 0.8139


In [12]:
# Анализ нейронки

import numpy as np

rmse = np.sqrt(349783030.68)
print(f"RMSE: {rmse:.2f}")

RMSE: 18702.49


In [13]:
# Сохраняем модель и скалер
import joblib

joblib.dump(scaler, 'scaler.pkl')
joblib.dump(model, 'apartment_model.pkl', compress=3)

['apartment_model.pkl']

In [14]:
df_normalized.to_csv('data_normalized.csv')