# Определение стоимости автомобилей

Сервис по продаже автомобилей с пробегом «Не бит, не крашен» разрабатывает приложение для привлечения новых клиентов. В нём можно быстро узнать рыночную стоимость своего автомобиля. В вашем распоряжении исторические данные: технические характеристики, комплектации и цены автомобилей. Вам нужно построить модель для определения стоимости. 

Заказчику важны:

- качество предсказания;
- скорость предсказания.

## Подготовка данных

Загрузим все необходимые нам библиотеки.

In [1]:
#!pip install lightgbm

In [48]:
import pandas as pd 
import numpy as np
import catboost as cb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
import lightgbm as lgbm
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

Откроем файл, изучим всю информацию по нему.

In [3]:
try:
    df = pd.read_csv('autos.csv')
except:
    df = pd.read_csv('/datasets/autos.csv')

In [4]:
df.head(10)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21
5,2016-04-04 17:36:23,650,sedan,1995,manual,102,3er,150000,10,petrol,bmw,yes,2016-04-04 00:00:00,0,33775,2016-04-06 19:17:07
6,2016-04-01 20:48:51,2200,convertible,2004,manual,109,2_reihe,150000,8,petrol,peugeot,no,2016-04-01 00:00:00,0,67112,2016-04-05 18:18:39
7,2016-03-21 18:54:38,0,sedan,1980,manual,50,other,40000,7,petrol,volkswagen,no,2016-03-21 00:00:00,0,19348,2016-03-25 16:47:58
8,2016-04-04 23:42:13,14500,bus,2014,manual,125,c_max,30000,8,petrol,ford,,2016-04-04 00:00:00,0,94505,2016-04-04 23:42:13
9,2016-03-17 10:53:50,999,small,1998,manual,101,golf,150000,0,,volkswagen,,2016-03-17 00:00:00,0,27472,2016-03-31 17:17:06


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        354369 non-null  object
 1   Price              354369 non-null  int64 
 2   VehicleType        316879 non-null  object
 3   RegistrationYear   354369 non-null  int64 
 4   Gearbox            334536 non-null  object
 5   Power              354369 non-null  int64 
 6   Model              334664 non-null  object
 7   Kilometer          354369 non-null  int64 
 8   RegistrationMonth  354369 non-null  int64 
 9   FuelType           321474 non-null  object
 10  Brand              354369 non-null  object
 11  Repaired           283215 non-null  object
 12  DateCreated        354369 non-null  object
 13  NumberOfPictures   354369 non-null  int64 
 14  PostalCode         354369 non-null  int64 
 15  LastSeen           354369 non-null  object
dtypes: int64(7), object(

In [6]:
df.describe()

Unnamed: 0,Price,RegistrationYear,Power,Kilometer,RegistrationMonth,NumberOfPictures,PostalCode
count,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0,354369.0
mean,4416.656776,2004.234448,110.094337,128211.172535,5.714645,0.0,50508.689087
std,4514.158514,90.227958,189.850405,37905.34153,3.726421,0.0,25783.096248
min,0.0,1000.0,0.0,5000.0,0.0,0.0,1067.0
25%,1050.0,1999.0,69.0,125000.0,3.0,0.0,30165.0
50%,2700.0,2003.0,105.0,150000.0,6.0,0.0,49413.0
75%,6400.0,2008.0,143.0,150000.0,9.0,0.0,71083.0
max,20000.0,9999.0,20000.0,150000.0,12.0,0.0,99998.0


In [7]:
df.isna().sum()

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox              19833
Power                    0
Model                19705
Kilometer                0
RegistrationMonth        0
FuelType             32895
Brand                    0
Repaired             71154
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64

In [8]:
df.duplicated().sum()

4

Для начала удалим дубликаты и сбросим индексы.

In [9]:
df.drop_duplicates(inplace=True, ignore_index=True)
df.duplicated().sum()

0

Узнаем количество нулевых значений в целевом признаке и удалим их, поскольку цена проданных автомобилей по определению не может равняться нулю. Скорее всего, здесь произошла ошибка при выгрузке данных, либо ошибка была допущена человеком.

In [10]:
df[df['Price']==0]['Price'].count()

10772

In [11]:
df['Price'].value_counts()

0        10772
500       5670
1500      5394
1000      4648
1200      4594
         ...  
1368         1
233          1
11080        1
16340        1
10985        1
Name: Price, Length: 3731, dtype: int64

In [12]:
df = df[df['Price'] > 0]
df.reset_index(drop=True, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343593 entries, 0 to 343592
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        343593 non-null  object
 1   Price              343593 non-null  int64 
 2   VehicleType        309843 non-null  object
 3   RegistrationYear   343593 non-null  int64 
 4   Gearbox            326238 non-null  object
 5   Power              343593 non-null  int64 
 6   Model              326072 non-null  object
 7   Kilometer          343593 non-null  int64 
 8   RegistrationMonth  343593 non-null  int64 
 9   FuelType           314134 non-null  object
 10  Brand              343593 non-null  object
 11  Repaired           277727 non-null  object
 12  DateCreated        343593 non-null  object
 13  NumberOfPictures   343593 non-null  int64 
 14  PostalCode         343593 non-null  int64 
 15  LastSeen           343593 non-null  object
dtypes: int64(7), object(

Теперь обработаем пропуски в наших признаках. Поскольку у нас нет возможности достать пропущенные данные, заменим все пропуски в наших признаках на *'unknown'*.

In [13]:
df.fillna('unknown', inplace=True)
df.isna().sum()

DateCrawled          0
Price                0
VehicleType          0
RegistrationYear     0
Gearbox              0
Power                0
Model                0
Kilometer            0
RegistrationMonth    0
FuelType             0
Brand                0
Repaired             0
DateCreated          0
NumberOfPictures     0
PostalCode           0
LastSeen             0
dtype: int64

Теперь обработаем выбросы в столбцах *RegistrationYear* и *Power*.

In [14]:
df['RegistrationYear'].unique()

array([1993, 2011, 2004, 2001, 2008, 1995, 2014, 1998, 2005, 1910, 2016,
       2007, 2009, 2002, 2018, 1997, 1990, 2017, 1981, 2003, 1994, 1991,
       1984, 2006, 1999, 2012, 2010, 2000, 1992, 2013, 1996, 1985, 1989,
       2015, 1982, 1976, 1983, 1973, 1969, 1971, 1987, 1986, 1988, 1980,
       1970, 1965, 1945, 1925, 1974, 1979, 1955, 1978, 1972, 1968, 1977,
       1961, 1966, 1975, 1963, 1964, 1960, 5000, 1958, 1967, 1959, 1956,
       3200, 1000, 1941, 9999, 8888, 1500, 2200, 4100, 1962, 1929, 1957,
       1940, 3000, 2066, 1949, 2019, 1937, 1951, 1800, 1953, 1954, 1234,
       8000, 5300, 9000, 2900, 6000, 5900, 5911, 1400, 1950, 4000, 1948,
       1952, 8500, 1932, 1255, 3700, 3800, 4800, 1942, 7000, 1935, 1933,
       1936, 6500, 1923, 2290, 1930, 1001, 9450, 1944, 2500, 1943, 1934,
       1938, 1928, 5555, 5600, 1600, 1111, 2222, 1039, 1300, 2800, 1931,
       4500, 1602, 7800, 1947, 1927, 7100, 8200, 1946], dtype=int64)

In [15]:
df.sort_values(by='DateCrawled', ascending=False)

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
87942,2016-04-07 14:36:58,3500,sedan,1999,manual,143,e_klasse,150000,8,gasoline,mercedes_benz,no,2016-04-07 00:00:00,0,64846,2016-04-07 14:36:58
329308,2016-04-07 14:36:56,8650,suv,1992,manual,121,wrangler,125000,5,petrol,jeep,no,2016-04-07 00:00:00,0,20357,2016-04-07 14:36:56
291831,2016-04-07 14:36:55,15200,wagon,2008,manual,190,a6,150000,9,gasoline,audi,no,2016-04-07 00:00:00,0,81476,2016-04-07 14:36:55
52092,2016-04-07 14:36:54,400,small,1996,manual,60,ibiza,150000,0,unknown,seat,unknown,2016-04-07 00:00:00,0,66299,2016-04-07 14:36:54
197952,2016-04-07 14:36:53,6990,coupe,1979,auto,143,3er,150000,6,petrol,bmw,no,2016-04-07 00:00:00,0,71336,2016-04-07 14:36:53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96811,2016-03-05 14:06:24,10900,suv,2003,auto,245,unknown,150000,11,petrol,sonstige_autos,no,2016-03-05 00:00:00,0,56072,2016-04-06 22:15:33
314889,2016-03-05 14:06:24,799,small,1997,manual,60,polo,150000,11,petrol,volkswagen,no,2016-03-05 00:00:00,0,35708,2016-03-09 15:19:01
171387,2016-03-05 14:06:24,6200,convertible,2006,manual,135,megane,90000,9,petrol,renault,no,2016-03-05 00:00:00,0,35764,2016-03-24 09:16:44
213793,2016-03-05 14:06:23,3999,wagon,2003,manual,220,a6,150000,7,petrol,audi,no,2016-03-05 00:00:00,0,57080,2016-03-06 18:29:35


По логике отбросим все года, что после 2016-го и до 1900-го.

In [16]:
df = df[(df['RegistrationYear'] <= 2016) & (df['RegistrationYear'] >= 1900)]
df['RegistrationYear'].unique()

array([1993, 2011, 2004, 2001, 2008, 1995, 2014, 1998, 2005, 1910, 2016,
       2007, 2009, 2002, 1997, 1990, 1981, 2003, 1994, 1991, 1984, 2006,
       1999, 2012, 2010, 2000, 1992, 2013, 1996, 1985, 1989, 2015, 1982,
       1976, 1983, 1973, 1969, 1971, 1987, 1986, 1988, 1980, 1970, 1965,
       1945, 1925, 1974, 1979, 1955, 1978, 1972, 1968, 1977, 1961, 1966,
       1975, 1963, 1964, 1960, 1958, 1967, 1959, 1956, 1941, 1962, 1929,
       1957, 1940, 1949, 1937, 1951, 1953, 1954, 1950, 1948, 1952, 1932,
       1942, 1935, 1933, 1936, 1923, 1930, 1944, 1943, 1934, 1938, 1928,
       1931, 1947, 1927, 1946], dtype=int64)

In [17]:
df['Power'].value_counts()

0        32866
75       22144
60       14774
150      13750
101      12415
         ...  
1506         1
5809         1
15020        1
5867         1
1241         1
Name: Power, Length: 685, dtype: int64

In [18]:
df['Power'].unique()

array([    0,   190,   163,    75,    69,   102,   109,   125,   101,
         105,   140,   115,   131,    60,   136,   160,   231,    50,
         118,   193,    99,   113,   218,   122,   129,    70,   306,
          95,    61,   177,   170,    55,   143,   286,   232,   150,
         156,    80,    82,    90,   155,    54,   185,    87,   180,
          86,    84,   224,   235,   200,   178,   265,    77,   110,
         144,   120,   116,   184,   126,   204,    88,   194,    64,
         305,   197,   179,   250,    45,   313,    41,   165,    98,
         130,   114,   211,    56,   201,   213,    58,   107,    83,
         174,   100,   220,    73,   192,    68,    66,   299,    74,
          52,   147,   310,    71,    97,    65,   239,   203,     5,
         300,   103,    85,   258,   320,    63,    81,   148,    44,
         145,   280,   260,   104,   188,   333,   186,   117,   141,
         132,   234,   158,    39,    92,    51,   135,    59,   230,
          53,   209,

Здесь также по логике отбросим все строки с мощностью выше 1500 л.с., затем заменим нули на медианы при группировке по типу движка. 

In [19]:
df = df[df['Power'] <= 1500]

In [20]:
df['Power'] = df['Power'].replace(0, np.nan)
df.head()

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,unknown,1993,manual,,golf,150000,0,petrol,volkswagen,unknown,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190.0,unknown,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163.0,grand,125000,8,gasoline,jeep,unknown,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75.0,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69.0,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [21]:
VehicleType = df.groupby(by=['VehicleType'])['Power'].median()
VehicleType

VehicleType
bus            116.0
convertible    129.0
coupe          150.0
other          101.0
sedan          122.0
small           68.0
suv            150.0
unknown        100.0
wagon          136.0
Name: Power, dtype: float64

In [22]:
df['Power'] = df['Power'].fillna(df.groupby(by=['VehicleType'])['Power'].transform('median'))
df['Power'] = df['Power'].astype(int)
df.head()

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,unknown,1993,manual,100,golf,150000,0,petrol,volkswagen,unknown,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,unknown,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,unknown,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329588 entries, 0 to 343592
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   DateCrawled        329588 non-null  object
 1   Price              329588 non-null  int64 
 2   VehicleType        329588 non-null  object
 3   RegistrationYear   329588 non-null  int64 
 4   Gearbox            329588 non-null  object
 5   Power              329588 non-null  int32 
 6   Model              329588 non-null  object
 7   Kilometer          329588 non-null  int64 
 8   RegistrationMonth  329588 non-null  int64 
 9   FuelType           329588 non-null  object
 10  Brand              329588 non-null  object
 11  Repaired           329588 non-null  object
 12  DateCreated        329588 non-null  object
 13  NumberOfPictures   329588 non-null  int64 
 14  PostalCode         329588 non-null  int64 
 15  LastSeen           329588 non-null  object
dtypes: int32(1), int64(6

Отбросим признаки, которые нам не пригодятся для обучения наших моделей.

In [24]:
df.drop(columns = ['DateCrawled', 'RegistrationMonth', 'DateCreated', 'NumberOfPictures', 'PostalCode', 'LastSeen'], axis=1, inplace=True)
df.head(10)

Unnamed: 0,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,FuelType,Brand,Repaired
0,480,unknown,1993,manual,100,golf,150000,petrol,volkswagen,unknown
1,18300,coupe,2011,manual,190,unknown,125000,gasoline,audi,yes
2,9800,suv,2004,auto,163,grand,125000,gasoline,jeep,unknown
3,1500,small,2001,manual,75,golf,150000,petrol,volkswagen,no
4,3600,small,2008,manual,69,fabia,90000,gasoline,skoda,no
5,650,sedan,1995,manual,102,3er,150000,petrol,bmw,yes
6,2200,convertible,2004,manual,109,2_reihe,150000,petrol,peugeot,no
7,14500,bus,2014,manual,125,c_max,30000,petrol,ford,unknown
8,999,small,1998,manual,101,golf,150000,unknown,volkswagen,unknown
9,2000,sedan,2004,manual,105,3_reihe,150000,petrol,mazda,no


Мы закончили предобработку и теперь можем приступить к обучению наших моделей.

## Обучение моделей

Для начала, выделим признаки *features* и целевой признак *target*, а затем разделим исходные данные на обучающую и валидационную и тестовую выборки в соотношении 60:20:20. После чего, также выделим признаки и целевой признак для каждой выборки.

In [27]:
features = df.drop('Price', axis=1)
target = df['Price']

df_train, df_test = train_test_split(df, test_size=0.25, random_state=12345)

Затем, преобразуем категориальные признаки в численные, используя OneHotEncoder.

In [26]:
cat_columns = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'Repaired']

In [29]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

ohe_train_tmp = pd.DataFrame(enc.fit_transform(df_train[cat_columns]))
ohe_test_tmp = pd.DataFrame(enc.transform(df_test[cat_columns]))

ohe_train_tmp.index = df_train.index
ohe_test_tmp.index = df_test.index

num_train = df_train.drop(cat_columns, axis=1)
num_test = df_test.drop(cat_columns, axis=1)

df_train_ohe = pd.concat([num_train, ohe_train_tmp], axis=1)
df_test_ohe = pd.concat([num_test, ohe_test_tmp], axis=1)

In [30]:
print(df_train_ohe.shape)
print(df_test_ohe.shape)

(247191, 317)
(82397, 317)


In [31]:
features_train_ohe = df_train_ohe.drop(['Price'], axis=1)
target_train_ohe = df_train_ohe['Price']
features_test_ohe = df_test_ohe.drop(['Price'], axis=1)
target_test_ohe = df_test_ohe['Price']

### LightGBM ###

Обучим модель градиентного бустинга LightGBM. Для этой модели используем данные, кодированные с помощью OrdinalEncoder.

In [33]:
model_lgbm = lgbm.LGBMRegressor(random_state=12345)

In [34]:
pipe_lgbm = Pipeline([
    ('scaler', StandardScaler()),
    ('model_lgbm', model_lgbm)])

In [35]:
parameters = {'model_lgbm__num_leaves': [n for n in range(20, 201, 20)],
              'model_lgbm__n_estimators': [100, 200]}

In [36]:
model_lgbm = GridSearchCV(pipe_lgbm,
                          param_grid=parameters,
                          scoring='neg_root_mean_squared_error',
                          cv=5,
                          n_jobs=-1)
model_lgbm.fit(features_train_ohe, target_train_ohe)
print(model_lgbm.best_params_)
print(-1*model_lgbm.best_score_)

{'model_lgbm__n_estimators': 200, 'model_lgbm__num_leaves': 200}
1569.258534585984


Теперь обучим другие модели.

### Дерево решений ###

Для дерева решений снова используем данные, кодированные с помощью OrdinalEncoder.

In [37]:
model_dt = DecisionTreeRegressor(random_state=12345)

Подберем параметры для дерева решений.

In [38]:
pipe_dt = Pipeline([
    ('scaler', StandardScaler()),
    ('model_dt', model_dt)])

In [39]:
parameters_dt = {'model_dt__max_depth': [None, 1, 3, 5, 7, 9],
                 'model_dt__min_samples_leaf': [1, 2, 3, 4, 5]}

In [40]:
model_dt = GridSearchCV(pipe_dt, param_grid=parameters_dt, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
model_dt.fit(features_train_ohe, target_train_ohe)
print(model_dt.best_params_)
print(-1*model_dt.best_score_)

{'model_dt__max_depth': None, 'model_dt__min_samples_leaf': 5}
1814.06952687047


Теперь обучим модель Catboost регрессии.

### Catboost ###

In [50]:
model_cbr = CatBoostRegressor(verbose=False, random_state=12345)

In [51]:
pipe_cbr = Pipeline([
    ('scaler', StandardScaler()),
    ('model_cbr', model_cbr)])

In [52]:
param_cbr = {'model_cbr__depth' : [2,4,6,8,10],
             'model_cbr__learning_rate' : [0.01, 0.05, 0.1, 1],
             'model_cbr__n_estimators': [50, 100, 150, 200] }

In [54]:
model_cbr = GridSearchCV(pipe_cbr, param_grid=param_cbr, n_jobs=-1, scoring='neg_mean_absolute_error', verbose=False)
model_cbr.fit(features_train_ohe, target_train_ohe)
print(model_cbr.best_params_)
print(model_cbr.best_score_)features_test_ohe

{'model_cbr__depth': 10, 'model_cbr__learning_rate': 1, 'model_cbr__n_estimators': 200}
-1012.1013383895303


Нашей лучшей моделью оказалась Catboost. 

## Анализ моделей

Протестируем нашу лучшую модель на тестовой выборке, используя гиперпараметры, полученные выше.

In [58]:
%%time

model_cbr = CatBoostRegressor(random_state=12345, depth = 10, learning_rate = 1, n_estimators = 200, verbose=False)
model_cbr.fit(features_train_ohe, target_train_ohe)

Wall time: 7.36 s


<catboost.core.CatBoostRegressor at 0x28401bd4b50>

In [59]:
%%time

predictions_cbr = model_cbr.predict(features_test_ohe)
rmse = mean_squared_error(target_test_ohe, predictions_cbr)**0.5
print(rmse)

1647.9856270368364
Wall time: 180 ms


## Вывод

Итак, перед обучением моделей мы выполнили предобработку данных, удалив дубликаты, пропуски и выбросы, а также проработали нулевые значения. Затем мы отбросили ненужные признаки, потом преобразовали категориальные признаки в численные, и провели масштабирование. После чего разибили данные на обучающую и валидационную выборки и провели обучение трех моделей: LightGBM, Дерево решений и CatBoostRegressor.

Наилучший результат мы достигли у модели ***CatBoostRegressor***, ошибка **RMSE** на тестовой выборке получилась 1647.99, скорость предсказания при этом составила 180 мс.