In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Импорт библиотек**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import datetime

import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score

from catboost import CatBoostRegressor

In [None]:
import warnings
# Install an "ignore" filter with no message/category/module restriction,
# i.e. every warning raised after this cell is suppressed.
# NOTE(review): this also hides deprecation notices from the plotting and
# data libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Пути к директориям и файлам

In [None]:
# Load the training frame, the test frame and the sample submission, in that order.
df_train, df_test, df_submission = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/realestatepriceprediction/train.csv',
        '/kaggle/input/realestatepriceprediction/test.csv',
        '/kaggle/input/realestatepriceprediction/sample_submission.csv',
    )
]

# Анализ данных

Деление признаков на числовые и текстовые

In [None]:
# Split the training columns by dtype: everything that is not `object` is treated as numeric.
target = 'Price'
feat_object = df_train.select_dtypes(include='object').columns.tolist()
feat_numeric = df_train.select_dtypes(exclude='object').columns.tolist()

feat_numeric

In [None]:
Идентификатор можно удалить, так как он не характеризует данные

In [None]:
# Remove the identifier by name rather than by position: `pop(0)` dropped a
# different feature each time the cell was re-run, whereas this filter is
# idempotent and does not depend on the column order.
feat_numeric = [column for column in feat_numeric if column != 'Id']
feat_numeric

Поиск признаков с выбросами

In [None]:
# One histogram per numeric feature, drawn on a 16x16 inch figure.
numeric_part = df_train[feat_numeric]
numeric_part.hist(figsize=(16, 16))
plt.show()

Выбросы есть в: HouseYear, KitchenSquare.

Признаки с аномально высоким значением: HouseFloor, LifeSquare, Rooms, Square. Их нужно будет ограничить

In [None]:
# Summary statistics of the numeric columns, one row per feature.
df_train.describe().transpose()

Признаки Rooms, KitchenSquare, HouseFloor имеют в некоторых наблюдениях нулевые значения

In [None]:
# Rooms vs. Price with a fitted regression line.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
# and raises TypeError for positional data vectors.
grid = sns.jointplot(data=df_train, x='Rooms', y='Price', kind='reg')
grid.fig.set_figwidth(8)
grid.fig.set_figheight(8)

In [None]:
# KitchenSquare vs. Price with a fitted regression line.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
# and raises TypeError for positional data vectors.
grid = sns.jointplot(data=df_train, x='KitchenSquare', y='Price', kind='reg')
grid.fig.set_figwidth(8)
grid.fig.set_figheight(8)

Отсечение значений KitchenSquare от 250 и выше

In [None]:
# Same plot as above, restricted to rows whose KitchenSquare is below 250.
df_train_temp = df_train.loc[df_train['KitchenSquare'] < 250]
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
# and raises TypeError for positional data vectors.
grid = sns.jointplot(data=df_train_temp, x='KitchenSquare', y='Price', kind='reg')
grid.fig.set_figwidth(8)
grid.fig.set_figheight(8)

Значения KitchenSquare менее 3 кв.м. и более 30 кв.м. будем считать выбросами

График распределения цены

In [None]:
# Central-tendency statistics of the target, drawn as vertical markers below.
target_mean = round(df_train['Price'].mean(), 2)
target_median = df_train['Price'].median()
target_mode = df_train['Price'].mode()[0]

plt.figure(figsize=(16, 8))

# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay on the
# density scale is its documented replacement and keeps the y-axis in the
# range the marker lines below are drawn for.
sns.histplot(df_train['Price'], bins=50, kde=True, stat='density')

# Each marker is a short vertical segment from 0 up to 5e-6 on the density axis.
y = np.linspace(0, 0.000005, 10)
plt.plot([target_mean] * 10, y, label='target_mean', linestyle=':',  linewidth=4)
plt.plot([target_median] * 10, y, label='target_median', linestyle='--',  linewidth=4)
plt.plot([target_mode] * 10, y, label='target_mode', linestyle='-.', linewidth=4)

plt.title('Распределение цены')
plt.legend()
plt.show()

Класс с подготовкой данных

In [None]:
class Data:
    """Imputation, outlier clipping and feature engineering for the housing frames.

    Intended call order: ``fit`` once on the training frame, then ``transform``
    followed by ``features`` on every frame that is fed to the model.
    ``transform`` and ``features`` work on a copy and never modify the frame
    passed in.
    """

    # Categorical columns that ``transform`` one-hot encodes.
    _CATEGORICAL = ('Ecology_2', 'Ecology_3', 'Shops_2')

    def __init__(self, random_state=None):
        """Set the outlier bounds chosen during the data analysis.

        Parameters
        ----------
        random_state : int or None, default None
            Seed for the random floor reassignment done in ``transform``.
            ``None`` keeps using NumPy's global random state, so a global
            ``np.random.seed`` call still controls the result.
        """
        self.Square_min = 15
        self.Square_max = 300

        self.LifeSquare_min = 10
        self.LifeSquare_max = 280

        self.Rooms_min = 1
        self.Rooms_max = 5

        self.HouseFloor_min = 1
        self.HouseFloor_max = 50

        self.KitchenSquare_min = 3
        self.KitchenSquare_max = 30

        # upper bound for HouseYear
        self.current_year = datetime.datetime.now().year

        # statistics learned by ``fit``
        self.medians = None
        self.DistrictId_value_counts = None
        self.SquareMeterPrice_by_DistrictId = None
        self.Healthcare_1_by_DistrictId = None
        self.categories = None

        # Both objects expose the same ``randint(low, high)`` call.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

    def fit(self, df_train):
        """Learn medians, district statistics and category levels from ``df_train``.

        The frame is only read. It must contain the columns 'LifeSquare',
        'HouseFloor', 'DistrictId', 'Square', 'Price', 'Healthcare_1',
        'Ecology_2', 'Ecology_3' and 'Shops_2'.
        """
        # medians used to fill missing values
        self.medians = df_train[['LifeSquare', 'HouseFloor']].median()

        # number of training rows per district
        self.DistrictId_value_counts = dict(df_train['DistrictId'].value_counts())

        # mean price per square metre by district, using only rows whose
        # Square lies strictly inside the accepted bounds; building the
        # column with ``assign`` avoids writing into a slice of ``df_train``
        square_ok = (df_train['Square'] > self.Square_min) & (df_train['Square'] < self.Square_max)
        priced = df_train.loc[square_ok, ['DistrictId', 'Price', 'Square']]
        priced = priced.assign(SquareMeterPrice=priced['Price'] / priced['Square'])
        self.SquareMeterPrice_by_DistrictId = (
            priced.groupby('DistrictId', as_index=False)
            .agg({'SquareMeterPrice': 'mean'})
            .rename(columns={'SquareMeterPrice': 'AverageSquareMeterPrice'})
        )

        # mean Healthcare_1 by district
        self.Healthcare_1_by_DistrictId = (
            df_train.groupby('DistrictId', as_index=False)
            .agg({'Healthcare_1': 'mean'})
            .rename(columns={'Healthcare_1': 'AverageHealthcare_1'})
        )

        # category levels seen in training, so that every later frame gets
        # the same dummy columns even when a level is absent from it
        self.categories = {
            column: sorted(df_train[column].dropna().unique())
            for column in self._CATEGORICAL
        }

    def transform(self, df_train):
        """Return a cleaned copy of ``df_train`` with one-hot columns appended.

        Missing 'LifeSquare'/'HouseFloor' values are filled with the fitted
        medians, numeric outliers are clipped to the configured bounds, floors
        above the house height are redrawn at random, and the three
        categorical columns are one-hot encoded (``int8``) using the levels
        learned in ``fit``.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if self.medians is None or self.categories is None:
            raise ValueError('Data.transform() called before Data.fit()')

        # work on a copy so the caller's frame stays untouched
        df = df_train.copy()

        # missing values
        fill_columns = ['LifeSquare', 'HouseFloor']
        df[fill_columns] = df[fill_columns].fillna(self.medians)

        # outliers: clip every column to its (lower, upper) bounds;
        # ``None`` means the side is left unbounded
        bounds = {
            'Square': (self.Square_min, self.Square_max),
            'LifeSquare': (self.LifeSquare_min, self.LifeSquare_max),
            'KitchenSquare': (self.KitchenSquare_min, self.KitchenSquare_max),
            'HouseYear': (None, self.current_year),
            'Rooms': (self.Rooms_min, self.Rooms_max),
            'HouseFloor': (self.HouseFloor_min, self.HouseFloor_max),
        }
        for column, (lower, upper) in bounds.items():
            df[column] = df[column].clip(lower=lower, upper=upper)

        # a flat cannot be above the top floor: redraw such floors uniformly
        # from HouseFloor_min up to and including the house's top floor
        # (``randint`` excludes its upper bound, hence the ``+ 1``)
        too_high = df['Floor'] > df['HouseFloor']
        if too_high.any():
            df.loc[too_high, 'Floor'] = df.loc[too_high, 'HouseFloor'].apply(
                lambda top_floor: int(self._rng.randint(self.HouseFloor_min, int(top_floor) + 1))
            )

        # categories: a categorical dtype with the fitted levels makes
        # get_dummies emit one column per level, present in this frame or not
        dummy_frames = [
            pd.get_dummies(
                df[column].astype(pd.CategoricalDtype(self.categories[column])),
                prefix=column,
                dtype='int8',
            )
            for column in self._CATEGORICAL
        ]

        return pd.concat([df] + dummy_frames, axis=1)

    def features(self, df_train):
        """Return a copy of ``df_train`` with the district-level features added.

        Adds 'DistrictId_counts', 'AverageSquareMeterPrice' and
        'AverageHealthcare_1'. Values missing after the lookup (for example a
        district not seen in ``fit``) are filled with the median of that
        feature within the frame being processed. The merges return a frame
        with a fresh default index.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if self.DistrictId_value_counts is None:
            raise ValueError('Data.features() called before Data.fit()')

        df = df_train.copy()

        # district popularity; assigned back explicitly because chained
        # ``fillna(inplace=True)`` does not update the frame under
        # pandas Copy-on-Write
        counts = df['DistrictId'].map(self.DistrictId_value_counts)
        df['DistrictId_counts'] = counts.fillna(counts.median())

        # district averages learned in ``fit``
        lookups = (
            (self.SquareMeterPrice_by_DistrictId, 'AverageSquareMeterPrice'),
            (self.Healthcare_1_by_DistrictId, 'AverageHealthcare_1'),
        )
        for lookup, column in lookups:
            df = df.merge(lookup, on=["DistrictId"], how='left')
            df[column] = df[column].fillna(df[column].median())

        return df

Инициализация класса

In [None]:
# Learn the preprocessing statistics from the training frame only.
data_inst = Data()
data_inst.fit(df_train)

# training data: clean first, then add the district-level features
df_train = data_inst.features(data_inst.transform(df_train))

# test data: same two steps, reusing the statistics fitted above
df_test = data_inst.features(data_inst.transform(df_test))

Список признаков, используемых в модели

In [None]:
# Columns fed to the model, in the order the model will see them.
# NOTE(review): 'Helthcare_2' is presumably the column's spelling in the dataset — confirm.
feature_names = [
    'AverageSquareMeterPrice',
    'DistrictId_counts',
    'Rooms',
    'Square',
    'LifeSquare',
    'KitchenSquare',
    'Floor',
    'HouseFloor',
    'HouseYear',
    'Helthcare_2',
    'Ecology_1',
    'Social_1',
    'Social_2',
    'Social_3',
    'Shops_1',
    'Ecology_2_A',
    'Ecology_2_B',
    'Ecology_3_A',
    'Ecology_3_B',
    'Shops_2_A',
    'Shops_2_B',
    'AverageHealthcare_1',
]
# Column the model is trained to predict.
target_name = 'Price'

In [None]:
# Keep only the model columns: the target stays in the training frame,
# the identifier stays in the test frame for the submission file.
train_columns = feature_names + [target_name]
test_columns = feature_names + ['Id']
df_train = df_train.loc[:, train_columns]
df_test = df_test.loc[:, test_columns]

# Feature matrix and target vector used for training.
y = df_train[target_name]
X = df_train[feature_names]

Обучение модели на CatBoostRegressor

Гиперпараметры модели подобраны при помощи randomized_search():
learning_rate=0.1, iterations=1150, depth=8

In [None]:
# Hyperparameters of the final model, collected in one place.
model_params = dict(
    silent=True,
    learning_rate=0.1,
    iterations=1150,
    eval_metric='R2',
    depth=8,
)
final_model = CatBoostRegressor(**model_params)

# Fit on the full training set; this fitted model is used for the predictions.
final_model.fit(X, y)

# 5-fold shuffled cross-validation of the same configuration, scored with R2.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
cv_score = cross_val_score(final_model, X, y, scoring='r2', cv=folds)

print(f'R2: {round(cv_score.mean(), 3)}')

Сортировка признаков по важности

In [None]:
# Pair every feature with its importance in the fitted model, most important first.
feature_importances = pd.DataFrame(
    {
        'feature_name': list(X.columns),
        'importance': final_model.get_feature_importance(),
    }
).sort_values(by='importance', ascending=False)

feature_importances.head(20)

In [None]:
Создание датафрейма с предсказаниями

In [None]:
# Submission frame: starts with the identifiers of the test rows.
preds_final = df_test[['Id']].copy()

# Move the identifier into the index and keep only the model features.
df_test = df_test.set_index('Id')[feature_names]

In [None]:
# Predict prices for the test rows with the fitted model.
y_pred_final = final_model.predict(df_test)

# Store the predictions next to the identifiers collected in `preds_final`
# (the previous name `my_predictions` was never defined and raised NameError)
# and write the submission file.
preds_final['Price'] = y_pred_final
preds_final.to_csv('./predictions.csv', index=False, encoding='utf-8', sep=',')

preds_final.head()