In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
matplotlib.rcParams.update({'font.size': 14})

In [None]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    """Print train/test R2 and draw predicted-vs-true scatter plots side by side.

    The left panel shows the train sample, the right panel the test sample.
    """
    # (R2 label, subplot position, plot title, true values, predicted values)
    panels = (
        ("Train R2:\t", 121, 'Train sample prediction', train_true_values, train_pred_values),
        ("Test R2:\t", 122, 'Test sample prediction', test_true_values, test_pred_values),
    )

    for r2_label, _, _, actual, predicted in panels:
        score = round(r2(actual, predicted), 3)
        print(r2_label + str(score))

    plt.figure(figsize=(18,10))

    for _, position, title, actual, predicted in panels:
        plt.subplot(position)
        sns.scatterplot(x=predicted, y=actual)
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title(title)

    plt.show()

In [None]:
TRAIN_DATASET_PATH = '../input/real-estate-price-prediction-moscow/train.csv'
TEST_DATASET_PATH = '../input/real-estate-price-prediction-moscow/test.csv'

Описание датасета
1. 
1. Id - идентификационный номер квартиры
1. DistrictId - идентификационный номер района
1. Rooms - количество комнат
1. Square - площадь
1. LifeSquare - жилая площадь
1. KitchenSquare - площадь кухни
1. Floor - этаж
1. HouseFloor - количество этажей в доме
1. HouseYear - год постройки дома
1. Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности
1. Social_1, Social_2, Social_3 - социальные показатели местности
1. Healthcare_1, Helthcare_2 - показатели местности, связанные с охраной здоровья
1. Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров
1. Price - цена квартиры

In [None]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
train_df.tail()

In [None]:
train_df.dtypes

In [None]:
test_df = pd.read_csv(TEST_DATASET_PATH)
test_df.tail()

In [None]:
print('Строк в трейне:', train_df.shape[0])
print('Строк в тесте', test_df.shape[0])

**Приведение типов**

In [None]:
train_df.dtypes


 EDA
Делаем EDA для:

1. Исправления выбросов
1. Заполнения NaN
1. Идей для генерации новых фич

In [None]:
# Histogram of the target column to inspect its distribution before modelling.
plt.figure(figsize = (16,8))

train_df['Price'].hist(bins=30)
plt.ylabel('Count')
plt.xlabel('Price')

plt.title('Target distribution')
plt.show()

In [None]:
train_df.describe()

In [None]:
train_df.info()

In [None]:
train_df['Id'] = train_df['Id'].astype('str')
train_df['DistrictId'] = train_df['DistrictId'].astype('str')

In [None]:
train_df.select_dtypes(include='object').columns.tolist()

In [None]:
train_df['DistrictId'].value_counts()

In [None]:
train_df['Ecology_2'].value_counts()

In [None]:
train_df['Ecology_3'].value_counts()

In [None]:
train_df['Shops_2'].value_counts()

 Обработка выбросов 
 Что можно делать с ними?

1. Выкинуть эти данные (только на трейне, на тесте ничего не выкидываем)
1. Заменять выбросы разными методами (медианы, средние значения, np.clip и т.д.)
1. Делать/не делать дополнительную фичу
1. Ничего не делать

In [None]:
train_df['Rooms'].value_counts()

In [None]:
train_df['Rooms_outlier'] = 0
train_df.loc[(train_df['Rooms'] == 0) | (train_df['Rooms'] >= 6), 'Rooms_outlier'] = 1
train_df.head()

In [None]:
train_df.loc[train_df['Rooms'] == 0, 'Rooms'] = 1
train_df.loc[train_df['Rooms'] >= 6, 'Rooms'] = train_df['Rooms'].median()

In [None]:
train_df['Rooms'].value_counts()

In [None]:
train_df['KitchenSquare'].value_counts()

In [None]:
train_df['KitchenSquare'].quantile(0.975), train_df['KitchenSquare'].quantile(0.05)

In [None]:
condition = (train_df['KitchenSquare'].isna()) \
    | (train_df['KitchenSquare'] > train_df['KitchenSquare'].quantile(.975))
train_df.loc[condition, 'KitchenSquare'] = train_df['KitchenSquare'].median()

train_df.loc[train_df['KitchenSquare'] < 3, 'KitchenSquare'] = 3

In [None]:
train_df['KitchenSquare'].value_counts()

**HouseFloor, Floor**

In [None]:
train_df['HouseFloor'].sort_values().unique()

In [None]:
train_df['Floor'].value_counts()

In [None]:
train_df[train_df['Floor'] > 27]

In [None]:
(train_df['Floor'] > train_df['HouseFloor']).sum()

In [None]:
train_df['HouseFloor_outlier'] = 0
train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'HouseFloor_outlier'] = 1

In [None]:
train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor'] = train_df['HouseFloor'].median()


In [None]:
train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'HouseFloor'] = train_df['Floor']

In [None]:
(train_df['Floor'] == train_df['HouseFloor']).sum()

In [None]:
train_df['HouseFloor'].value_counts()

In [None]:
train_df['HouseFloor'].describe()

In [None]:
train_df.loc[train_df['HouseFloor'] > train_df['HouseFloor'].quantile(.975), 'HouseFloor'] = train_df['HouseFloor'].median()

In [None]:
(train_df['Floor'] > train_df['HouseFloor']).sum()

In [None]:
train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'HouseFloor'] = train_df['Floor']

In [None]:
(train_df['Floor'] > train_df['HouseFloor']).sum()

In [None]:
train_df['HouseFloor'].value_counts()

In [None]:
train_df['HouseFloor'].hist(bins=20)
plt.ylabel('count')
plt.xlabel('HouseFloor')
plt.show()

In [None]:
train_df['Floor'].hist(bins=20)
plt.ylabel('count')
plt.xlabel('Floor')
plt.show()

In [None]:
train_df[train_df['Floor'] > 30]

In [None]:
train_df.loc[train_df['Floor'] > 30, 'HouseFloor_outlier'] = 1

In [None]:
train_df.loc[train_df['Floor'] > 30, 'Floor'] = 30
train_df.loc[train_df['HouseFloor'] > 30, 'HouseFloor'] = 30


**HouseYear**

In [None]:
train_df['HouseYear'].sort_values(ascending=False)

In [None]:
train_df.loc[train_df['HouseYear'] > 2020, 'HouseYear'] = 2020

**Обработка пропусков**

In [None]:
train_df.isna().sum()

In [None]:
train_df[['Square', 'LifeSquare', 'KitchenSquare']].head(10)

In [None]:
# Flag rows whose LifeSquare is missing before it gets filled, using the same
# column name ('LifeSquare_nan') that DataPreprocessing.transform produces.
train_df['LifeSquare_nan'] = train_df['LifeSquare'].isna() * 1

# Where Square and KitchenSquare are known, estimate the missing LifeSquare as
# Square - KitchenSquare - 3.
condition = (train_df['LifeSquare'].isna()) \
            & (~train_df['Square'].isna()) \
            & (~train_df['KitchenSquare'].isna())
train_df.loc[condition, 'LifeSquare'] = train_df.loc[condition, 'Square'] - train_df.loc[condition, 'KitchenSquare'] - 3

In [None]:
train_df.isna().sum()

**LIFESQUARE**

In [None]:
train_df['LifeSquare'].quantile(0.0255)

In [None]:
(train_df['LifeSquare'] < train_df['KitchenSquare']).sum()

In [None]:
train_df[train_df['LifeSquare'] == train_df['LifeSquare'].min()]

In [None]:
condition = (train_df['LifeSquare'].isna()) | (train_df['LifeSquare'] > train_df['LifeSquare'].quantile(0.975)) | \
(train_df['LifeSquare'] < train_df['LifeSquare'].quantile(0.025)) |  (train_df['LifeSquare'] < train_df['KitchenSquare'])
        
train_df.loc[condition, 'LifeSquare'] = train_df['LifeSquare'].median()

In [None]:
train_df['LifeSquare'].describe()

**Square**

In [None]:
train_df['Square'].describe()

In [None]:
train_df['Square'].quantile(0.025)

In [None]:
condition = (train_df['Square'].isna()) | (train_df['Square'] > train_df['Square'].quantile(0.975)) | (train_df['Square'] < train_df['Square'].quantile(0.025))
        
train_df.loc[condition, 'Square'] = train_df['Square'].median()

In [None]:
train_df['Square'].describe()

**ECOLOGY_1**

In [None]:
(train_df['Ecology_1']==0).sum()

In [None]:
class DataPreprocessing:
    """Clean the raw features: repair outliers and fill missing values.

    ``fit`` learns medians and quantile thresholds from one frame (normally the
    training split); ``transform`` re-applies those same statistics to any
    frame, so train, validation and test rows are treated identically.
    """

    def __init__(self):
        """Declare the statistics that ``fit`` fills in."""
        self.medians = None
        self.kitchen_square_quantile = None
        self.house_floor_quantile = None
        self.floor_quantile = None
        self.square_quantile_max = None
        self.square_quantile_min = None
        self.life_square_quantile_max = None
        self.life_square_quantile_min = None

    def fit(self, X):
        """Store the medians and quantile thresholds of ``X``."""
        # numeric_only=True: a median is undefined for string columns, and
        # pandas >= 2.0 raises instead of silently skipping them.
        self.medians = X.median(numeric_only=True)
        self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)
        self.house_floor_quantile = X['HouseFloor'].quantile(.975)
        self.floor_quantile = X['Floor'].quantile(.975)
        self.square_quantile_max = X['Square'].quantile(.975)
        self.square_quantile_min = X['Square'].quantile(.025)
        # These bounds must be set here: transform() compares LifeSquare against
        # them, and a comparison against None silently matches no row at all.
        self.life_square_quantile_max = X['LifeSquare'].quantile(.975)
        self.life_square_quantile_min = X['LifeSquare'].quantile(.025)

    def transform(self, X):
        """Return a cleaned copy of ``X``; the frame passed in is not modified.

        Adds the indicator columns ``Rooms_outlier``, ``HouseFloor_outlier``,
        ``HouseYear_outlier`` and ``LifeSquare_nan``, the helper columns
        ``inv_round_square``, ``inv_round_KitchenSquare`` and ``round_square``,
        and drops ``Healthcare_1`` when present.
        """
        # Work on a copy: the input is often a slice of another frame, and
        # writing into it in place would also make a second call on the same
        # frame behave differently from the first.
        X = X.copy()

        # Ecology_1: replace zeros with the fitted median
        X.loc[X['Ecology_1'] == 0, 'Ecology_1'] = self.medians['Ecology_1']

        # Rooms: 0 and >= 6 are flagged, then repaired
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1

        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']

        # KitchenSquare: missing or above the upper quantile -> median; floor at 3
        condition = (X['KitchenSquare'].isna()) \
                    | (X['KitchenSquare'] > self.kitchen_square_quantile)

        X.loc[condition, 'KitchenSquare'] = self.medians['KitchenSquare']

        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3

        # LifeSquare: the missing-value flag has to be taken BEFORE anything is
        # filled, otherwise it is always 0.
        X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1

        # First choice for a missing LifeSquare: Square - KitchenSquare - 3.
        condition = (X['LifeSquare'].isna()) & \
                    (~X['Square'].isna()) & \
                    (~X['KitchenSquare'].isna())

        X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 3

        # Whatever is still missing, outside the fitted quantile band, or smaller
        # than the kitchen gets the LifeSquare median (not the Square median).
        condition = (X['LifeSquare'].isna()) \
                    | (X['LifeSquare'] > self.life_square_quantile_max) \
                    | (X['LifeSquare'] < self.life_square_quantile_min) \
                    | (X['LifeSquare'] < X['KitchenSquare'])

        X.loc[condition, 'LifeSquare'] = self.medians['LifeSquare']

        # Square: missing or outside the fitted quantile band -> median
        condition = (X['Square'].isna()) \
                    | (X['Square'] > self.square_quantile_max) | (X['Square'] < self.square_quantile_min)

        X.loc[condition, 'Square'] = self.medians['Square']

        # HouseFloor, Floor: flag every row that one of the repairs below touches.
        # The conditions are OR-ed; HouseFloor == 0 and HouseFloor > quantile can
        # never hold at the same time, so AND-ing them flags nothing.
        X['HouseFloor_outlier'] = 0
        floor_issue = (X['HouseFloor'] == 0) \
                      | (X['HouseFloor'] > self.house_floor_quantile) \
                      | (X['Floor'] > X['HouseFloor']) \
                      | (X['Floor'] > self.floor_quantile)
        X.loc[floor_issue, 'HouseFloor_outlier'] = 1

        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']
        X.loc[X['HouseFloor'] > self.house_floor_quantile, 'HouseFloor'] = self.house_floor_quantile
        X.loc[X['Floor'] > self.floor_quantile, 'Floor'] = self.floor_quantile
        # A flat cannot be above the top floor: raise the house to the flat's floor.
        X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor'] = X['Floor']

        # HouseYear: years in the future are flagged and capped at the current year
        current_year = datetime.now().year

        X['HouseYear_outlier'] = 0
        X.loc[X['HouseYear'] > current_year, 'HouseYear_outlier'] = 1

        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year

        # Healthcare_1 is dropped rather than imputed
        if 'Healthcare_1' in X.columns:
            X = X.drop(columns='Healthcare_1')

        # Any remaining gaps in numeric columns get the fitted medians
        X = X.fillna(self.medians)

        # Rounded keys used later for the target-encoding features
        X['inv_round_square'] = (1 / X['Square']).round(2)
        X['inv_round_KitchenSquare'] = (1 / X['KitchenSquare']).round(2)
        X['round_square'] = X['Square'].round(0)

        return X

**ПОСТРОЕНИЕ НОВЫХ ПРИЗНАКОВ**

In [None]:
binary_to_numbers = {'A': 0, 'B': 1}
train_df['Ecology_2'] = train_df['Ecology_2'].replace(binary_to_numbers)
train_df['Ecology_3'] = train_df['Ecology_3'].replace(binary_to_numbers)
train_df['Shops_2'] = train_df['Shops_2'].replace(binary_to_numbers)

**District_Size, isDistrictLarge**

In [None]:
# Number of training rows per district.
# rename_axis + reset_index(name=...) yields the columns ['DistrictId', 'DistrictSize']
# on every pandas version. Renaming the auto-generated 'index' column breaks on
# pandas >= 2.0, where value_counts() names its index after the source column and
# the counts column is called 'count', so no 'DistrictId' key would be left to merge on.
district_size = train_df['DistrictId'].value_counts().rename_axis('DistrictId')\
                            .reset_index(name='DistrictSize')
district_size.head()

In [None]:
train_df = train_df.merge(district_size, on='DistrictId', how='left')
train_df.head()

In [None]:
(train_df['DistrictSize'] > 100).value_counts()

In [None]:
train_df['IsDistrictLarge'] = (train_df['DistrictSize'] > 100).astype(int)
train_df.columns

**MedPriceBySquare, MedPriceByInvSquare, MedPriceByDistrict**

In [None]:
train_df['round_square'] = round(train_df['Square'], 0)

train_df['inv_round_square'] = round(1/train_df['Square'], 2)
train_df['inv_round_KitchenSquare'] = round(1/train_df['KitchenSquare'], 2)

In [None]:
med_price_by_invsquare = train_df.groupby(['inv_round_square', 'inv_round_KitchenSquare'], as_index=False).agg({'Price':'median'}).rename(columns={'Price':'MedPriceByInvSquare'})
med_price_by_invsquare

In [None]:
med_price_by_square = train_df.groupby(['round_square', 'KitchenSquare'], as_index=False).agg({'Price':'median'}).rename(columns={'Price':'MedPriceBySquare'})
med_price_by_square

In [None]:
med_price_by_district = train_df.groupby(['DistrictId', 'Rooms'], as_index=False).agg({'Price':'median'})\
                                    .rename(columns={'Price':'MedPriceByDistrict'})
med_price_by_district

In [None]:
# Each lookup table was built by grouping on TWO columns, so it has to be merged on
# both of them. Merging on only the first key matches every row against all groups
# sharing that key, which duplicates rows and splits the second key into _x/_y columns.
train_df = train_df.merge(med_price_by_district, on=['DistrictId', 'Rooms'], how='left')
train_df = train_df.merge(med_price_by_invsquare, on=['inv_round_square', 'inv_round_KitchenSquare'], how='left')
train_df = train_df.merge(med_price_by_square, on=['round_square', 'KitchenSquare'], how='left')
train_df.head()

In [None]:
def floor_to_cat(X):
    """Add an integer ``floor_cat`` column that buckets ``Floor``, in place.

    Categories: 1 for floors up to 3, 2 for 4-5, 3 for 6-9, 4 for 10-15 and
    5 for anything above 15. Rows matching no bucket (e.g. NaN) keep 0.
    Returns the same frame that was passed in.
    """
    floors = X['Floor']
    upper_bounds = (3, 5, 9, 15)

    X['floor_cat'] = 0

    previous = None
    for category, upper in enumerate(upper_bounds, start=1):
        if previous is None:
            in_band = floors <= upper
        else:
            in_band = (floors > previous) & (floors <= upper)
        X.loc[in_band, 'floor_cat'] = category
        previous = upper

    # Everything above the last bound falls into the final category.
    X.loc[floors > previous, 'floor_cat'] = len(upper_bounds) + 1

    return X


def floor_to_cat_pandas(X):
    """Add an integer ``floor_cat`` column built with ``pd.cut``, in place.

    Buckets are right-closed: 0 for floors up to 3, 1 for 4-5, 2 for 6-9,
    3 for 10-15 and 4 for anything above 15. Missing floors get -1.
    Returns the same frame that was passed in.
    """
    # Open-ended outer edges instead of the data's own min/max:
    #  * with a right-closed first bin starting at min, the minimum floor itself
    #    falls outside every bin and would end up as -1;
    #  * min > 3 or max < 15 would make the edges non-monotonic and pd.cut raise.
    bins = [float('-inf'), 3, 5, 9, 15, float('inf')]
    categories = pd.cut(X['Floor'], bins=bins, labels=False)

    # Assign the filled result back; fillna(inplace=True) on a column selection
    # is not guaranteed to write through to the frame.
    X['floor_cat'] = categories.fillna(-1).astype(int)
    return X


def year_to_cat(X):
    """Add an integer ``year_cat`` column that buckets ``HouseYear``, in place.

    Categories: 1 up to 1941, 2 for 1942-1945, 3 for 1946-1980, 4 for
    1981-2000, 5 for 2001-2010 and 6 after 2010. Rows matching no bucket
    (e.g. NaN) keep 0. Returns the same frame that was passed in.
    """
    years = X['HouseYear']

    # One boolean mask per category, listed in category order (1, 2, 3, ...).
    era_masks = [
        years.le(1941),
        years.gt(1941) & years.le(1945),
        years.gt(1945) & years.le(1980),
        years.gt(1980) & years.le(2000),
        years.gt(2000) & years.le(2010),
        years.gt(2010),
    ]

    X['year_cat'] = 0
    for era, mask in enumerate(era_masks, start=1):
        X.loc[mask, 'year_cat'] = era

    return X


def year_to_cat_pandas(X):
    """Add an integer ``year_cat`` column built with ``pd.cut``, in place.

    Buckets are right-closed: 0 up to 1941, 1 for 1942-1945, 2 for 1946-1980,
    3 for 1981-2000, 4 for 2001-2010 and 5 after 2010. Missing years get -1.
    Returns the same frame that was passed in.
    """
    # Open-ended outer edges instead of the data's own min/max:
    #  * with a right-closed first bin starting at min, the oldest year itself
    #    falls outside every bin and would end up as -1;
    #  * min > 1941 or max < 2010 would make the edges non-monotonic and pd.cut raise.
    bins = [float('-inf'), 1941, 1945, 1980, 2000, 2010, float('inf')]
    categories = pd.cut(X['HouseYear'], bins=bins, labels=False)

    # Assign the filled result back; fillna(inplace=True) on a column selection
    # is not guaranteed to write through to the frame.
    X['year_cat'] = categories.fillna(-1).astype(int)
    return X

In [None]:
bins = [train_df['Floor'].min(), 3, 5, 9, 15, train_df['Floor'].max()]
pd.cut(train_df['Floor'], bins=bins, labels=False)

In [None]:
bins = [train_df['Floor'].min(), 3, 5, 9, 15, train_df['Floor'].max()]
pd.cut(train_df['Floor'], bins=bins)

In [None]:
train_df = year_to_cat(train_df)
train_df = floor_to_cat_pandas(train_df)


In [None]:
train_df.head()

In [None]:
train_df['feace'] = round(np.sqrt(train_df['Price']), 0)

In [None]:
# Median Social_2 for every (Social_1, rounded sqrt(Price)) group.
med_social_by_sqrt_price = train_df.groupby([ 'Social_1', 'feace' ], as_index=False).agg({'Social_2':'median'}).rename(columns={'Social_2':'med_social_by_sqrt_price'})
# sort_values has to be called, and on a DataFrame it needs the column to sort by;
# a bare `.sort_values` only displays the bound method.
# NOTE(review): sorting by the aggregated column is an assumption - confirm the intended key.
med_social_by_sqrt_price.tail(25).sort_values(by='med_social_by_sqrt_price')

**MedPriceByFloorYear**

In [None]:
med_price_by_floor_year = train_df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'median'})\
                                .rename(columns={'Price':'MedPriceByFloorYear'})
med_price_by_floor_year.head()

In [None]:
med_price_by_floor_year[(med_price_by_floor_year.year_cat == 6) & (med_price_by_floor_year.floor_cat == 2)]

In [None]:
# med_price_by_floor_year has one row per (year_cat, floor_cat) pair, so both columns
# are needed as merge keys; merging on year_cat alone would repeat every row once per
# floor category and split floor_cat into floor_cat_x / floor_cat_y.
train_df = train_df.merge(med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
train_df.head()

In [None]:
class FeatureGenetator():
    """Generate engineered features.

    ``fit`` learns district sizes and, when a target is supplied, median-price
    lookup tables (target encoding). ``transform`` attaches those features to
    any frame with the same raw columns.
    """

    def __init__(self):
        """Declare everything that ``fit`` fills in."""
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.district_size = None

        # Target-encoding lookup tables (stay None when fit() gets no target)
        self.med_price_by_district = None
        self.med_price_by_floor_year = None
        self.med_price_by_invsquare = None
        self.med_price_by_square = None
        self.med_social_by_price = None

        # Fallback values for groups that were not seen during fit()
        self.med_price_by_district_median = None
        self.med_price_by_floor_year_median = None
        self.med_price_by_invsquare_median = None
        self.med_price_by_square_median = None
        self.med_social_by_price_median = None

        # Observed ranges of the fitted frame (informational)
        self.house_year_max = None
        self.floor_max = None
        self.house_year_min = None
        self.floor_min = None

    @staticmethod
    def _add_square_keys(X):
        """Add the rounded (inverse) area columns used as target-encoding keys."""
        X['inv_round_square'] = (1 / X['Square']).round(2)
        X['inv_round_KitchenSquare'] = (1 / X['KitchenSquare']).round(2)
        X['round_square'] = X['Square'].round(0)
        return X

    @staticmethod
    def _median_price_table(df, keys, name):
        """Return (median Price per ``keys`` group as column ``name``, median of that column)."""
        table = df.groupby(keys, as_index=False).agg({'Price': 'median'})\
                  .rename(columns={'Price': name})
        return table, table[name].median()

    @staticmethod
    def _merge_encoding(X, table, keys, name, default):
        """Left-join a lookup table on ``keys``; unseen groups receive ``default``."""
        X = X.merge(table, on=keys, how='left')
        X[name] = X[name].fillna(default)
        return X

    def fit(self, X, y=None):
        """Learn district sizes and, if ``y`` is given, the median-price tables.

        ``y`` may be a Series, a one-column DataFrame or an array; it is
        flattened to one value per row of ``X``.
        """
        # Work on a copy and always build the grouping keys here, so fit() does
        # not depend on an earlier step having created 'round_square'.
        X = self._add_square_keys(X.copy())

        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}

        # DistrictId -> number of rows. rename_axis + reset_index(name=...) gives
        # the columns ['DistrictId', 'DistrictSize'] on every pandas version;
        # renaming the auto-generated 'index' column breaks on pandas >= 2.0.
        self.district_size = X['DistrictId'].value_counts().rename_axis('DistrictId')\
                                .reset_index(name='DistrictSize')

        self.floor_max = X['Floor'].max()
        self.floor_min = X['Floor'].min()
        self.house_year_max = X['HouseYear'].max()
        self.house_year_min = X['HouseYear'].min()

        # Target encoding needs the target; without it only the features above exist.
        if y is None:
            return

        df = X
        df['Price'] = np.asarray(y).ravel()

        ## District, Rooms
        self.med_price_by_district, self.med_price_by_district_median = \
            self._median_price_table(df, ['DistrictId', 'Rooms'], 'MedPriceByDistrict')

        ## inv_round_square, inv_round_KitchenSquare
        self.med_price_by_invsquare, self.med_price_by_invsquare_median = \
            self._median_price_table(df, ['inv_round_square', 'inv_round_KitchenSquare'], 'MedPriceByInvSquare')

        ## round_square, KitchenSquare
        self.med_price_by_square, self.med_price_by_square_median = \
            self._median_price_table(df, ['round_square', 'KitchenSquare'], 'MedPriceBySquare')

        ## Social_1, Social_2
        self.med_social_by_price, self.med_social_by_price_median = \
            self._median_price_table(df, ['Social_1', 'Social_2'], 'MedSocialByPrice')

        ## floor, year
        df = self.floor_to_cat(df)
        df = self.year_to_cat(df)
        self.med_price_by_floor_year, self.med_price_by_floor_year_median = \
            self._median_price_table(df, ['year_cat', 'floor_cat'], 'MedPriceByFloorYear')

    def transform(self, X):
        """Return a new frame with the engineered features; ``X`` is not modified."""
        # Copy first: mapping 'A'/'B' in place would turn the caller's columns
        # into numbers, and a second transform() of that frame would map them to NaN.
        X = self._add_square_keys(X.copy())

        # Binary features
        for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
            X[column] = X[column].map(self.binary_to_numbers)

        # DistrictSize, new_district, IsDistrictLarge
        X = X.merge(self.district_size, on='DistrictId', how='left')

        X['new_district'] = 0
        X.loc[X['DistrictSize'].isna(), 'new_district'] = 1

        # Districts unseen during fit() are given a size of 5
        X['DistrictSize'] = X['DistrictSize'].fillna(5)

        X['IsDistrictLarge'] = (X['DistrictSize'] > 100).astype(int)

        # More categorical features
        X = self.floor_to_cat(X)  # adds floor_cat
        X = self.year_to_cat(X)   # adds year_cat

        # Target encoding
        if self.med_price_by_district is not None:
            X = self._merge_encoding(X, self.med_price_by_district, ['DistrictId', 'Rooms'],
                                     'MedPriceByDistrict', self.med_price_by_district_median)

        if self.med_price_by_floor_year is not None:
            X = self._merge_encoding(X, self.med_price_by_floor_year, ['year_cat', 'floor_cat'],
                                     'MedPriceByFloorYear', self.med_price_by_floor_year_median)

        if self.med_price_by_invsquare is not None:
            X = self._merge_encoding(X, self.med_price_by_invsquare,
                                     ['inv_round_square', 'inv_round_KitchenSquare'],
                                     'MedPriceByInvSquare', self.med_price_by_invsquare_median)

        if self.med_price_by_square is not None:
            X = self._merge_encoding(X, self.med_price_by_square, ['round_square', 'KitchenSquare'],
                                     'MedPriceBySquare', self.med_price_by_square_median)

        if self.med_social_by_price is not None:
            X = self._merge_encoding(X, self.med_social_by_price, ['Social_1', 'Social_2'],
                                     'MedSocialByPrice', self.med_social_by_price_median)

        return X

    def floor_to_cat(self, X):
        """Add ``floor_cat`` (0-4, -1 for missing) to ``X`` in place and return it."""
        # Open-ended outer edges: with [floor_min, ..., floor_max] and right-closed
        # bins, the fitted minimum itself and any floor outside the fitted range
        # fall outside every bin and would end up as -1.
        bins = [float('-inf'), 3, 5, 9, 15, float('inf')]
        X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False).fillna(-1).astype(int)
        return X

    def year_to_cat(self, X):
        """Add ``year_cat`` (0-5, -1 for missing) to ``X`` in place and return it."""
        # Open-ended outer edges for the same reason as in floor_to_cat.
        bins = [float('-inf'), 1941, 1945, 1980, 2000, 2010, float('inf')]
        X['year_cat'] = pd.cut(X['HouseYear'], bins=bins, labels=False).fillna(-1).astype(int)
        return X

****

In [None]:
train_df.columns.tolist()

**РАЗБИЕНИЕ НА train и test**

In [None]:
feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'HouseYear',
                 'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3',
                 'Helthcare_2', 'Shops_1', 'Shops_2','Floor', 'HouseFloor']
new_feature_names = ['DistrictSize', 'IsDistrictLarge',\
                     'MedPriceByFloorYear','MedPriceByInvSquare', 'MedPriceBySquare', 'MedSocialByPrice']
target_name = ['Price']

In [None]:
train_df = pd.read_csv(TRAIN_DATASET_PATH)
test_df = pd.read_csv(TEST_DATASET_PATH)

X = train_df.drop(columns=target_name)
y = train_df[target_name]

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=21)

In [None]:
preprocessor = DataPreprocessing()
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)
test_df = preprocessor.transform(test_df)

X_train.shape, X_valid.shape, test_df.shape

In [None]:
preprocessor.medians

In [None]:
features_gen = FeatureGenetator()
features_gen.fit(X_train, y_train)

X_train = features_gen.transform(X_train)
X_valid = features_gen.transform(X_valid)
test_df = features_gen.transform(test_df)

X_train.shape, X_valid.shape, test_df.shape

In [None]:
X_train.columns

In [None]:
X_train = X_train[feature_names + new_feature_names]
X_valid = X_valid[feature_names + new_feature_names]
test_df = test_df[feature_names + new_feature_names]

In [None]:
X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()


In [None]:
X_valid.info()

**ПОСТРОЕНИЕ МОДЕЛИ**

In [None]:
import numpy as np
import pandas as pd
import pickle   # сохранение модели

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 2. Разделение датасета
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# 3. Модели
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

# 4. Метрики качества
from sklearn.metrics import mean_squared_error as mse, r2_score as r2

# 5. Для визуализации внешних картинок в ноутбуке
from IPython.display import Image

In [None]:
X_train.head()

In [None]:
import seaborn as sns


In [None]:
def evaluate_preds(true_values, pred_values, save=False, diag_max=500000):
    """Print R2 / RMSE / MSE and plot predicted against true values.

    NOTE: this redefines the earlier four-argument ``evaluate_preds`` from the
    top of the notebook; after this cell runs only this signature is available.

    Parameters
    ----------
    true_values : array-like
        Ground-truth target values.
    pred_values : array-like
        Model predictions, in the same order as ``true_values``.
    save : bool, default False
        When true, also save the figure as ``report.png``.
    diag_max : float, default 500000
        Upper end of the dashed reference diagonal (where true == predicted).
    """
    # Compute MSE once and reuse it for the RMSE line.
    mse_value = mse(true_values, pred_values)

    print("R2:\t" + str(round(r2(true_values, pred_values), 3)) + "\n" +
          "RMSE:\t" + str(round(np.sqrt(mse_value), 3)) + "\n" +
          "MSE:\t" + str(round(mse_value, 3))
         )

    plt.figure(figsize=(8, 8))

    plt.scatter(x=pred_values, y=true_values)
    plt.plot([0, diag_max], [0, diag_max], linestyle='--', color='black')  # diagonal where true_values = pred_values

    plt.xlabel('Predicted values')
    plt.ylabel('True values')
    plt.title('True vs Predicted values')

    if save:
        # REPORTS_FILE_PATH is not defined anywhere in this notebook; fall back to
        # the working directory instead of raising NameError.
        reports_dir = globals().get('REPORTS_FILE_PATH', '')
        plt.savefig(reports_dir + 'report.png')
    plt.show()

In [None]:
# Random forest baseline.
# `criterion` is left at its default, which is mean squared error for
# RandomForestRegressor. Spelling it as 'mse' is rejected by scikit-learn >= 1.2,
# where the option is called 'squared_error'.
rf_model = RandomForestRegressor(max_depth=7, # tree depth
                                 min_samples_leaf=35, # minimum number of samples in a leaf
                                 random_state=21,
                                 n_estimators=100  # number of trees
                                 )

# np.ravel: y_train is a one-column frame, while fit() expects a 1-D target.
rf_model.fit(X_train, np.ravel(y_train))
y_train_preds = rf_model.predict(X_train)
evaluate_preds(y_train, y_train_preds)

In [None]:
y_test_preds = rf_model.predict(X_valid)
evaluate_preds(y_valid, y_test_preds)

In [None]:
# Gradient boosting model.
# The 'mse' criterion was renamed to 'squared_error' in scikit-learn 1.0 and the old
# spelling was removed in 1.2, so use whichever name the installed version accepts.
# (Dropping the argument is not equivalent here: the default is 'friedman_mse'.)
import sklearn
_sk_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
gb_criterion = 'squared_error' if _sk_version >= (1, 0) else 'mse'

gb_model = GradientBoostingRegressor(criterion=gb_criterion,
                                     max_depth=4,
                                     min_samples_leaf=15,
                                     random_state=42,
                                     n_estimators=100)
# np.ravel: y_train is a one-column frame, while fit() expects a 1-D target.
gb_model.fit(X_train, np.ravel(y_train))

y_train_preds = gb_model.predict(X_train)
evaluate_preds(y_train, y_train_preds)

In [None]:
y_test_preds = gb_model.predict(X_valid)
evaluate_preds(y_valid, y_test_preds)

In [None]:
feature_importances = pd.DataFrame(zip(X_train.columns, gb_model.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False)

In [None]:
test_df.shape

In [None]:
X_train.shape

In [None]:
submit = pd.read_csv('/kaggle/input/real-estate-price-prediction-moscow/sample_submission.csv')
submit.tail()

In [None]:
predictions = gb_model.predict(test_df)
predictions

In [None]:
submit['Price'] = predictions
submit.tail()

In [None]:
submit.head()

In [None]:
submit.to_csv('gd6_submit.csv', index=False)