In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import math
import scipy.stats as st
from sklearn import preprocessing

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import pickle

# plt.style.use('ggplot')
# plt.rcParams['figure.figsize'] = (18,12)

In [2]:
def rmsle(y, y_hat):
    res = 0
    for i, j in zip(y, y_hat):
        res += (math.log(i + 1) - math.log(j + 1)) ** 2
    return (res / len(y)) ** 0.5

def rmse(y_actual, y_pred):
    return np.sqrt(mean_squared_error(y_actual, y_pred))

rmsle_score = make_scorer(rmsle, greater_is_better=False)

rmse_score = make_scorer(rmse, greater_is_better=False)

## Data Uploading

In [3]:
# объединенный трейн и тест датасеты с геогр.координатами
with open('df.pkl', 'rb') as f:
    df_coordinates = pickle.load(f)
f.close()
del f

In [4]:
df_train = df_coordinates[df_coordinates['sampletype']=='train']
df_train.shape

(262893, 42)

In [5]:
df_test = df_coordinates[df_coordinates['sampletype']=='test']
df_test.shape

(129485, 42)

In [6]:
del df_coordinates

## Test

## EDA

In [7]:
def df_print_unique(df_input):
    df_output = df_input.fillna(-100)
    for col in df_output.columns:
        print(df_output[col].value_counts())
        print(f'nunique = {df_output[col].nunique()}')
        print("~"*50)
        
#df_print_unique(df_train)

In [8]:
# for col in df_train.columns:
#     print(df_train.groupby('Рубрика')[col].count())
#     print('---------------------------------')

## Filling NaN

In [9]:
# for col in df_train.columns:
#     print(col)
#     print(pd.unique(df_train[col].values))
#     print('---------------------------------')

In [10]:
# кол-во пропусков в процентах
#df_train.isna().sum()*100/df_train.shape[0]

In [11]:
def fill_nan(df_input):
    df_output = df_input.copy()
    fill_with_minus_1 = ['Коммунальные платежи включены',
                  'Находится в залоге',
                  'Внесен задаток',
                  'Возможен обмен',
                  'Ипотека',
                  'Чистая продажа',
                  'Торг',
                  'Интернет',
                  'Наличие мебели',
                  'Наличие холодильника',
                  'Площадь кухни',
                  'Жилая площадь',
                  'Общая площадь',
                  'Год сдачи',
                  'Санузел',
                  'Ремонт',
    ]
    
    fill_with_other = ['Ориентир',
                      'Тип квартиры',
                      'Планировка',
                      'Форма собственности',
                      'Тип дома',       
                      'Материал дома'                     
    ]
    
    fill_with_unkn = ['Улица',
                      'Номер дома',
                      'Район', 
                      'Микрорайон'
    ]
    
    df_output[fill_with_minus_1] = df_output[fill_with_minus_1].fillna(value=-1)
    df_output[fill_with_other] = df_output[fill_with_other].fillna(value='другое')
    df_output[fill_with_unkn] = df_output[fill_with_unkn].fillna(value='не указано')
    
    return df_output

## Data Cleaning (preprocessing outliers + garbage)

In [12]:
def clean_test_data(df_input):
    df_output = df_input.copy()

    df_output['Форма собственности'] = df_output['Форма собственности'].replace(
        {'другое (укажите в описании)': 'другое'}) 
    df_output['Материал дома'] = df_output['Материал дома'].replace({'кирпич-монолит':'кирпич - монолит'})
    
    # исправляем выборосы из координат: сравниваем реальные координаты городов и регионов с раскодированными, удаляем
    # слишком далекие друг от друга (по евклидовому расстоянию), заменяем на средние lon и lat по городу
#     df_tmp = pd.merge(
#                 df_output, df_output.groupby(['Регион', 'Город']).agg({'lat': 'mean', 'lon': 'mean'}).reset_index(), 
#                 how='left', 
#                 left_on=['Регион', 'Город'], 
#                 right_on=['Регион', 'Город'])
#     df_tmp['dist'] = df_tmp.apply(
#         lambda r: ((r['lat_x'] - r['lat_y'])**2 + (r['lon_x'] - r['lon_y'])**2)**0.5, axis=1)
#     for i in df_tmp[(~df_tmp.location_full_match) & (df_tmp.dist > 5)].index.values:
#         if str(df_tmp.loc[i, 'Регион']).lower() not in str(df_tmp.loc[i, 'location_display_name']).lower() \
#         and str(df_tmp.loc[i, 'Город']).lower() not in str(df_tmp.loc[i, 'location_display_name']).lower():
#             df_output.loc[i, 'lat'] = df_tmp.loc[i, 'lat_y']
#             df_output.loc[i, 'lon'] = df_tmp.loc[i, 'lon_y']
            
            
    # удаляем выборосы из всей выборки
    outliers = [] 
    
    outliers.extend(df_output[(df_output['Цена']>=10000000)&(df_output['Общая площадь']<=50)|
                              (df_output['Цена']>20000000)&(df_output['Общая площадь']<100)|
                              (df_output['Цена']==350000000)|
                              (df_output['Цена']==111111111)|
                              (df_output['Цена']==85000000)|
                              (df_output['Цена']==68000000)|
                              (df_output['Общая площадь']<=5)
                             ].index)
    outliers.extend(df_output[(df_output['Этажность']>52)].index)
    outliers.extend(df_output[(df_output['Количество лоджий']>20)|(df_output['Количество балконов']>20)].index)
    outliers.extend(
        df_output[
            (df_output['Общая площадь']>400)|
            (df_output['Площадь кухни']>df_output['Общая площадь'])|
            (df_output['Жилая площадь']>df_output['Общая площадь'])].index
    )
    
    df_output = df_output.drop(df_output.index[outliers])
    print('Удалено общих выбросов:', len(outliers))
    
#     # ДЛЯ АРЕНДЫ
#     outliers_rent = []
#     outliers_rent.extend(
#         (df_output[(df_output['Рубрика']=='Аренда')&(
#             (df_output['Цена']<1000)|
#             (df_output['Цена']>150000))]).index)
#     df_output = df_output.drop(outliers_rent)
#     print('Удалено выбросов аренды:', len(outliers_rent))
    
#     # ДЛЯ ПРОДАЖИ
#     outliers_sale = []
#     outliers_sale.extend(
#         (df_output[(df_output['Рубрика']=='Продажа')&(
#             (df_output['Цена']<50000)|
#             (df_output['Цена']>45000000))]).index)
#     df_output = df_output.drop(outliers_sale)
#     print('Удалено выбросов продаж:', len(outliers_sale))
    
    return df_output

In [13]:
# plt.scatter(df_train['lat'], df_train['lon'])
# plt.scatter(df_train_preproc_NA_OUT['lat'], df_train_preproc_NA_OUT['lon'])
# plt.ylabel('lon')
# plt.xlabel('lat')

## Feature Encoding

In [14]:
# for c in df_train_preproc_NA_OUT.columns:
#     if df_train_preproc_NA_OUT[c].dtype == 'object':
#         print(df_train_preproc_NA_OUT.groupby(c)[c].count())
#         print('----------------------------')

In [15]:
def feature_encoding(df_input):
    df_output = df_input.copy()
    
    features_to_ohe = [] # порядок на переменных не задан
    
    df_output['Количество комнат'] = df_output['Количество комнат'].replace({'комната': 0.5}).astype('float64')
    
    df_output['Ремонт'] = df_output['Ремонт'].replace(
                                                        {
                                                        'требует капитального ремонта':1,
                                                        'требует косметического ремонта':10,
                                                        'хорошее':20,
                                                        'в отличном состоянии':100},
    ).astype('int64') 
    
    df_output['Санузел'] = df_output['Санузел'].replace(
                                                        {'другое':1,
                                                        'совмещенный':2,
                                                        'раздельный':3,
                                                        'несколько':6}
    ).astype('int64') 
    
    df_output['Планировка'] = df_output['Планировка'].replace(
                                                        {'другое': -1,
                                                        'смежная':1,
                                                        'смежно-изолированная':10,
                                                        'свободная':20,
                                                        'изолированная':100},
    ).astype('int64') 
    
    df_output['Тип квартиры'] = df_output['Тип квартиры'].replace(
                                                        {'другое': -1,
                                                        'гостинка':1,
                                                        'студия':2,
                                                        'улучшенной планировки':3,
                                                        'индивидуальной планировки':4,
                                                        'двухуровневая':5,
                                                        'пентхаус':6}
     ).astype('int64')
        
    df_output['Материал дома'] = df_output['Материал дома'].replace(
                                                        {'другое': -1,
                                                        'сборный железобетон':1,
                                                        'панель':2,
                                                        'монолит':3,
                                                        'кирпич - монолит':4,
                                                        'кирпич':5,
                                                        'шлакоблоки':6,
                                                        'силикатные блоки':7,
                                                        'бетонные блоки':8,
                                                        'бетонные блоки - монолит':9,
                                                        'дерево':9}
     ).astype('int64')
    
    
    df_output['Форма собственности'] = df_output['Форма собственности'].replace(
                                                        {'другое': -1,
                                                        'не оформлена':1,
                                                        'государственная':2,
                                                        'предварительный договор':3,
                                                        'участие в ЖСК':4,
                                                        'кооперативная':5,
                                                        'инвестиционная':6,
                                                        'договор долевого участия':7,
                                                        'частная':8,
                                                        'свидетельство о праве собственности':9}
     ).astype('int64')
    
    df_output['Тип дома'] = df_output['Тип дома'].replace(
                                                        {'другое': -1,
                                                        'общежитие':1,
                                                        'секционного типа':2,
                                                        'малоэтажка':3,
                                                        'ульяновка':4,
                                                        'хрущевка':5,
                                                        'брежневка':6,
                                                        'сталинка':7,
                                                        'ленинградский проект':8,
                                                        '93 серия':9,
                                                        '95 серия':10,
                                                        '97 серия':11,
                                                        '97 серия улучшенная':12,
                                                        '121 серия':13,
                                                        '121Т':14,
                                                        'спецпроект':15}
     ).astype('int64')
       
    return df_output

In [16]:
df_train_preproc_NA_OUT_ENC = df_train.pipe(fill_nan).pipe(clean_test_data).pipe(feature_encoding)

Удалено общих выбросов: 210


In [17]:
# def fill_na_with_predict (df_input):
#     df_output = df_input.copy()
    
#     # заполняем пропуски Общей площади методом ближайших соседей
#     knn = KNeighborsRegressor(n_neighbors=15, weights='distance')
#     knn.fit(df_output.loc[df_output['Общая площадь'].notna(), ['lat', 'lon', 'Количество комнат']], 
#             df_output.loc[df_output['Общая площадь'].notna(), 'Общая площадь'])
#     df_output.loc[df_output['Общая площадь'].isna(), 'Общая площадь'] = \
#     knn.predict(df_output.loc[df_output['Общая площадь'].isna(), ['lat', 'lon', 'Количество комнат']])
    
#     return df_output

#df_train_preproc_NA_OUT_ENC = df_train_preproc_NA_OUT_ENC.pipe(fill_na_with_predict)

In [18]:
#df_print_unique(df_train_preproc_NA_OUT_ENC)

In [19]:
# for c in df_train_preproc_NA_OUT_ENC.columns:
#     if df_train_preproc_NA_OUT_ENC[c].dtype == 'object':
#         print(df_train_preproc_NA_OUT.groupby(c)[c].count())
#         print('----------------------------')

In [20]:
def check_skewness_log(col, train):
    sns.distplot(np.log1p(train[col]+1), fit=st.norm);
    fig = plt.figure()
    (mu, sigma) = st.norm.fit(train[col])

In [21]:
def check_skewness(col, train):
    sns.distplot(train[col]+1, fit=st.norm);
    fig = plt.figure()
    (mu, sigma) = st.norm.fit(train[col])

In [22]:
# for c in df_train_preproc_NA_OUT_ENC.columns:
#     if df_train_preproc_NA_OUT_ENC[c].dtype != 'object':
#         check_skewness_log(c, df_train_preproc_NA_OUT_ENC)

## Visualization

In [23]:
# plt.figure(figsize=(20,10))
# sns.heatmap(df_train_preproc_NA_OUT_ENC.corr('kendall'), annot=True, linewidths=.5)

In [24]:
# # most correlated features
# corrmat = df_train_preproc_NA_OUT_ENC.corr()
# top_corr_features = corrmat.index[abs(corrmat["Цена"])>0.3]
# plt.figure(figsize=(7,5))
# g = sns.heatmap(df_train_preproc_NA_OUT_ENC[top_corr_features].corr(),annot=True,cmap="RdYlGn")

## Feature engineering  

In [25]:
import re

In [26]:
def add_features(df_input):
    df_output = df_input.copy()

    centroid = df_output[['Город', 'lat', 'lon']].groupby('Город').mean()
    df_output = df_output.merge(centroid, on='Город', suffixes=('_h', '_centr'), right_index=True)
    df_output['dist_center'] = ((df_output.lat_h-df_output.lat_centr)**2+(df_output.lon_h-df_output.lon_centr)**2)**0.5
    df_output['dist_center2'] = np.log1p(df_output['dist_center'])
    df_output['house_age'] = df_output['Год сдачи'].apply(lambda x: -1 if x<0 else 2019-x if 2019-x>0 else 0.5)
    df_output['not_first_floor'] = df_output['Этаж'].apply(lambda x: 0 if x==1 else -1 if x==-1 else 1)
    df_output['tmp'] = df_output['Этаж']/df_output['Этажность']
    df_output['not_top_floor'] = df_output['tmp'].apply(lambda x: 0 if x==1 else -1 if x<0 else 1)
    df_output['has_balcony'] = (df_output['Количество балконов'].apply(lambda x: 0 if x<=0 else 1) + \
                                    df_output['Количество лоджий'].apply(lambda x: 0 if x<=0 else 1))/2
    df_output['index_price_perv'] = df_output['Регион'].replace(
                                                        {
                                                        'Новосибирская область':105.7,
                                                        'Свердловская область':103.3,
                                                        'Омская область':102.9,
                                                        'Тюменская область':103.0,
                                                        'Челябинская область':101.0,
                                                        'Красноярский край':102.1,
                                                        'Пермский край':106.4,
                                                        'Архангельская область':100.7,
                                                        }
                            )
    df_output['index_price_vtor'] = df_output['Регион'].replace(
                                                        {
                                                        'Новосибирская область':101.8,
                                                        'Свердловская область':103.0,
                                                        'Омская область':105.1,
                                                        'Тюменская область':100.3,
                                                        'Челябинская область':100.1,
                                                        'Красноярский край':101.2,
                                                        'Пермский край':92.4,
                                                        'Архангельская область':101.4,
                                                        }
                            )
    df_output['zp'] = df_output['Регион'].replace(
                                                        {
                                                        'Новосибирская область':37173,
                                                        'Свердловская область':36853,
                                                        'Омская область':34136,
                                                        'Тюменская область':72289,
                                                        'Челябинская область':36619,
                                                        'Красноярский край':48980,
                                                        'Пермский край':37173,
                                                        'Архангельская область':51236,
                                                        }
                            )
    df_output['metro'] = df_output['Ориентир'].apply(lambda x: 1 if re.search('метро', str(x)) else -1 )
    enc = preprocessing.LabelEncoder()
    df_output['town'] = enc.fit_transform(df_output['Город'])
    
    #df_output.plot(x='metro', y='Цена', kind='scatter')
    
    cols_to_drop = [
        'Регион',
        'Город',
        'Улица',
        'Номер дома',
        'Район',
        'Микрорайон',
        'Ориентир',
        'Рубрика', 
        'sampletype',
        'address',
        'location_display_name',
        'location_full_match',
    ]
    
    df_output = df_output.drop(cols_to_drop, axis=1)
    
    return df_output

## Modeling

In [30]:
# Добавляем новые фичи, делим тестовую выбрку на аренду и продажу, отделяем таргет (цену)
df_train_rent = df_train_preproc_NA_OUT_ENC[df_train_preproc_NA_OUT_ENC['Рубрика'] == 'Аренда'].pipe(add_features)
df_train_sale = df_train_preproc_NA_OUT_ENC[df_train_preproc_NA_OUT_ENC['Рубрика'] == 'Продажа'].pipe(add_features)

cols_to_drop_rent = [
        'Цена', 'Торг', 'Чистая продажа', 'Ипотека', 'lat_centr', 'lon_centr',
        'Возможен обмен', 'Внесен задаток', 'Находится в залоге', 'tmp', 'town',
        'index_price_perv', 'index_price_vtor', 'zp', 'dist_center2'
]

cols_to_drop_sale = [
       'Цена', 'Наличие мебели', 'Наличие холодильника', 'Коммунальные платежи включены', 
       'lat_centr', 'lon_centr', 'tmp', 'index_price_perv', 'index_price_vtor', 'zp', 'dist_center2'
]

df_train_rent_target = df_train_rent['Цена']
df_train_sale_target = df_train_sale['Цена']

df_train_rent.drop(cols_to_drop_rent, axis=1, inplace=True)
df_train_sale.drop(cols_to_drop_sale, axis=1, inplace=True)

del df_train_preproc_NA_OUT_ENC
del df_train

In [31]:
# sns.pairplot(df_train_rent, vars=['Количество комнат', 'Общая площадь', 'Жилая площадь', 'Площадь кухни',
#        'Этаж', 'Этажность','Цена'], diag_kind='hist')

In [32]:
# sns.pairplot(df_train_rent, vars=['lat_h', 'lon_h', 'lat_centr',
#        'lon_centr', 'dist_center', 'dist_center2', 'dist_center3',
#        'dist_center4', 'house_age', 'house_age2', 'index_price_perv',
#        'index_price_vtor', 'inds', 'zp', 'zp2', 'metro', 'Цена'], diag_kind='hist')

In [33]:
print('Аренда:',df_train_rent.columns,'\n-----\n',df_train_rent.shape,df_train_rent_target.shape,'\n-----')
print('Продажа:',df_train_sale.columns,'\n-----\n',df_train_sale.shape,df_train_sale_target.shape,'\n-----')

Аренда: Index(['Количество комнат', 'Общая площадь', 'Жилая площадь', 'Площадь кухни',
       'Этаж', 'Этажность', 'Тип квартиры', 'Планировка', 'Ремонт', 'Санузел',
       'Количество балконов', 'Количество лоджий', 'Наличие мебели',
       'Наличие холодильника', 'Интернет', 'Форма собственности',
       'Коммунальные платежи включены', 'Год сдачи', 'Тип дома',
       'Материал дома', 'Количество фотографий', 'lat_h', 'lon_h',
       'dist_center', 'house_age', 'not_first_floor', 'not_top_floor',
       'has_balcony', 'metro'],
      dtype='object') 
-----
 (94642, 29) (94642,) 
-----
Продажа: Index(['Количество комнат', 'Общая площадь', 'Жилая площадь', 'Площадь кухни',
       'Этаж', 'Этажность', 'Тип квартиры', 'Планировка', 'Ремонт', 'Санузел',
       'Количество балконов', 'Количество лоджий', 'Интернет', 'Торг',
       'Чистая продажа', 'Ипотека', 'Возможен обмен', 'Внесен задаток',
       'Находится в залоге', 'Форма собственности', 'Год сдачи', 'Тип дома',
       'Материал до

## Rent

In [34]:
def rent_model_01(X_train, X_test, y_train):
        class RentKNNImputer(BaseEstimator, TransformerMixin):
            def fit(self, X, y):
                return self
            def transform(self, X):
                cols = [
                    'Количество комнат', 
                    'Общая площадь', 
                    'Площадь кухни', 
                    'lat_h', 
                    'lon_h',
                    'Жилая площадь',
                    'dist_center',
                    'dist_center2',
                    'dist_center3',
                    'dist_center4',
                    'house_age'
            ]
                return X[cols].copy()

        pipe = Pipeline([
            ('imputer', RentKNNImputer()),
            ('scaler', StandardScaler()),
            ('knn', KNeighborsRegressor()),
        ])

        param_grid = {
            'knn__n_neighbors': [12],
            'knn__p': [1]
        }
        
        kfold = KFold(n_splits=3, shuffle=True, random_state=0)
        hyper_search = GridSearchCV(pipe, param_grid, scoring=rmse_score, cv=kfold, n_jobs=2, refit=True, verbose=2)
        hyper_search.fit(X_train, y_train)
        model = hyper_search.best_estimator_
        print(f"RentKNNModel: best_params = {hyper_search.best_params_}, best_score = {hyper_search.best_score_}")
        y_pred = model.predict(X_test)
        
        return y_pred

In [35]:
X_train, X_test, y_train, y_test = train_test_split(df_train_rent, df_train_rent_target, random_state=0, test_size=0.5)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [36]:
pred_knn = rent_model_01(X_train, X_test, y_train)
pred_knn

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


KeyError: "['dist_center4', 'dist_center2', 'dist_center3'] not in index"

In [None]:
print(f"score = {rmsle(np.expm1(y_test), np.expm1(pred_knn))}") #score = 0.23534122382702258

In [None]:
pred_knn.shape

In [None]:
X_test_knn = X_test.copy()
X_test_knn = pd.concat([X_test_knn, pd.Series(pred_knn, index=X_test.index, name='knn')], axis=1, sort=False)

In [None]:
X_test_knn['knn'] = np.log1p(X_test_knn['knn'])
y_test_knn = y_test.copy()
#y_test_knn = np.log1p(y_test)

In [None]:
def rent_model_02(X_train, X_test, y_train):
        class RentLRImputer(BaseEstimator, TransformerMixin):
            def fit(self, X, y):
                return self

            def transform(self, X):
                cols = [
                    'Количество комнат', 
                    'Общая площадь', 
                    'Жилая площадь', 
                    'Площадь кухни',
                    'Этаж', 
                    'Этажность', 
                    'Тип квартиры', 
                    'Планировка', 
                    'Ремонт', 
                    'Санузел',
                    'Количество балконов', 
                    'Количество лоджий', 
                    'Интернет', 
                    'Торг',
                    'Чистая продажа', 
                    'Ипотека', 
                    'Возможен обмен', 
                    'Внесен задаток',
                    'Находится в залоге', 
                    'Форма собственности', 
                    'Год сдачи', 
                    'Тип дома',
                    'Материал дома', 
                    'Количество фотографий', 
                    'lat_h', 
                    'lon_h', 
                    'dist_center', 
                    'dist_center2',
#                     'dist_center3',
#                     'dist_center4',
                    'index_price_perv',
                    'index_price_vtor',
                    #'inds',
                    'zp',
                    'metro',
                    'house_age',
                    'not_first_floor',
                    'not_top_floor',
                    'has_balcony',
                    #'region',
                    #'town',
                    #'urb'
                ]
                return X[cols].copy()

        pipeline = Pipeline([
            ('imputer', RentLRImputer()),
            ('scaler', MinMaxScaler()),
            ('lr', Ridge()),
        ])

        param_grid = {
            'lr__alpha': [1, 10, 100, 1000],
        }
        
        hyper_search = GridSearchCV(pipeline, param_grid, scoring=rmse_score, cv=3, n_jobs=5, refit=True, verbose=2)
        hyper_search.fit(X_train, y_train)
        model = hyper_search.best_estimator_
        print(f"RentLRModel:best_params = {hyper_search.best_params_}, best_score = {hyper_search.best_score_}")
        y_pred = model.predict(X_test)
        return y_pred

In [None]:
#X_train2, X_test2, y_train2, y_test2 = train_test_split(X_test_knn, y_test_knn, random_state=0, test_size=0.3)
#y2 = np.log1p(y_test_knn)

In [None]:
pred_lr = rent_model_02(X_train2, X_test2, y_train2)

In [None]:
print(f"score = {rmsle(np.expm1(y_test2), np.expm1(pred_lr))}") #score = 0.4293694740000044

In [None]:
def rent_model_03(X_train, X_test, y_train):
        class RentRFRImputer(BaseEstimator, TransformerMixin):
            def fit(self, X, y):
                return self
            def transform(self, X):
                cols = [
                    'Количество комнат', 
                    'Общая площадь', 
                    'Площадь кухни', 
                    'Этаж',
                    'Этажность', 
                    'Тип квартиры', 
                    'Планировка', 
                    'Ремонт', 
                    'Санузел',
                    'Количество балконов', 
                    'Количество лоджий', 
                    'Наличие мебели',
                    'Наличие холодильника', 
                    'Интернет',
                    'Коммунальные платежи включены', 
                    'Тип дома', 
                    'Материал дома',
                    'Количество фотографий', 
                    'lat_h', 
                    'lon_h',
                    'Год сдачи',
                    'Жилая площадь',
                    'dist_center',
                    'dist_center2',
                    'dist_center3',
                    'dist_center4',
                    'index_price_perv',
                    'index_price_vtor',
                     'inds',
                     'zp',
#                     'zp2',
                    'metro',
                    'house_age',
                    #'not_first_not_top',
                    'not_first_floor',
                    'not_top_floor',
                    'has_balcony',
                    'region',
                    'town',
                    #'urb'
            ]
                return X[cols].copy()

        pipe = Pipeline([
            ('imputer', RentRFRImputer()),
            ('rfr', RandomForestRegressor()),
        ])

        param_grid={
            'rfr__max_depth': [25],
            'rfr__n_estimators': [10],
            #'rfr__max_features': ['sqrt'],
            #'rfr__min_samples_split': range(2, 10),
            #'rfr__min_samples_leaf': range(1, 500, 50),
            #'rfr__warm_start': [True],
            #'rfr__bootstrap': [False],
        },
        
        kfold = KFold(n_splits=3, shuffle=True, random_state=0)
        hyper_search = GridSearchCV(pipe, param_grid, scoring=rmsle_score, cv=kfold, n_jobs=2, refit=True, verbose=50)
        hyper_search.fit(X_train, y_train)
        model = hyper_search.best_estimator_
        print(f"RentRFRModel: best_params = {hyper_search.best_params_}, best_score = {hyper_search.best_score_}")
        
#         # посмотрим на важность признаков
#         features = X_train.columns
#         importances = hyper_search.best_estimator_.named_steps['rfr'].feature_importances_
#         indices = np.argsort(importances)
#         plt.title('Feature Importances')
#         plt.barh(range(len(indices)), importances[indices], color='b', align='center')
#         plt.yticks(range(len(indices)), [features[i] for i in indices])
#         plt.xlabel('Relative Importance')
#         plt.show()
        
        prediction = model.predict(X_test)
        
        if 'Цена' in X_test.columns or 'Цена' in X_train.columns: print('\n=======ALERT DATA LEAK!======\n')
        
        return pd.Series(prediction, index=X_test.index, name='Цена')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train_rent, df_train_rent_target, random_state=0, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
pred_train_rent_rfr = rent_model_03(X_train, X_test, y_train)
print(pred_train_rent_rfr.shape)
pred_train_rent_rfr

In [None]:
print(f"score = {rmsle(y_test, pred_train_rent_rfr)}") # score = 0.35763077798677656, 10 деревьев, 
# score = 0.35028523871246453 на 100

In [None]:
pred_rfr_knn = rent_model_03(X_train2, X_test2, y_train2)

In [None]:
print(f"score = {rmsle(np.expm1(y_test2), np.expm1(pred_rfr_knn))}") #

In [None]:
# def rfr_model(X, y):
# # Perform Grid-Search
#     gsc = GridSearchCV(
#         estimator=RandomForestRegressor(),
#         param_grid={
#             'max_depth': range(3,7),
#             'n_estimators': (10, 50, 100, 1000),
#         },
#         cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    
#     grid_result = gsc.fit(X, y)
#     best_params = grid_result.best_params_
    
#     rfr = RandomForestRegressor(max_depth=best_params["max_depth"], n_estimators=best_params["n_estimators"],                               random_state=False, verbose=False)
# # Perform K-Fold CV
#    scores = cross_val_score(rfr, X, y, cv=10, scoring='neg_mean_absolute_error')

#     return scores

# scores = cross_val_score(rfr, X, y, cv=10, scoring='neg_mean_absolute_error')
# predictions = cross_val_predict(rfr, X, y, cv=10)

## Sale

In [None]:
def sale_model_01(X_train, X_test, y_train):
        class SaleKNNImputer(BaseEstimator, TransformerMixin):
            def fit(self, X, y):
                return self
            def transform(self, X):
                cols = [
                    'Количество комнат', 
                    'Общая площадь', 
                    'Жилая площадь', 
                    'Площадь кухни',
                    'Этаж', 
                    'Этажность', 
                    'Тип квартиры', 
                    'Планировка', 
                    'Ремонт', 
                    'Санузел',
                    'Количество балконов', 
                    'Количество лоджий', 
                    'Интернет', 
                    'Торг',
                    'Чистая продажа', 
                    'Ипотека', 
                    'Возможен обмен', 
                    'Внесен задаток',
                    'Находится в залоге', 
                    'Форма собственности', 
                    'Год сдачи', 
                    'Тип дома',
                    'Материал дома', 
                    'Количество фотографий', 
                    'lat_h', 
                    'lon_h', 
                    'dist_center', 
                    'dist_center2'
                ]
                
                return X[cols].copy()

        pipeline = Pipeline([
            ('imputer', SaleKNNImputer()),
            ('scaler', StandardScaler()),
            ('knn', KNeighborsRegressor()),
        ])

        param_grid = {
            'knn__n_neighbors': [5],
            'knn__p': [1]
        }
        
        kfold = KFold(n_splits=3, shuffle=True, random_state=0)
        hyper_search = GridSearchCV(pipeline, param_grid, scoring=rmse, cv=kfold, n_jobs=5, refit=True, verbose=50)
        hyper_search.fit(X_train, y_train)
        model = hyper_search.best_estimator_
        print(f"RentKNNModel: best_params = {hyper_search.best_params_}, best_score = {hyper_search.best_score_}")
        y_pred = model.predict(X_test)
        
        return y_pred

In [None]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X2, y2, random_state=0, test_size=0.5)
X_train3.shape, X_test3.shape, y_train3.shape, y_test3.shape

In [None]:
y_train3 = np.log1p(y_train3)
y_test3 = np.log1p(y_test3)

In [None]:
# pred_knn_sale = sale_model_01(X_train3, X_test3, y_train3)
# pred_knn_sale;

In [None]:
print(f"score = {rmsle(np.expm1(y_test3), np.expm1(pred_knn_sale))}") #score = 0.2634184160979688

In [None]:
pred_knn_sale.shape

In [None]:
def sale_model_02(X_train, X_test, y_train):
        class SaleLRImputer(BaseEstimator, TransformerMixin):
            def fit(self, X, y):
                return self

            def transform(self, X):
                cols = [
                    'Количество комнат', 
                    'Общая площадь', 
                    'Жилая площадь', 
                    'Площадь кухни',
                    'Этаж', 
                    'Этажность', 
                    'Тип квартиры', 
                    'Планировка', 
                    'Ремонт', 
                    'Санузел',
                    'Количество балконов', 
                    'Количество лоджий', 
                    'Интернет', 
                    'Торг',
                    'Чистая продажа', 
                    'Ипотека', 
                    'Возможен обмен', 
                    'Внесен задаток',
                    'Находится в залоге', 
                    'Форма собственности', 
                    'Год сдачи', 
                    'Тип дома',
                    'Материал дома', 
                    'Количество фотографий', 
                    'lat_h', 
                    'lon_h', 
                    'dist_center', 
                    'dist_center2',
                    'dist_center3',
                    'dist_center4',
                    'index_price_perv',
                    'index_price_vtor',
                    'inds'
                ]
                return X[cols].copy()

        pipeline = Pipeline([
            ('imputer', SaleLRImputer()),
            ('scaler', MinMaxScaler()),
            ('lr', Ridge()),
        ])

        param_grid = {
            'lr__alpha': [1, 10, 100, 1000],
        }
        
        hyper_search = GridSearchCV(pipeline, param_grid, scoring=rmse, cv=3, n_jobs=5, refit=True, verbose=2)
        hyper_search.fit(X_train, y_train)
        model = hyper_search.best_estimator_
        print(f"RentLRModel:best_params = {hyper_search.best_params_}, best_score = {hyper_search.best_score_}")
        y_pred = model.predict(X_test)
        return y_pred

In [None]:
X_test_knn3 = X_test3.copy()
X_test_knn3 = pd.concat([X_test_knn3, pd.Series(pred_knn_sale, index=X_test3.index, name='knn')], axis=1, sort=False)

In [None]:
X_test_knn3['knn'] = np.log1p(X_test_knn3['knn'])
y_test_knn3 = y_test3.copy()
#y_test_knn = np.log1p(y_test)

In [None]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(X_test_knn3, y_test_knn3, random_state=0, test_size=0.3)
X_train4.shape, X_test4.shape, y_train4.shape, y_test4.shape

In [None]:
pred_lr_sale = sale_model_02(X_train4, X_test4, y_train4)
pred_lr_sale.shape

In [None]:
print(f"score = {rmsle(np.expm1(y_test4), np.expm1(pred_lr_sale))}") #score = 0.3327222930861954

In [None]:
def sale_model_03(X_train, X_test, y_train):
        class SaleRFRImputer(BaseEstimator, TransformerMixin):
            def fit(self, X, y):
                return self
            def transform(self, X):
                cols = [
                    'Количество комнат', 
                    'Общая площадь', 
                    'Жилая площадь', 
                    'Площадь кухни',
                    'Этаж', 
                    'Этажность', 
                    'Тип квартиры', 
                    'Планировка', 
                    'Ремонт', 
                    'Санузел',
                    'Количество балконов', 
                    'Количество лоджий', 
                    'Интернет', 
                    'Торг',
                    'Чистая продажа', 
                    'Ипотека', 
                    'Возможен обмен', 
                    'Внесен задаток',
                    'Находится в залоге', 
                    'Форма собственности', 
                    'Год сдачи', 
                    'Тип дома',
                    'Материал дома', 
                    'Количество фотографий', 
                    'lat_h', 
                    'lon_h', 
                    'dist_center', 
                    'dist_center2',
                    #'dist_center3',
                    #'dist_center4',
                    'index_price_perv',
                    'index_price_vtor',
                    'inds',
                    #'zp',
                    #'zp2',
                    #'metro',
                    'house_age',
                    #'not_first_not_top',
                    'not_first_floor',
                    'not_top_floor',
                    'has_balcony',
                    'region',
                    'town',
                    'urb'
                ]
                return X[cols].copy()

        pipe = Pipeline([
            ('imputer', SaleRFRImputer()),
            ('rfr', RandomForestRegressor()),
        ])

        param_grid={
            'rfr__max_depth': [25],
            'rfr__n_estimators': [10],
            #'rfr__max_features': ['sqrt'],
            #'rfr__min_samples_split': range(2, 10),
            #'rfr__min_samples_leaf': range(1, 500, 50),
            #'rfr__warm_start': [False, True],
            #'rfr__bootstrap': [False],
        },
        
        kfold = KFold(n_splits=3, shuffle=True, random_state=0)
        hyper_search = GridSearchCV(pipe, param_grid, scoring=rmsle_score, cv=kfold, n_jobs=2, refit=True, verbose=50)
        hyper_search.fit(X_train, y_train)
        model = hyper_search.best_estimator_
        print(f"SaleRFRModel: best_params = {hyper_search.best_params_}, best_score = {hyper_search.best_score_}")
        
#         # посмотрим на важность признаков
#         features = X_train.columns
#         importances = hyper_search.best_estimator_.named_steps['rfr'].feature_importances_
#         indices = np.argsort(importances)
#         plt.title('Feature Importances')
#         plt.barh(range(len(indices)), importances[indices], color='b', align='center')
#         plt.yticks(range(len(indices)), [features[i] for i in indices])
#         plt.xlabel('Relative Importance')
#         plt.show()
        
        prediction = model.predict(X_test)
                
        if 'Цена' in X_test.columns or 'Цена' in X_train.columns: print('\n=======ALERT DATA LEAK!======\n')
        
        return pd.Series(prediction, index=X_test.index, name='Цена')

In [None]:
X_train5, X_test5, y_train5, y_test5 = train_test_split(df_train_sale, df_train_sale_target, random_state=0, test_size=0.3)
X_train5.shape, X_test5.shape, y_train5.shape, y_test5.shape

In [None]:
pred_train_sale_rfr = sale_model_03(X_train5, X_test5, y_train5)
print(pred_train_sale_rfr.shape)
pred_train_sale_rfr

In [None]:
print(f"score = {rmsle(y_test5, pred_train_sale_rfr)}") 

In [None]:
pred_rfr_sale_knn = sale_model_03(X_train4, X_test4, y_train4)

In [None]:
print(f"score = {rmsle(np.expm1(y_test4), np.expm1(pred_rfr_sale_knn))}") #score = 0.3582030741056765

## Test

## Test Data Preprocessing

In [37]:
def clean_test_data(df_input):
    df_output = df_input.copy()

    df_output['Форма собственности'] = df_output['Форма собственности'].replace(
        {'другое (укажите в описании)': 'другое'}) 
    df_output['Материал дома'] = df_output['Материал дома'].replace({'кирпич-монолит':'кирпич - монолит'})
    
    return df_output

In [38]:
df_test_preproc_NA_OUT_ENC = df_test.pipe(fill_nan).pipe(clean_test_data).pipe(feature_encoding)
df_test_preproc_NA_OUT_ENC.shape

(129485, 42)

## Fit Rent

In [39]:
df_test_rent = \
df_test_preproc_NA_OUT_ENC[df_test_preproc_NA_OUT_ENC['Рубрика'] == 'Аренда'].pipe(add_features)
df_test_rent.drop(cols_to_drop_rent, axis=1, inplace=True)
df_test_rent.shape

(46203, 29)

In [40]:
#pred_test_rent_rfr = rent_model_03(df_train_rent, df_test_rent, df_train_rent_target)

In [41]:
model = RandomForestRegressor(n_estimators=100, verbose=2, n_jobs=2)
model.fit(df_train_rent, df_train_rent_target)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   14.8s


building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80

[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   39.2s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                      oob_score=False, random_state=None, verbose=2,
                      warm_start=False)

In [42]:
pred_test_rent_rfr = model.predict(df_test_rent)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.6s finished


In [43]:
pred_test_rent_rfr.shape # best_score = -0.35762453844675574

(46203,)

In [44]:
pred_test_rent_rfr = pd.Series(pred_test_rent_rfr, index=df_test_rent.index, name='Цена')
pred_test_rent_rfr

ID
262893     9074.000000
262995    12460.900000
263094     9213.970000
263139    12264.111111
263255     6110.000000
              ...     
376736    16814.893333
379733    10936.000000
380969    10446.000000
385940    54400.000000
389381    13380.000000
Name: Цена, Length: 46203, dtype: float64

## Fit Sale

In [47]:
df_sale_test = df_test_preproc_NA_OUT_ENC[df_test_preproc_NA_OUT_ENC['Рубрика'] == 'Продажа']
df_sale_test.shape

(83282, 42)

In [48]:
del df_test_preproc_NA_OUT_ENC

In [49]:
df_test_sale = df_sale_test.pipe(add_features)
df_test_sale.drop(cols_to_drop_sale, axis=1, inplace=True)
df_test_sale.shape

(83282, 33)

In [50]:
#pred_test_sale_rfr = sale_model_03(df_train_sale, df_test_sale, df_train_sale_target)

In [51]:
model = RandomForestRegressor(n_estimators=100, verbose=2, n_jobs=2)
model.fit(df_train_sale, df_train_sale_target)
pred_test_sale_rfr = model.predict(df_test_sale)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100building tree 10 of 100

building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   32.5s


building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80

[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.3s finished


In [52]:
pred_test_sale_rfr.shape # best_score = -0.1689683180613796

(83282,)

In [53]:
pred_test_sale_rfr = pd.Series(pred_test_sale_rfr, index=df_test_sale.index, name='Цена')
pred_test_sale_rfr

ID
262894    4.966040e+06
262907    2.355790e+06
262916    3.277580e+06
262933    8.076307e+06
262935    3.885026e+06
              ...     
384173    3.852862e+06
385000    9.176400e+05
385271    3.611681e+06
387660    1.921060e+06
389962    1.167900e+06
Name: Цена, Length: 83282, dtype: float64

In [54]:
del model

## Combining predictions

In [55]:
print(pred_test_rent_rfr.shape)
pred_test_rent_rfr

(46203,)


ID
262893     9074.000000
262995    12460.900000
263094     9213.970000
263139    12264.111111
263255     6110.000000
              ...     
376736    16814.893333
379733    10936.000000
380969    10446.000000
385940    54400.000000
389381    13380.000000
Name: Цена, Length: 46203, dtype: float64

In [56]:
pred_rent = pd.Series(pred_test_rent_rfr, index=df_test_rent.index, name='Цена')
pred_rent.shape
pred_rent

ID
262893     9074.000000
262995    12460.900000
263094     9213.970000
263139    12264.111111
263255     6110.000000
              ...     
376736    16814.893333
379733    10936.000000
380969    10446.000000
385940    54400.000000
389381    13380.000000
Name: Цена, Length: 46203, dtype: float64

In [57]:
print(pred_test_sale_rfr.shape)
pred_test_sale_rfr

(83282,)


ID
262894    4.966040e+06
262907    2.355790e+06
262916    3.277580e+06
262933    8.076307e+06
262935    3.885026e+06
              ...     
384173    3.852862e+06
385000    9.176400e+05
385271    3.611681e+06
387660    1.921060e+06
389962    1.167900e+06
Name: Цена, Length: 83282, dtype: float64

In [58]:
pred_sale = pd.Series(pred_test_sale_rfr, index=df_sale_test.index, name='Цена');
pred_sale.shape
pred_sale

ID
262894    4.966040e+06
262895    2.383010e+06
262896    1.450560e+07
262906    2.404970e+06
262907    2.355790e+06
              ...     
392372    2.227447e+06
392374    1.793154e+06
392375    2.058733e+06
392376    2.118886e+06
392377    2.259131e+06
Name: Цена, Length: 83282, dtype: float64

In [59]:
pred = pd.concat([pred_rent, pred_sale], axis=0).sort_index()
pred.shape
pred

ID
262893    9.074000e+03
262894    4.966040e+06
262895    2.383010e+06
262896    1.450560e+07
262897    9.648000e+03
              ...     
392373    1.340300e+04
392374    1.793154e+06
392375    2.058733e+06
392376    2.118886e+06
392377    2.259131e+06
Name: Цена, Length: 129485, dtype: float64

In [60]:
pd.DataFrame(zip(pred.index.values, pred.values), columns=['ID', 'Цена']).to_csv('pred_01_comp.csv', sep=',', index=False)

In [61]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
