### Подключение библиотек и скриптов

In [47]:
import numpy as np
import pandas as pd
import pickle


from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.spatial.distance import cdist
from mpl_toolkits.mplot3d.axes3d import Axes3D

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score as r2, mean_absolute_error as mae, mean_squared_error as mse
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [48]:
# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

In [49]:
# Set the default matplotlib font size to 14 for all figures.
matplotlib.rcParams.update({'font.size': 14})

In [50]:
def Correct_BadSquare(
    df,
    IndexBadSquare,
    ColumnBadSquare='KitchenSquare',
    percentiles=(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1),
):
    """Replace implausible values of a column with the mean of their ``Square`` bin.

    Rows of ``df`` are binned by total area (``df['Square']``) using
    ``pd.qcut`` with the given quantile edges. For every bin the mean of
    ``ColumnBadSquare`` is computed, and the rows listed in ``IndexBadSquare``
    get their ``ColumnBadSquare`` value overwritten with the mean of the bin
    they fall into.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Square`` column and the column to be corrected.
        It is modified in place.
    IndexBadSquare : index labels
        Labels (anything accepted by ``df.loc``) of the rows to overwrite,
        e.g. ``df[df['KitchenSquare'] > 1000].index``.
    ColumnBadSquare : str
        Name of the column whose bad values are replaced.
    percentiles : sequence of float
        Quantile edges in ``[0, 1]`` passed to ``pd.qcut``.

    Returns
    -------
    None
        The correction is applied to ``df`` directly.

    Notes
    -----
    The bin mean is computed over *all* rows of the bin, including the rows
    that are about to be replaced, so extreme outliers still pull the
    replacement value up.
    """
    # Assign every row to a quantile interval of the total area.
    square_bins = pd.qcut(df['Square'], list(percentiles))

    # Per-row mean of the target column within the row's interval.
    # ``transform`` returns a Series aligned with ``df.index``, so no merge
    # on interval labels is needed and only the target column is averaged.
    bin_means = df.groupby(square_bins, observed=True)[ColumnBadSquare].transform('mean')

    df.loc[IndexBadSquare, ColumnBadSquare] = bin_means.loc[IndexBadSquare]

In [51]:
def reduce_dims_to_2D_space_with_PCA(df):
    """Project ``df`` onto two PCA components.

    A fresh two-component ``PCA`` is fitted on ``df`` and the transformed
    coordinates are returned as a new DataFrame with the columns
    ``component_1`` and ``component_2``.
    """
    column_names = ['component_1', 'component_2']
    projected = PCA(n_components=2).fit_transform(df)
    return pd.DataFrame(projected, columns=column_names)
    
def reduce_dims_to_3D_space_with_PCA(df):
    """Project ``df`` onto three PCA components.

    A fresh three-component ``PCA`` is fitted on ``df`` and the transformed
    coordinates are returned as a new DataFrame with the columns
    ``component_1``, ``component_2`` and ``component_3``.
    """
    column_names = ['component_1', 'component_2', 'component_3']
    projected = PCA(n_components=3).fit_transform(df)
    return pd.DataFrame(projected, columns=column_names)

def reduce_dims_to_2D_space_with_TSNE(df):
    """Embed ``df`` into two dimensions with t-SNE.

    Uses ``learning_rate=250`` and a fixed ``random_state=42``; the embedding
    is returned as a new DataFrame with the columns ``component_1`` and
    ``component_2``.
    """
    embedder = TSNE(n_components=2, learning_rate=250, random_state=42)
    embedding = embedder.fit_transform(df)
    return pd.DataFrame(embedding, columns=['component_1', 'component_2'])
    
def reduce_dims_to_3D_space_with_TSNE(df):
    """Embed ``df`` into three dimensions with t-SNE.

    Uses ``learning_rate=250`` and a fixed ``random_state=42``; the embedding
    is returned as a new DataFrame with the columns ``component_1``,
    ``component_2`` and ``component_3``.
    """
    embedder = TSNE(n_components=3, learning_rate=250, random_state=42)
    embedding = embedder.fit_transform(df)
    return pd.DataFrame(
        embedding,
        columns=['component_1', 'component_2', 'component_3'],
    )

**Пути к директориям и файлам**

In [52]:
PREPARED_FULL_PATH = 'train_full.csv'  # prepared dataset that is read into df_full
MODEL_TRAIN_PATH = 'model.pkl'  # pickle of the model fitted on the training split
MODEL_TRAIN_PATH_FULL = 'model_full.pkl'  # pickle of the model fitted on all labelled rows


### Загрузка данных

**Описание датасета**

* **Id** - идентификационный номер квартиры
* **DistrictId** - идентификационный номер района
* **Rooms** - количество комнат
* **Square** - площадь
* **LifeSquare** - жилая площадь
* **KitchenSquare** - площадь кухни
* **Floor** - этаж
* **HouseFloor** - количество этажей в доме
* **HouseYear** - год постройки дома
* **Ecology_1**, **Ecology_2**, **Ecology_3** - экологические показатели местности
* **Social_1**, **Social_2**, **Social_3** - социальные показатели местности
* **Healthcare_1**, **Helthcare_2** - показатели местности, связанные с охраной здоровья
* **Shops_1**, **Shops_2** - показатели, связанные с наличием магазинов, торговых центров
* **Price** - цена квартиры

# Random Forest

# ФИНАЛ

In [53]:
# Competition page: https://www.kaggle.com/c/realestatepriceprediction/leaderboard
# Read the prepared dataset and index its rows by the "Id" column.
df_full = pd.read_csv(PREPARED_FULL_PATH).set_index("Id")
df_full

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,components_3d_0,components_3d_1,components_3d_2,components_3d_3,components_3d_4,Price
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,33,...,0,1,0,1,0,0,0,0,1,184966.930730
15053,41,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,46,...,0,1,0,1,0,0,0,1,0,300009.450063
4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,...,0,1,0,1,0,0,0,0,1,220925.908524
5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,...,0,1,0,1,1,0,0,0,0,175616.227217
10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,...,0,1,0,1,1,0,0,0,0,150226.531644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8180,11,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.000170,36,...,0,1,0,1,0,0,0,0,1,
4695,1,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,1,...,0,1,0,1,0,0,1,0,0,
5783,12,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,74,...,0,1,0,1,0,1,0,0,0,
4780,62,2.0,81.305222,60.912515,0.0,4,0.0,1977,0.072158,2,...,0,1,1,0,0,0,1,0,0,


In [54]:
# Labelled rows: everything that has a known Price.
X = df_full.dropna(subset=['Price']).copy()
X

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,components_3d_0,components_3d_1,components_3d_2,components_3d_3,components_3d_4,Price
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,33,...,0,1,0,1,0,0,0,0,1,184966.930730
15053,41,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,46,...,0,1,0,1,0,0,0,1,0,300009.450063
4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,...,0,1,0,1,0,0,0,0,1,220925.908524
5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,...,0,1,0,1,1,0,0,0,0,175616.227217
10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,...,0,1,0,1,1,0,0,0,0,150226.531644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,32,2.0,50.401785,30.476203,5.0,6,5.0,1968,0.135650,46,...,0,1,0,1,0,0,0,0,1,196684.316040
6159,18,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.000000,30,...,0,1,1,0,1,0,0,0,0,189050.289571
5123,27,1.0,47.939008,32.711291,1.0,12,16.0,2015,0.072158,2,...,0,1,1,0,0,0,1,0,0,159143.805370
5400,75,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,30,...,1,0,0,1,1,0,0,0,0,181595.339808


In [55]:
# Unlabelled rows (Price is missing) form the test features; the empty
# target column is dropped from them.
XTest = df_full.loc[df_full['Price'].isna()].drop(columns=['Price'])
XTest

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,components_3d_0,components_3d_1,components_3d_2,components_3d_3,components_3d_4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,11,...,1,0,1,0,1,0,0,1,0,0
15856,74,2.0,69.263183,45.348216,1.0,6,1.0,1977,0.075779,6,...,1,0,1,0,1,0,0,1,0,0
5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.000000,30,...,1,0,1,0,1,0,0,0,0,1
15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,23,...,1,0,1,0,1,1,0,0,0,0
14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,2,...,1,0,1,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8180,11,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.000170,36,...,1,0,1,0,1,0,0,0,0,1
4695,1,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,1,...,1,0,1,0,1,0,0,1,0,0
5783,12,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,74,...,1,0,1,0,1,0,1,0,0,0
4780,62,2.0,81.305222,60.912515,0.0,4,0.0,1977,0.072158,2,...,1,0,1,1,0,0,0,1,0,0


In [56]:
# Column labels of the labelled frame (features plus 'Price' at this point).
columns=X.columns
columns

Index(['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor',
       'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Ecology_2_A', 'Ecology_2_B',
       'Ecology_3_A', 'Ecology_3_B', 'Shops_2_A', 'Shops_2_B',
       'components_3d_0', 'components_3d_1', 'components_3d_2',
       'components_3d_3', 'components_3d_4', 'Price'],
      dtype='object')

In [57]:
# Target vector: the 'Price' column of the labelled rows.
y=X['Price']
y

Id
14038    184966.930730
15053    300009.450063
4765     220925.908524
5809     175616.227217
10783    150226.531644
             ...      
77       196684.316040
6159     189050.289571
5123     159143.805370
5400     181595.339808
6306     218714.077615
Name: Price, Length: 10000, dtype: float64

In [58]:
# Remove the target column so that X holds features only.
X.drop(['Price'], axis=1, inplace=True)
X

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,components_3d_0,components_3d_1,components_3d_2,components_3d_3,components_3d_4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,33,...,1,0,1,0,1,0,0,0,0,1
15053,41,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,46,...,1,0,1,0,1,0,0,0,1,0
4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,34,...,1,0,1,0,1,0,0,0,0,1
5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,23,...,1,0,1,0,1,1,0,0,0,0
10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,35,...,1,0,1,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,32,2.0,50.401785,30.476203,5.0,6,5.0,1968,0.135650,46,...,1,0,1,0,1,0,0,0,0,1
6159,18,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.000000,30,...,1,0,1,1,0,1,0,0,0,0
5123,27,1.0,47.939008,32.711291,1.0,12,16.0,2015,0.072158,2,...,1,0,1,1,0,0,0,1,0,0
5400,75,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,30,...,1,1,0,0,1,1,0,0,0,0


In [59]:
# Split the labelled data into a training part (70%) and a validation part
# (30%) for the Price model; the fixed random_state keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=50)
X_train

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,components_3d_0,components_3d_1,components_3d_2,components_3d_3,components_3d_4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11575,58,1.0,43.230274,29.173209,1.0,13,17.0,1977,0.437885,23,...,1,0,1,0,1,1,0,0,0,0
12280,101,1.0,31.857473,22.659669,5.0,3,9.0,1966,0.225825,41,...,0,0,1,0,1,0,0,0,0,1
5821,58,1.0,44.063070,29.173209,1.0,3,1.0,1977,0.437885,23,...,1,0,1,0,1,1,0,0,0,0
7872,48,1.0,41.812188,21.079607,8.0,2,12.0,1981,0.041125,46,...,1,0,1,0,1,0,0,0,1,0
9552,23,1.0,105.432181,106.178175,0.0,3,2.0,1977,0.014073,2,...,1,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15186,27,1.0,40.422035,40.262914,1.0,8,17.0,2016,0.011654,4,...,1,0,1,0,1,0,0,1,0,0
15244,31,3.0,50.914858,39.397964,4.0,4,4.0,1960,0.151346,32,...,1,1,0,0,1,1,0,0,0,0
2401,30,2.0,66.144841,45.348216,1.0,8,17.0,1977,0.000078,22,...,1,0,1,0,1,0,0,0,0,1
10282,19,1.0,42.674884,21.245851,10.0,4,16.0,1978,0.309479,35,...,1,0,1,0,1,0,0,0,0,1


In [60]:
# RandomForestRegressor is also imported at the top of the notebook.
from sklearn.ensemble import RandomForestRegressor
# Hyper-parameter grid for the random forest: 3 x 6 x 4 = 72 combinations.
parameters = {
    'n_estimators': [100, 150, 200],
    'max_features': np.arange(3, 9),  # 3..8
    'max_depth': np.arange(8, 12),  # 8..11
}

# 5-fold cross-validated grid search scored by R^2.
# NOTE(review): the search object is only constructed and displayed here; no
# .fit call is visible in this notebook, and the base estimator has no
# random_state, so a search run would not be reproducible as written.
model = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=parameters,
    scoring='r2',
    cv=5,
)
model

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [61]:
# Fit a random forest on the training split and report R^2 on that same split.
# Previously noted values (presumably train R^2 from earlier runs -- confirm):
# 0.8685995690298893  -vadim, 0.8685995690298893-artem
model = RandomForestRegressor(max_depth=11, n_estimators=100, max_features=8, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
r2_score(y_train, y_pred)

0.8678484258119451

In [62]:
# R^2 of the current model on the held-out validation split.
# Previously noted values (presumably validation R^2 from earlier runs -- confirm):
# 0.7488109805512833 -vadim, 0.7488453749310405 - artem   , 0.7491432139948219 -vadim +LS
y_valid_pred = model.predict(X_valid)
r2_score(y_valid, y_valid_pred)

0.7457147118869492

In [63]:
# Save the current model (fitted on the training split) to MODEL_TRAIN_PATH.
with open(MODEL_TRAIN_PATH, 'wb') as file:
    pickle.dump(model, file)

In [64]:
# Refit the same forest configuration on all labelled rows and report R^2 on
# the data it was fitted on (not a held-out estimate).
# NOTE(review): random_state is 50 here but 0 for the train-split model above.
# Previously noted value: full 3 comp  0.8609185713105426
model = RandomForestRegressor(max_depth=11, n_estimators=100, max_features=8, random_state=50)
model.fit(X, y)
y_pred_full = model.predict(X)
r2_score(y, y_pred_full)

0.8600110684359594

In [65]:
# Save the current model (fitted on all labelled rows) to MODEL_TRAIN_PATH_FULL.
with open(MODEL_TRAIN_PATH_FULL, 'wb') as file:
    pickle.dump(model, file)

# Предсказание цен для тестовой выборки

In [66]:
# Display the test feature matrix.
XTest

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B,components_3d_0,components_3d_1,components_3d_2,components_3d_3,components_3d_4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,11,...,1,0,1,0,1,0,0,1,0,0
15856,74,2.0,69.263183,45.348216,1.0,6,1.0,1977,0.075779,6,...,1,0,1,0,1,0,0,1,0,0
5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.000000,30,...,1,0,1,0,1,0,0,0,0,1
15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,23,...,1,0,1,0,1,1,0,0,0,0
14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,2,...,1,0,1,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8180,11,3.0,67.133911,50.809797,6.0,5,9.0,1973,0.000170,36,...,1,0,1,0,1,0,0,0,0,1
4695,1,1.0,40.198472,21.807061,10.0,12,17.0,2017,0.007122,1,...,1,0,1,0,1,0,0,1,0,0
5783,12,3.0,77.842178,48.282625,9.0,23,22.0,1989,0.090799,74,...,1,0,1,0,1,0,1,0,0,0
4780,62,2.0,81.305222,60.912515,0.0,4,0.0,1977,0.072158,2,...,1,0,1,1,0,0,0,1,0,0


In [67]:
# Load the previously saved full-data model back from disk.
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this notebook.
with open(MODEL_TRAIN_PATH_FULL, 'rb') as file:
    model=pickle.load(file)

In [68]:

# Predict prices for the unlabelled (test) rows with the loaded model.
Price_pred = model.predict(XTest)
Price_pred

array([171306.45898637, 218042.32348921, 209530.82788817, ...,
       328917.32342792, 202869.6242043 , 177346.30555072])

In [69]:
# Pair every test Id (the index of XTest) with its predicted price.
pd_Price=pd.DataFrame({'Id': XTest.index, 'Price':Price_pred})
pd_Price

Unnamed: 0,Id,Price
0,725,171306.458986
1,15856,218042.323489
2,5480,209530.827888
3,15664,344546.904292
4,14275,137736.821044
...,...,...
4995,8180,246870.091452
4996,4695,133828.695564
4997,5783,328917.323428
4998,4780,202869.624204


In [70]:
# Summary statistics of the Id and Price columns of the predictions.
pd_Price.describe()

Unnamed: 0,Id,Price
count,5000.0,5000.0
mean,8412.5954,215743.681456
std,4832.674037,75359.35287
min,1.0,66787.367852
25%,4221.75,167734.463019
50%,8320.5,196432.117124
75%,12598.25,250503.740778
max,16795.0,559676.339647


In [71]:
# Write the predictions (Id, Price) to a CSV file, without the positional index.
pd_Price.to_csv('AKolyvanov_predictions_20200825-3.csv', index=False, encoding='utf-8')

In [74]:

# Read the '-3' predictions file back from disk.
a1=pd.read_csv('AKolyvanov_predictions_20200825-3.csv')

In [75]:
# Read the '-2' predictions file (presumably from an earlier run -- confirm) for comparison.
a0=pd.read_csv('AKolyvanov_predictions_20200825-2.csv')

In [76]:
# R^2 between the Price columns of the two prediction files, compared row by
# row; a value of 1.0 means the two columns are identical.
r2_score(a1['Price'], a0['Price'])

1.0