In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import r2_score as r2, accuracy_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# Input and output file locations (relative paths).
DATASET_PATH_TRAIN = 'train.csv'
DATASET_PATH_TEST = 'test.csv'
# Destination of the CSV with the predicted prices written at the end of the notebook.
PREPARED_DATASET_PATH = 'VSerduykov_predictions.csv'

# Read the training and the test tables, in that order.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (DATASET_PATH_TRAIN, DATASET_PATH_TEST)
)

In [2]:
# функция заменяет значения в котегориальных признаках
def test_job(df_tests):
    df_tests.replace({'Ecology_2':{'A':0, 'B':1}, 'Shops_2':{'A':0, 'B':1}, 'Ecology_3':{'A':0, 'B':1}}, inplace=True)
    return df_tests

In [3]:
def fill_Life_Square(df_train, living_share=0.85):
    """Repair missing or implausible 'LifeSquare' values.

    A row is repaired when 'LifeSquare' is NaN or when it is larger than
    the total area 'Square'. The replacement value is
    ``(Square - KitchenSquare) * living_share``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame with the columns 'Square', 'LifeSquare' and 'KitchenSquare'.
        It is modified in place.
    living_share : float, default 0.85
        Fraction of the non-kitchen area taken as living area.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    # Build the row selector once; it is used for both sides of the assignment.
    needs_fill = df_train['LifeSquare'].isna() | (df_train['Square'] < df_train['LifeSquare'])

    # NOTE: the estimate is negative when KitchenSquare exceeds Square;
    # such rows are not clipped here.
    estimate = (df_train.loc[needs_fill, 'Square']
                - df_train.loc[needs_fill, 'KitchenSquare']) * living_share

    df_train.loc[needs_fill, 'LifeSquare'] = estimate
    return df_train

In [4]:
# Encode the 'A'/'B' categorical columns of the training frame as 0/1.
df_train = df_train.pipe(test_job)

In [5]:
# Fill in missing / oversized LifeSquare values in the training frame.
df_train = df_train.pipe(fill_Life_Square)

In [6]:
# Hold out 20% of the training rows for validation.
# 'Id' and 'Healthcare_1' are excluded from the features; 'Price' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['Id', 'Healthcare_1', 'Price']),
    df_train['Price'],
    random_state=22,
    test_size=0.2,
)

In [7]:
# Random forest regressor, seeded with random_state=55.
model_RFR = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=55,
)

In [8]:
# Fit on the training split, predict the hold-out split and show its R2 score.
predicts = model_RFR.fit(X_train, y_train).predict(X_test)
r2(y_true=y_test, y_pred=predicts)

0.7102740943337091

In [9]:
# Refit on the whole training set and write the submission file.
# Columns that are not used as model inputs in this cell.
# NOTE(review): the hold-out R2 in cell 6 was measured with 'Shops_2',
# 'Ecology_3' and 'Ecology_2' still included as features, so that score does
# not describe exactly this model — confirm which feature set is intended.
NON_FEATURE_COLUMNS = ['Id', 'Healthcare_1', 'Price', 'Shops_2', 'Ecology_3', 'Ecology_2']

# One explicit list of feature names, shared by fit and predict, so the test
# features match the training features by name rather than by position.
feature_columns = [col for col in df_train.columns if col not in NON_FEATURE_COLUMNS]

model_RFR.fit(df_train[feature_columns], df_train['Price'])

# Apply the same preprocessing to the test frame as to the training frame.
df_test = test_job(df_test)
df_test = fill_Life_Square(df_test)

# Selecting by name also keeps this cell re-runnable: the 'Price' column added
# below is never fed back into predict().
rez = model_RFR.predict(df_test[feature_columns])
df_test['Price'] = rez
df_test[['Id', 'Price']].to_csv(PREPARED_DATASET_PATH, index=False)