#The goal of the task is to develop a service that will predict the cost of houses based on the history of offers

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 100)
import re

import os
# List the files shipped with the dataset.
# os.walk() only descends into directories; pointing it at the CSV file itself
# yields nothing, so walk the folder that contains data.csv instead.
for dirname, _, filenames in os.walk('/kaggle/input/home_prices'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import sys
from ast import literal_eval
import warnings
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import skew
from scipy import stats
from scipy.stats.stats import pearsonr
from scipy.stats import norm
from collections import Counter


RANDOM_SEED = 42

#Dataset preparing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
#ML tools import 
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV

import xgboost
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import optuna
%matplotlib inline

#import tools for metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

import lightgbm as lgm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

#import tools for neural network
import keras 
from keras import models as M
from keras import layers as L
from keras import backend as K
from keras import optimizers
from keras import initializers

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
#Functions list

#column info review
def info_rewiev(col):
    """Print a textual summary of a pandas Series.

    The report contains, in order: the number of missing values, ``describe()``,
    the value distribution (NaN included), the unique values and the number of
    unique values (NaN included). Every entry is followed by an empty line.
    """
    report = (
        ('Количество пропусков', col.isna().sum()),
        ('Описание', col.describe()),
        ('Распределение', col.value_counts(dropna=False)),
        ('Значения', col.unique()),
        ('Уникальные значения', col.nunique(dropna=False)),
    )
    for label, value in report:
        print('{}:{}'.format(label, value))
        print()
    
#for visualisation     
def visualisation(col):
    """Draw a bar chart of the value counts of ``col`` and label every bar with its count."""
    plt.figure(figsize=(15,6))
    counts = col.value_counts()
    counts.plot(kind='bar', color='blue')
    # Bars are drawn in value_counts() order, so the bar position equals the enumeration index.
    for position, count in enumerate(counts):
        plt.text(position, count, int(count), horizontalalignment='left', verticalalignment='bottom', fontdict={'size':12})
    plt.show()

#history info
def hist_info(col):
    """Print the textual summary of ``col`` and then show a count plot of its values.

    The printed report is exactly the one produced by ``info_rewiev``; the plot
    is a seaborn count plot with vertical tick labels.
    """
    info_rewiev(col)
    plt.figure(figsize=(15,6))
    sns.countplot(x=col, data=data)
    plt.xticks(rotation = 'vertical')
    plt.show()
    
#metrics output
def rmse(y_true, y_pred):
    """Root mean squared error between two array-likes that support element-wise arithmetic."""
    squared_error = (y_true - y_pred) ** 2
    return np.sqrt(squared_error.mean())

def mape(y_true, y_pred):
    """Mean absolute percentage error (in percent); inputs are converted to NumPy arrays first."""
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

#Dataset preview

In [None]:
data = pd.read_csv('/kaggle/input/home_prices/data.csv')
data.head()

In [None]:
data.columns

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.isna().sum()

In [None]:
bad_columns = []
bad_columns += ['private pool','fireplace','baths','stories','mls-id','PrivatePool']

In [None]:
data_dict = {}

#EDA (Status)

In [None]:
data.drop_duplicates(inplace=True)
data.drop(data[data.target.isna()].index, inplace=True)
data.shape

In [None]:
data.info('status')

In [None]:
info_rewiev(data['status'])

In [None]:
data['status'] = data['status'].apply(lambda x: str(x).lower())
data['status'].value_counts().head(40)

In [None]:
#Status modification function
def change_status(status):
    """Collapse a lower-cased listing status into a coarse group.

    An exact 'for sale' is kept; otherwise the first matching keyword wins
    (active, foreclosure, construction, pending); the stringified NaN becomes
    'no info'; any other status is returned unchanged.
    """
    if status == 'for sale':
        return status
    keyword_groups = (
        ('active', 'active'),
        ('foreclosure', 'pre foreclosure'),
        ('construction', 'new construction'),
        ('pending', 'pending'),
    )
    for keyword, group in keyword_groups:
        if keyword in status:
            return group
    return 'no info' if status == 'nan' else status
    
# Map every raw status to its coarse group. The helper defined in this cell is
# change_status; the previous call used the undefined name changed_status and
# raised NameError on a fresh kernel.
data['status_rating'] = data['status'].apply(change_status)
data['status_rating'].value_counts().head(10)


In [None]:
statuses = list(data['status_rating'].value_counts()[:7].keys())
statuses

In [None]:
def mod_status(status):
    """Keep a status that is listed in the global ``statuses``; everything else becomes 'Other'."""
    return status if status in statuses else 'Other'
    
data['status_rating'] = data['status_rating'].apply(lambda x: mod_status(x))   

In [None]:
info_rewiev(data['status_rating'])

In [None]:
visualisation(data['status_rating'])


In [None]:
data_dict['status'] = 'Обработали'
data_dict['status_rating'] = 'Переводим в категории'

#EDA Private pool.

In [None]:
# show_feature_info is not defined in this notebook; info_rewiev is the summary helper declared above.
info_rewiev(data['private pool'])

In [None]:
#too many missing values
data_dict['private pool'] = 'удалено'

#EDA propertyType

In [None]:
# show_feature_info is not defined in this notebook; info_rewiev is the summary helper declared above.
info_rewiev(data['propertyType'])

In [None]:
data['propertyType'].value_counts(dropna=False).head(50)

In [None]:
#make big letters small

data['propertyType'] = data['propertyType'].apply(lambda x: str(x).lower())

def change_property(prop):
    """Group a lower-cased property type into condo / land / family / no info / Other."""
    if prop in ('condo', 'land'):
        return prop
    if 'family' in prop:
        return 'family'
    # 'nan' is the stringified missing value.
    return 'no info' if prop == 'nan' else 'Other'
    
data['property_rating'] = data['propertyType'].apply(lambda x: change_property(x))
data.head()

In [None]:
hist_info(data['property_rating'])

In [None]:
visualisation(data['property_rating'])

In [None]:
data_dict['propertyType'] = 'Обработано'
data_dict['property_rating'] = 'Переводим в категории'

In [None]:
#Create new column for counting words in original column
def _count_descr_tokens(value):
    """Count the tokens of a property description after turning '/', ', ', '-' and ' ' into commas."""
    normalised = str(value).replace('/', ',').replace(', ', ',').replace('-', ',').replace(' ',',')
    return len(normalised.split(','))

data['number_prop_descr'] = data['propertyType'].apply(_count_descr_tokens)

In [None]:
data['number_prop_descr'].value_counts()

In [None]:
#function that returns the category to which we assign the value of the string
def number_prop_to_cat(prop):
    """Turn a token count into a category label: '1', '2', '3' or '4 or more' for anything else."""
    labels = {3: '3', 2: '2', 1: '1'}
    return labels.get(prop, '4 or more')
    
data['number_prop_descr'] = data['number_prop_descr'].apply(lambda x: number_prop_to_cat(x))

In [None]:
hist_info(data['number_prop_descr'])

In [None]:
visualisation(data['number_prop_descr'])

In [None]:
data_dict['number_prop_descr'] = 'Обработали'

#EDA street

In [None]:
# show_feature_info is not defined in this notebook; info_rewiev is the summary helper declared above.
info_rewiev(data['street'])

In [None]:
data['street'].value_counts().head(30)

In [None]:
# Fill missing streets with the most frequent street value.
# value_counts().head(1) is a Series indexed by the street name; fillna() with a
# Series aligns on the row index, so the previous call filled nothing.
# Passing the scalar label performs the intended fill.
data['street'] = data['street'].fillna(data['street'].value_counts().index[0])

In [None]:
data['street'] = data['street'].apply(lambda x: str(x).lower())

In [None]:
def av_or_not(st):
    """Replace the known 'address hidden' spellings with 'Unavailable ADD'; keep any other street as is."""
    hidden_address_markers = (
        'address not disclosed',
        'undisclosed address',
        '(undisclosed address)',
        'address not available',
        'unknown address',
    )
    if st in hidden_address_markers:
        return 'Unavailable ADD'
    return st
data['street'] = data['street'].apply(lambda x: av_or_not(x))

In [None]:
data['street_t'] = data['street'].apply(lambda x: str(x).split(' ')[-1:])

In [None]:
data['street_t'] = data['street_t'].apply(lambda x: str(x).replace('[','').replace(']',''))

In [None]:
data['street_t'].value_counts().head(30)

In [None]:

# Street suffix (already stripped of quotes and lower-cased) -> street category.
_STREET_SUFFIX_TO_TYPE = {
    'st': 'street', 'street': 'street',
    'ave': 'avenue', 'avenue': 'avenue',
    'rd': 'road', 'road': 'road',
    'lane': 'lane', 'ln': 'lane',
    'dr': 'drive', 'drive': 'drive',
    'blvd': 'boulevard', 'boulevard': 'boulevard',
    'trail': 'trail', 'trl': 'trail', 'tr': 'trail',
    'way': 'way', 'hwy': 'way',
    'cir': 'circle', 'circle': 'circle',
    'ct': 'court', 'court': 'court',
    'pl': 'place', 'place': 'place',
    # last word of the 'Unavailable ADD' placeholder written by av_or_not
    'add': 'anavail',
}


def street_type(s):
    """Map the last word of a street (as stored in ``street_t``) to a street category.

    ``s`` is the string form of a one-element list with the brackets removed,
    e.g. ``"'st'"``. The surrounding quotes are stripped and the comparison is
    case-insensitive, so the ``'ADD'`` suffix of the 'Unavailable ADD'
    placeholder is recognised (it was compared against lower-case ``'add'``
    before and always fell through to 'other').

    Returns 'unknown' for an empty suffix, the mapped category for a known
    suffix, and 'other' for anything else.
    """
    suffix = str(s).strip().strip("'\"").lower()
    if suffix in ('', '[]'):
        return 'unknown'
    return _STREET_SUFFIX_TO_TYPE.get(suffix, 'other')

data['street_type'] = data['street_t'].apply(lambda x: street_type(x))
data.head()

In [None]:
hist_info(data['street_type'])

In [None]:
visualisation(data['street_type'])

In [None]:
data_dict['street'] = ' Обработано' 
data_dict['street_type'] = 'Переводим в категории ' 

#EDA baths

In [None]:
# show_feature_info is not defined in this notebook; info_rewiev is the summary helper declared above.
info_rewiev(data['baths'])

In [None]:
data['baths'] = data['baths'].apply(lambda x: str(x).lower())
data['baths'] = data['baths'].apply(lambda x: str(x).replace('baths','').replace('bathrooms','').replace('ba',''))

In [None]:
# show_feature_info is not defined in this notebook; info_rewiev is the summary helper declared above.
info_rewiev(data['baths'])

In [None]:
#rows format correction
data['baths'] = data['baths'].apply(lambda x: x.replace(',','.'))
data['baths'] = data['baths'].apply(lambda x: str(x).replace(
    '+', '').replace('~', '2').replace('-- ', '2').replace(': ', '').replace(' ', ''))
data['baths'] = data['baths'].apply(lambda x: str(x).replace('semimod','2').replace('sq.ft','2').replace('—','2'))
data['baths'] = data['baths'].apply(lambda x: str(x).replace(
    '2-1/2-1/1-1/1-1','4').replace('1/1-0/1-0/1-0','4').replace('0/0','0').replace(
    '1-0/1-0/1','3').replace('3-1/2-2','3').replace('116/116/116','116').replace('1/1/1/1','4').replace('1-2','2'))

In [None]:
visualisation(data['baths'])

In [None]:
# Convert the cleaned bath strings to numbers and fill the gaps with 2.
# Missing values were stringified to 'nan' earlier, so fillna('2') before the
# cast was a no-op and the cast turned them back into NaN. Converting first and
# filling afterwards applies the intended default; strings that still cannot be
# parsed are treated as missing instead of aborting the cell.
data['baths'] = pd.to_numeric(data['baths'], errors='coerce').fillna(2.0).astype('float')

In [None]:
data['baths'].describe()

In [None]:
# Quartiles of the bath count, truncated to whole numbers.
bath_25, bath_50, bath_75 = (int(data.baths.quantile(q)) for q in (0.25, 0.50, 0.75))

print('25 квантиль:',bath_25)
print('50 квантиль:', bath_50)
print('75 квантиль:', bath_75)

In [None]:
def bath_to_category(x):
    """Bucket a bath count against the global quartiles ``bath_25`` and ``bath_75``.

    Returns '<2' below the lower quartile, '2' inside the inter-quartile range
    (bounds included) and '>2' otherwise (NaN also lands here).
    """
    if x < bath_25:
        return '<2'
    if bath_25 <= x <= bath_75:
        return '2'
    return '>2'

data['number_bath'] = data['baths'].apply(lambda x: bath_to_category(x))

In [None]:
hist_info(data['number_bath'])

In [None]:
info_rewiev(data['number_bath'])

In [None]:
data_dict['baths'] = 'Обработано'
data_dict['number_bath'] = 'Переводим в категории'

#EDA homeFacts

In [None]:
info_rewiev(data['homeFacts'])

In [None]:
# Peek at one parsed homeFacts record. Rows were dropped earlier
# (duplicates / missing target), so the index label 1 is not guaranteed to
# exist; take the second remaining row by position instead.
homefact_dict = literal_eval(data['homeFacts'].iloc[1])
homefact_dict

In [None]:
def hf(x):
    """Parse a homeFacts literal and return the first fact's ``factValue`` as a string.

    Returns ``np.nan`` when that value is missing or empty.
    """
    facts = literal_eval(x)['atAGlanceFacts']
    first_value = facts[0].get('factValue')
    return str(first_value) if first_value else np.nan
    
year = list(data[:].homeFacts.apply(hf))
year[:10]
data['year_built'] = year 

In [None]:
hist_info(data['year_built'])

In [None]:
# Most frequent raw year value, used as the replacement for the sentinels below.
# It is computed once here; previously describe() was re-run over the whole
# column for every row that held a sentinel.
most_common_year = data['year_built'].describe().top

def yr(year):
    """Replace the garbage value '559990649990' and 'No Data' with the most frequent year; keep anything else."""
    if year == '559990649990' or year == 'No Data':
        return most_common_year
    return year

data['year_built'] = data['year_built'].apply(yr).astype(float)

In [None]:
info_rewiev(data['year_built'])

In [None]:
# Quartiles of the construction year, truncated to whole years.
year_25, year_50, year_75 = (int(data.year_built.quantile(q)) for q in (0.25, 0.50, 0.75))

print('25 квантиль:',year_25)
print('50 квантиль:', year_50)
print('75 квантиль:', year_75)

In [None]:
def age_house(x):
    """Bucket a construction year against the global quartiles ``year_25`` / ``year_50`` / ``year_75``.

    Values that match no range (e.g. NaN) are labelled 'No info'.
    """
    if x < year_25:
        return 'древние дома'
    if year_25 <= x <= year_50:
        return 'относительно старые дома'
    if year_50 < x <= year_75:
        return 'современные дома'
    if x > year_75:
        return 'новостройки'
    return 'No info'
    
data['house_status'] = data['year_built'].apply(age_house)

In [None]:
hist_info(data['house_status'])

In [None]:
visualisation(data['house_status'])

In [None]:
# Record the raw column and the feature derived from it under separate keys;
# previously both notes were written to 'homeFacts', so the first was lost.
data_dict['homeFacts'] = 'Обработали'
data_dict['house_status'] = 'Переводим в категории'

#EDA fireplace

In [None]:
info_rewiev(data['fireplace'])

In [None]:
data['fireplace'] = data['fireplace'].apply(lambda x: str(x).lower())

In [None]:
data['fireplace_grad'] = data['fireplace'].apply(lambda x: len(str(x).split(',')))
data['fireplace_grad'].value_counts()

In [None]:
info_rewiev(data['fireplace_grad'])

In [None]:
# make a dictionary
prior = ['1','2','3','4','5','6','one','two','three','four','five','six']

def fireplace(grad):
    """Group a lower-cased fireplace description.

    'yes' -> 'available'; a bare count listed in ``prior`` -> 'numeric_quantity';
    otherwise the first keyword found in the text decides the group, and
    anything without a known keyword is 'other'.
    """
    if grad == 'yes':
        return 'available'
    if grad in prior:
        return 'numeric_quantity'
    # Order matters: the first keyword contained in the text wins.
    keyword_groups = (
        ('gas', 'gas'),
        ('fireplace', 'fireplace'),
        ('wood', 'wood'),
        ('ceiling', 'ceiling'),
        ('living', 'room'),
        ('dining', 'room'),
        ('kitchen', 'room'),
        ('room', 'room'),
    )
    for keyword, group in keyword_groups:
        if keyword in grad:
            return group
    return 'other'
    
data['fireplace'] = data['fireplace'].apply(lambda x: fireplace(x))
data.head()

In [None]:
hist_info(data['fireplace'])

In [None]:
visualisation(data['fireplace'])

In [None]:
# Keep the three dominant groups and merge the rest into 'something else'.
data['fireplace'] = data['fireplace'].apply(lambda x: x if x in ['other', 'available', 'numeric_quantity'] else 'something else')
# show_plot is not defined in this notebook; visualisation is the bar-chart helper declared above.
visualisation(data['fireplace'])

In [None]:
data_dict['fireplace_grad'] = 'Обработали'
data_dict['fireplace'] = 'Переводим в категории'

#EDA city

In [None]:
info_rewiev(data['city'])

In [None]:
data['city'].value_counts().head(34)

In [None]:
#Let's make a grouping into cities with more ads and less
list_city=list(data['city'].value_counts()[:5].keys())
un_list_city=list(data['city'].value_counts()[5:30].keys())
def city_grad(c_size):
    """Rank a city by listing volume using the global ``list_city`` and ``un_list_city`` lists.

    Returns 'list_city', 'un_list_city' or, for every other city, 'min_list_city'.
    """
    if c_size in list_city:
        return 'list_city'
    return 'un_list_city' if c_size in un_list_city else 'min_list_city'
    
data['city_rating'] = data['city'].apply(lambda x: city_grad(x))
data.head()

In [None]:
hist_info(data['city_rating'])

In [None]:
visualisation(data['city_rating'])

In [None]:
data_dict['city'] = 'Обработано'
data_dict['city_rating'] = 'Переводим в категории'

In [None]:
data.info()

#EDA school

In [None]:
info_rewiev(data['schools'])

In [None]:
# Peek at one parsed schools record. Rows were dropped earlier, so the index
# label 0 is not guaranteed to exist; take the first remaining row by position.
school_dict = literal_eval(data['schools'].iloc[0])
school_dict

In [None]:
def sc(x):
    """Parse a schools literal and return the 'Distance' entry of its first record.

    Returns the placeholder 'bla' when that entry is missing or empty.
    """
    first_record = literal_eval(x)[0]
    distances = first_record['data'].get('Distance')
    return distances if distances else 'bla'
    
list_scool_dict = list(data[:].schools.apply(sc))
list_scool_dict[:10]

In [None]:
data['distance'] = list_scool_dict

In [None]:
len(data['distance'])

In [None]:
mean_distance=[]

In [None]:
#Calculate the average distance to school for families with children
def _mean_school_distance(distances):
    """Average the distances of one listing; entries look like '1.2 mi' and are parsed as floats."""
    miles = [float(str(d).replace('mi','').replace('[','').replace(']','')) for d in distances]
    return sum(miles) / len(miles)

# The previous loop iterated over an undefined name (`l`); the bare `except`
# hid the NameError, so every row ended up as 'no info'. It also indexed
# data['distance'] by label, which breaks once rows have been dropped.
# Iterate over the values directly and only treat parse failures as 'no info'.
mean_distance = []  # rebuilt from scratch so re-running the cell does not append twice
for path in data['distance']:
    try:
        mean_distance.append(_mean_school_distance(path))
    except (TypeError, ValueError, ZeroDivisionError):
        # non-iterable value, unparsable entry (e.g. the 'bla' placeholder) or empty list
        mean_distance.append('no info')

In [None]:
len(mean_distance)

In [None]:
# Store the mean distance; rows without information get the default of 1.0 mile.
data['distance_upd'] = mean_distance
data['distance_upd'] = data['distance_upd'].apply(lambda x: str(x).replace('no info', '1.0')).astype(float)
data['distance_upd'] = data['distance_upd'].apply(lambda x: round(x,2))
# show_feature_info is not defined in this notebook; info_rewiev is the summary helper declared above.
info_rewiev(data['distance_upd'])

In [None]:
# Quartiles of the mean distance to school.
# distance_upd is rounded to two decimals, so the quartiles are kept as floats:
# truncating them with int() collapses fractional thresholds (e.g. 0.6 -> 0)
# and they stop separating the data into quarters.
dist_25 = float(data.distance_upd.quantile(25/100))
dist_50 = float(data.distance_upd.quantile(50/100))
dist_75 = float(data.distance_upd.quantile(75/100))

print('25 квантиль:', dist_25)
print('50 квантиль:', dist_50)
print('75 квантиль:', dist_75)

In [None]:
# add a sign of distance to school
def path_schol(x):
    """Label a mean school distance against the global quartiles ``dist_25`` / ``dist_50`` / ``dist_75``.

    The first matching range wins; values above ``dist_75`` (and NaN) get the last label.
    """
    if x < dist_25:
        return 'убираем'
    if dist_25 <= x <= dist_50:
        return 'подходит'
    if dist_50 <= x <= dist_75:
        return 'далеко'
    return 'очень далеко, не подходит!'
    
data['path_school'] = data['distance_upd'].apply(path_schol)
# show_feature_info_hist is not defined in this notebook; hist_info is the
# summary-plus-count-plot helper declared above.
hist_info(data['path_school'])

In [None]:
visualisation(data['path_school'])

#EDA sqft

In [None]:
info_rewiev(data['sqft'])

In [None]:
def _normalise_sqft_text(value):
    """Strip the textual decoration from a raw sqft value; the replacements run in the listed order."""
    text = str(value)
    replacements = (
        ('Total interior livable area: ', ''),
        (' sqft ', ''),
        ('sqft', ''),
        (' ', ''),
        ('--', 'nan'),
        ('-', ','),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

data['sqft'] = data['sqft'].apply(_normalise_sqft_text)

In [None]:
data['sqft'] = data['sqft'].apply(lambda x: str(x).replace('nan', '1000'))

def null(x):
    """Replace a literal '0' area with the default '1000'; return any other value unchanged."""
    return '1000' if x == '0' else x
    
data['sqft'] = data['sqft'].apply(null)   

In [None]:
def create_sqft(x):
    """Drop every non-digit character from ``x`` and return the remaining digits as an int."""
    digits_only = re.sub('[^0-9]', '', x)
    return int(digits_only)

data['sqft'] = data['sqft'].apply(create_sqft)
data['sqft'].describe()

In [None]:
info_rewiev(data['sqft'])

In [None]:
data['sqft'] = data['sqft'].astype('float')

In [None]:
np.log(data['sqft']+1).hist(bins=100)

In [None]:
data['sqft'] = data['sqft'].apply(lambda x: np.log(x+1))

In [None]:
info_rewiev(data['sqft'])

In [None]:
data_dict['sqft'] = 'Обработан'

In [None]:
# create a sign how many schools around
# Number of schools per listing = number of distance entries.
# Listings without school data carry the string placeholder returned by sc();
# taking len() of that string counted them as having 3 schools, and the
# `n == 100` check below never matched. Mark them as 'no info' directly.
valid_list = [
    len(x) if isinstance(x, (list, tuple)) else 'no info'
    for x in list_scool_dict
]

data['schools_around'] = valid_list

def valid(n):
    """Map the legacy sentinel count 100 to 'no info'; return any other value unchanged."""
    if n == 100:
        return 'no info'
    else:
        return n

data['schools_around'] = data['schools_around'].apply(valid)

In [None]:
info_rewiev(data['schools_around'])

In [None]:
n_schools = list(data['schools_around'].value_counts()[0:9].keys())
n_schools

In [None]:
def nm_schools(n):
    """Keep a school count that is listed in the global ``n_schools``; everything else becomes 'Other'."""
    if n not in n_schools:
        return 'Other'
    return n

data['schools_around'] = data['schools_around'].apply(nm_schools)

In [None]:
hist_info(data['schools_around'])

In [None]:
visualisation(data['schools_around'])

In [None]:
data_dict['schools'] = 'Обработали'

#EDA zipcode

In [None]:
info_rewiev(data['zipcode'])

In [None]:
data_dict['zipcode'] = 'Удалим, мало иформации'

#EDA beds

In [None]:
info_rewiev(data['beds'])

In [None]:
#Let's clean up the data
data['beds'].value_counts().head(20)

In [None]:
data['beds'] = data['beds'].apply(lambda x: str(x)[:1])

In [None]:
def _bed_bucket(x):
    """Return the first of '1'..'5' contained in ``x``; everything else is '6/6+'."""
    for digit in ('1', '2', '3', '4', '5'):
        if digit in x:
            return digit
    return '6/6+'

data['beds'] = data['beds'].apply(_bed_bucket)

In [None]:
data['beds'].value_counts().head(20)

In [None]:
hist_info(data['beds'])

In [None]:
visualisation(data['beds'])

In [None]:
data_dict['beds'] = 'Переводим в категории'

#EDA state

In [None]:
info_rewiev(data['state'])

In [None]:
states_h=list(data['state'].value_counts()[:2].keys())
states_m=list(data['state'].value_counts()[2:7].keys())
states_l=list(data['state'].value_counts()[7:].keys())
def state_rang(name_st):
    """Rank a state by listing volume using the global ``states_h`` and ``states_m`` lists.

    Returns 'states_h', 'states_m' or, for every other state, 'states_l'.
    """
    if name_st in states_h:
        return 'states_h'
    return 'states_m' if name_st in states_m else 'states_l'
data['state_rang'] = data['state'].apply(lambda x: state_rang(x))
data.head()

In [None]:
hist_info(data['state_rang'])

In [None]:
visualisation(data['state_rang'])

In [None]:
data_dict['state'] = 'Обработано'
# The derived column is named 'state_rang' (see the assignment above), so the
# note is stored under that key rather than under 'state_rating'.
data_dict['state_rang'] = 'Переводим в категории' 

#EDA mls-id

In [None]:
info_rewiev(data['mls-id'])

In [None]:
#Uninformative data
data_dict['mls-id'] = 'Удалим'

#EDA stories

In [None]:
info_rewiev(data['stories'])

In [None]:
data['stories'].value_counts(dropna=False).head(20)

In [None]:
def _normalise_stories(value):
    """Reduce a raw stories value to 1, 2 or 3 where possible.

    The text is lower-cased, then the first marker found in the order
    '1', 'one', '2', 'two', '3', 'three' decides the result; text without any
    marker is returned lower-cased.
    """
    text = str(value).lower()
    for marker, floors in (('1', 1), ('one', 1), ('2', 2), ('two', 2), ('3', 3), ('three', 3)):
        if marker in text:
            return floors
    return text

data['stories'] = data['stories'].apply(_normalise_stories)

In [None]:
data['stories'].value_counts().head(10)

In [None]:
def story_changed(story):
    """Turn a normalised stories value into a category label.

    1 -> 'One', 2 -> 'Two', 3 -> 'Three', the stringified NaN -> 'No info',
    anything else -> 'Other'. (The labels for 1 and 2 were swapped before.)
    """
    if story == 1: return 'One'
    elif story == 2: return 'Two'
    elif story == 3: return 'Three'
    elif story == 'nan': return 'No info'
    else: return 'Other'
    
data['stories'] = data['stories'].apply(lambda x: story_changed(x))    

In [None]:
hist_info(data['stories'])

In [None]:
visualisation(data['stories'])

In [None]:
data_dict['stories'] = 'Переведем в категории'

#EDA PrivatePool

In [None]:
info_rewiev(data['PrivatePool'])

In [None]:
data['PrivatePool'] = data['PrivatePool'].apply(lambda x: str(x).lower())

def pool_yes(pool):
    """Return 'info available' for an exact 'yes', otherwise 'no info'."""
    return 'info available' if pool == 'yes' else 'no info'
    
data['PrivatePool'] = data['PrivatePool'].apply(lambda x: pool_yes(x))   

In [None]:
hist_info(data['PrivatePool'])

In [None]:
visualisation(data['PrivatePool'])

In [None]:
data_dict['PrivatePool'] = 'Обработано'

#EDA MlsId

In [None]:
info_rewiev(data['MlsId'])

In [None]:
# Uninformative data
data_dict['MlsId'] = 'Удаляем'

#EDA target

In [None]:
info_rewiev(data['target'])

In [None]:
def _normalise_target_text(value):
    """Apply the price-string replacements to one raw target value, in the listed order."""
    text = str(value)
    replacements = (
        ('$', ''),
        ('+', ''),
        (' - ', ''),
        ('-', ''),
        ('/mo', '000'),
        ('nan', '225,000'),
        ('27603-4374', '23229'),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

data['target'] = data['target'].apply(_normalise_target_text)

In [None]:
def target_parsing(x):
    """Drop every non-digit character from ``x`` and return the remaining digits as an int."""
    digits_only = re.sub('[^0-9]', '', x)
    return int(digits_only)

data['target'] = data['target'].apply(target_parsing)
# describe must be called; the bare attribute only displayed the bound method.
data['target'].describe()

In [None]:
data.target.min(), data.target.median(), data.target.mean(), data.target.max() 

In [None]:
np.log(data['target']+1).plot(kind='hist', grid=True, use_index=True, legend=True, bins=50, figsize=(9,7))

In [None]:
data_dict['target'] = 'Целевая переменная'

In [None]:
# Quartiles of the price, truncated to whole numbers.
target_25, target_50, target_75 = (int(data.target.quantile(q)) for q in (0.25, 0.50, 0.75))

print('25 квантиль:', target_25)
print('50 квантиль:', target_50)
print('75 квантиль:', target_75)

In [None]:
len(data[data['target'] < target_25])

In [None]:
len(data[data['target'] > target_75])

In [None]:
len(data[(data['target'] >= target_25) & (data['target'] <= target_75)])

In [None]:
boxplot = data.boxplot(column=['target'])

In [None]:
fig_dims = (8, 6)
fig, ax = plt.subplots(figsize=fig_dims)
sns.distplot(data['target'], ax=ax, bins=50)

In [None]:
data = data.query("target != target.max()")

In [None]:
delta = target_75-target_25

def outliers_del(x):
    """Flag a price with the 1.5 * IQR rule: 0 when outside the fences, 1 otherwise.

    Uses the globals ``target_25``, ``target_75`` and ``delta``.
    """
    lower_fence = target_25 - 1.5*delta
    upper_fence = target_75 + 1.5*delta
    if x < lower_fence or x > upper_fence:
        return 0
    return 1
    
data['outliers'] = data['target'].apply(lambda x: outliers_del(x))  

In [None]:
data['outliers'].value_counts()

In [None]:
data = data.query("outliers == 1")

In [None]:
data.target.min(), data.target.mean(), data.target.median(), data.target.max()

In [None]:
boxplot = data.boxplot(column=['target'])

#EDA completed

As you can see, the categories are unevenly populated: the number of elements in one category differs from the number in the others. Some data had to be excluded from further analysis, and new features were derived from several columns.

In [None]:
data.info()

In [None]:
data.head()

In [None]:
# Column bookkeeping: three lists that are filled below and summed in the next cell.
category_cols=[]
cols_drop=[]
number_cols=[]

# Raw / intermediate columns that should not reach the model.
# NOTE(review): 'dv' is not created anywhere in the visible notebook, and
# 'outliers' (created during outlier filtering) is not listed here — confirm.
cols_drop += ['status','private pool','propertyType','street','street_t','baths','homeFacts',
                'fireplace_grad','city','schools','zipcode','state','mls-id','MlsId','year_built','dv','distance']
# Engineered categorical features.
category_cols += ['status_rating','property_rating','number_prop_descr','street_type','number_bath','fireplace',
            'city_rating','beds','state_rang','stories','PrivatePool', 'schools_around', 'house_status','path_school']
# Numeric features.
number_cols += ['sqft','distance_upd']

In [None]:
len(data.columns), len(cols_drop) + len(category_cols) + len(number_cols)

In [None]:
#remove these columns and process the rest
df=data.copy()

In [None]:
df.drop(['status','private pool','propertyType','street','street_t','baths','homeFacts', 'fireplace_grad','city','schools','zipcode','state','mls-id',
                 'MlsId','year_built','distance','outliers'], axis=1, inplace=True)


In [None]:
for colum in category_cols:
    df[colum] = df[colum].astype('category').cat.codes
    
df.head(2)

In [None]:
df.info()

In [None]:
df.sample(5)

In [None]:
df.describe()

In [None]:
exp_df=df.copy()

In [None]:
#Let's make dummy variables
df = pd.get_dummies(df, columns=['fireplace','beds','stories','PrivatePool','status_rating','property_rating', 'number_prop_descr','street_type','number_bath', 'city_rating','state_rang'] )
df

In [None]:
df[number_cols]

#Data preparation for ML

In [None]:
df.isna().sum()

In [None]:
X = df.drop(['target'], axis=1)
y = df['target'].astype(float)

#Splitting into train, validation and test in the appropriate ratio: 70%, 15%, 15%. 

In [None]:
X_train, X_to_test, y_train, y_to_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_SEED)
X_val, X_test, y_val, y_test = train_test_split(X_to_test, y_to_test, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
scaler = MinMaxScaler()
X_train[number_cols] = scaler.fit_transform(X_train[number_cols])
X_val[number_cols] = scaler.transform(X_val[number_cols])
X_test[number_cols] = scaler.transform(X_test[number_cols])

In [None]:
display(X_train[number_cols], X_val[number_cols], X_test[number_cols])

In [None]:
# Shapes of every split; the last entry is the test target (y_train was shown twice before).
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

In [None]:
correlations = X_train.corrwith(y_train).sort_values(ascending=False)

plt.figure(figsize=(20, 10))

plot = sns.barplot(y=correlations.index, x=correlations)

In [None]:
#Function to display feature weights for a model

def show_weights(features, weights, scales):
    """Plot two side-by-side horizontal bar charts: feature weights and feature scales.

    The three sequences are zipped and sorted by weight in descending order, so
    both panels list the features in the same order.
    """
    fig, (weight_ax, scale_ax) = plt.subplots(figsize=(14, 10), ncols=2)
    ranked = sorted(zip(weights, features, scales), reverse=True)
    ordered_weights = [weight for weight, _, _ in ranked]
    ordered_features = [feature for _, feature, _ in ranked]
    ordered_scales = [scale for _, _, scale in ranked]
    sns.barplot(y=ordered_features, x=ordered_weights, ax=weight_ax, orient='h')
    weight_ax.set_xlabel("Weight")
    sns.barplot(y=ordered_features, x=ordered_scales, ax=scale_ax, orient='h')
    scale_ax.set_xlabel("Scale")
    plt.tight_layout()

In [None]:
#A function that calculates metrics from data: validation, predicted, train, predicted train

def show_metrics(y_true_val, y_pred_val, y_true_train, y_pred_train):
    """Compute MAE, MSE, RMSE and MAPE for validation and train, print them and store them.

    Each score is appended to its module-level ``*_val_list`` / ``*_train_list``.
    """
    val_pair = (y_true_val, y_pred_val)
    train_pair = (y_true_train, y_pred_train)

    # calculate metrics
    mae_val, mae_train = mean_absolute_error(*val_pair), mean_absolute_error(*train_pair)
    mse_val, mse_train = mean_squared_error(*val_pair), mean_squared_error(*train_pair)
    rmse_val, rmse_train = rmse(*val_pair), rmse(*train_pair)
    mape_val, mape_train = mape(*val_pair), mape(*train_pair)

    # metrics output: one val/train pair per metric, each pair followed by an empty line
    report_lines = [
        f"Val MAE: {mae_val:0.2f}", f"Train MAE: {mae_train:0.2f}", "",
        f"Val MSE: {mse_val:0.2f}", f"Train MSE: {mse_train:0.2f}", "",
        f"Val RMSE: {rmse_val:0.2f}", f"Train RMSE: {rmse_train:0.2f}", "",
        f"Val MAPE: {mape_val:0.2f}", f"Train MAPE: {mape_train:0.2f}", "",
    ]
    for line in report_lines:
        print(line)

    # add metrics to the result lists
    stored_scores = (
        (mae_val_list, mae_val), (mae_train_list, mae_train),
        (mse_val_list, mse_val), (mse_train_list, mse_train),
        (rmse_val_list, rmse_val), (rmse_train_list, rmse_train),
        (mape_val_list, mape_val), (mape_train_list, mape_train),
    )
    for store, score in stored_scores:
        store.append(score)
    
    
#r2_score correction    
def r2_corr(model, name):
    """Print and store R² and adjusted R² of ``model`` on the validation and train splits.

    Scores are computed against the log of the target (the models are fitted on
    ``np.log(y)``). Results and ``name`` are appended to the module-level lists.
    """
    def _adjusted(r2, n_rows, n_features):
        # adjusted R²: penalises the score for the number of features
        return 1 - (1-r2)*(n_rows-1)/(n_rows-n_features-1)

    r2_val = model.score(X_val, np.log(y_val))
    r2_train = model.score(X_train, np.log(y_train))
    print('r2_score  - val/train: ',round(r2_val,4), round(r2_train,4)) 

    r2_val_corr = _adjusted(r2_val, len(y_val), X_val.shape[1])
    r2_train_corr = _adjusted(r2_train, len(y_train), X_train.shape[1])
    print('r2_score_corrected - val/train: ', round(r2_val_corr,4), round(r2_train_corr,4))  

    # add to the result lists
    for store, entry in (
        (r2_val_list, r2_val),
        (r2_train_list, r2_train),
        (r2_val_corr_list, r2_val_corr),
        (r2_train_corr_list, r2_train_corr),
        (model_name_list, name),
    ):
        store.append(entry)

In [None]:
#Lists to store results
model_name_list=[]
mae_val_list = []; mae_train_list = []
mse_val_list = []; mse_train_list = []
rmse_val_list = []; rmse_train_list = []
mape_val_list = []; mape_train_list= []    
r2_val_list = []; r2_train_list = []
r2_val_corr_list = []; r2_train_corr_list = []

In [None]:
y_pred_naive=y_train.mean()
y_pred_naive

In [None]:
# lets make baseline

In [None]:
y_val_pred_list = [y_pred_naive for x in range(len(y_val))]
y_train_pred_list = [y_pred_naive for x in range(len(y_train))]

len(y_val_pred_list), len(y_train_pred_list)

In [None]:
# Metrics of the constant (train-mean) baseline.
# Validation and train scores are kept in separate variables; previously the
# train score overwrote the validation score, so both printed lines showed the
# train value.
mae_naive_val = mean_absolute_error(y_val, y_val_pred_list)
mae_naive_train = mean_absolute_error(y_train, y_train_pred_list)
mse_naive_val = mean_squared_error(y_val, y_val_pred_list)
mse_naive_train = mean_squared_error(y_train, y_train_pred_list)

# Kept for backward compatibility: the original cell left the train scores in these names.
mae_naive = mae_naive_train
mse_naive = mse_naive_train
    
print(f"Val MAE: {mae_naive_val:0.2f}")
print(f"Train MAE: {mae_naive_train:0.2f}")
print()
print(f"Val MSE: {mse_naive_val:0.2f}")
print(f"Train MSE: {mse_naive_train:0.2f}")
print()
print()

#baseline

#LinearRegression and metrics

In [None]:
# Fit on the log of the price and map the predictions back with exp.
lr = LinearRegression(n_jobs=-1)
lr.fit(X_train, np.log(y_train))
predict_lr_val = np.exp(lr.predict(X_val))
predict_lr_train = np.exp(lr.predict(X_train))


In [None]:
show_metrics(y_val, predict_lr_val, y_train, predict_lr_train)
print()
r2_corr(lr, 'LinearRegression')

In [None]:
show_weights(X.columns, lr.coef_, X_train.std())

#GradientBoosting and metrics

In [None]:
# Default gradient boosting, fitted on the log of the price; predictions are mapped back with exp.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, np.log(y_train))
predict_gb_val = np.exp(gb.predict(X_val))
predict_gb_train = np.exp(gb.predict(X_train))

show_metrics(y_val, predict_gb_val, y_train, predict_gb_train)
print()
r2_corr(gb, 'GradientBoostingRegressor')

In [None]:
show_weights(X.columns, gb.feature_importances_, X_train.std())

In [None]:
# Tuned gradient boosting, fitted on the log of the price.
# max_features='auto' is no longer accepted by current scikit-learn; for this
# estimator it meant "use all features", which is what None selects.
gb_t = GradientBoostingRegressor(max_depth=15, min_samples_leaf=15,  min_samples_split=10, max_features=None, random_state=42)
gb_t.fit(X_train, np.log(y_train))
predict_gb_t_val = gb_t.predict(X_val)
predict_gb_t_val = np.exp(predict_gb_t_val)
predict_gb_t_train = gb_t.predict(X_train)
predict_gb_t_train = np.exp(predict_gb_t_train)

show_metrics(y_val, predict_gb_t_val, y_train, predict_gb_t_train)
print()
r2_corr(gb_t, 'GradientBoostingRegressor_tune')

In [None]:
show_weights(X.columns, gb_t.feature_importances_, X_train.std())

#RandomForest and metrics

In [None]:

# Random forest with default hyper-parameters, trained on the log target.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, np.log(y_train))

# Both prediction sets must be mapped back with exp. The train predictions
# were previously passed through np.log a second time, which made every
# train metric meaningless.
predict_rf_val = np.exp(rf.predict(X_val))
predict_rf_train = np.exp(rf.predict(X_train))

show_metrics(y_val, predict_rf_val, y_train, predict_rf_train)
print()
r2_corr(rf, 'RandomForestRegressor')

In [None]:
# Show feature importances of the default random forest
# (show_weights is defined earlier in the notebook; its use of X_train.std() is not visible here).
show_weights(X.columns, rf.feature_importances_, X_train.std())

In [None]:
# Tuned random forest, trained on the log target.
# max_features=1.0 uses all features at each split, which is what the former
# 'auto' value meant for RandomForestRegressor; 'auto' is rejected by
# scikit-learn >= 1.3.
rf_t = RandomForestRegressor(
    max_depth=110,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features=1.0,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf_t.fit(X_train, np.log(y_train))

# Both prediction sets must be mapped back with exp. The train predictions
# were previously passed through np.log instead, corrupting the train metrics.
predict_rf_t_val = np.exp(rf_t.predict(X_val))
predict_rf_t_train = np.exp(rf_t.predict(X_train))

show_metrics(y_val, predict_rf_t_val, y_train, predict_rf_t_train)
print()
r2_corr(rf_t, 'RandomForestRegressor_tune1')

In [None]:
# Show feature importances of the tuned random forest
# (show_weights is defined earlier in the notebook; its use of X_train.std() is not visible here).
show_weights(X.columns, rf_t.feature_importances_, X_train.std())

#CatBoost and metrics

In [None]:

# CatBoost trained on the log target. The validation split (also in log
# space) is supplied as eval_set together with use_best_model=True.
ctb = CatBoostRegressor(
    iterations=5000,
    random_seed=RANDOM_SEED,
    eval_metric='MAPE',
    custom_metric=['R2', 'MAE'],
    silent=True,
)
ctb.fit(
    X_train,
    np.log(y_train),
    eval_set=(X_val, np.log(y_val)),
    verbose_eval=0,
    use_best_model=True,
)

# Map predictions out of log space before scoring.
predict_ctb_val = np.exp(ctb.predict(X_val))
predict_ctb_train = np.exp(ctb.predict(X_train))

In [None]:
# Report validation/train metrics for CatBoost (predictions are already back-transformed with exp).
show_metrics(y_val, predict_ctb_val, y_train, predict_ctb_train)
print()
# r2_corr is defined earlier in the notebook; presumably it reports R2 / adjusted R2 for the model under this label — TODO confirm.
r2_corr(ctb, 'CatBoostRegressor')

In [None]:
# Show CatBoost feature importances
# (show_weights is defined earlier in the notebook; its use of X_train.std() is not visible here).
show_weights(X.columns, ctb.feature_importances_, X_train.std())

#Let's summarize all the information in a dataframe

In [None]:

# Gather every per-model metric list into one comparison table, one row per model name.
# NOTE(review): 'r2_val_corr' / 'r2_train_corr' are filled from the same lists as
# 'r2_val' / 'r2_train', so the "corr" columns duplicate the plain R2 columns —
# confirm whether dedicated corrected-R2 lists were meant to be used here.
feat_full = pd.DataFrame(
    {
        'mae_val': mae_val_list,
        'mae_train': mae_train_list,
        'mse_val': mse_val_list,
        'mse_train': mse_train_list,
        'r2_val': r2_val_list,
        'r2_train': r2_train_list,
        'r2_val_corr': r2_val_list,
        'r2_train_corr': r2_train_list,
        'rmse_val': rmse_val_list,
        'rmse_train': rmse_train_list,
        'mape_val': mape_val_list,
        'mape_train': mape_train_list,
    },
    index=model_name_list,
)

feat_full

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation on the training split for each classical model.
# The target here is the raw y_train (not its log, unlike the fits above), and
# the scorer is sklearn's 'neg_mean_absolute_error'.
_cv_settings = dict(cv=5, scoring='neg_mean_absolute_error')

all_scores_lr = cross_val_score(lr, X_train, y_train, **_cv_settings)
all_scores_rf = cross_val_score(rf, X_train, y_train, **_cv_settings)
all_scores_rf_t = cross_val_score(rf_t, X_train, y_train, **_cv_settings)
all_scores_gb = cross_val_score(gb, X_train, y_train, **_cv_settings)
all_scores_gb_t = cross_val_score(gb_t, X_train, y_train, **_cv_settings)

In [None]:
# The scores come from scoring='neg_mean_absolute_error', i.e. they are MAE
# with the sign flipped. Negate them so the value printed under "MAE mean" is
# an actual (non-negative) MAE; the std is unaffected by the sign.
cv_scores_by_label = [
    ('LR', all_scores_lr),
    ('RF', all_scores_rf),
    ('RF_tuned', all_scores_rf_t),
    ('GB', all_scores_gb),
    ('GB_tuned', all_scores_gb_t),
]

for label, neg_mae_scores in cv_scores_by_label:
    mae_scores = -neg_mae_scores
    print('{}: MAE mean on cv: {}, MAE std on cv: {}'.format(label, mae_scores.mean(), mae_scores.std()))

In [None]:
import lightgbm as lgm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Shuffled 5-fold cross-validation of a default LightGBM regressor on the raw target.
model = lgm.LGBMRegressor()

cv = KFold(n_splits=5, random_state=1, shuffle=True)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')

# n_scores holds negated MAE (sklearn scorer convention); flip the sign so the
# number printed as "MAE mean" is a real MAE.
mae_scores = -n_scores
print('MAE mean: %.3f, MAE std: %.3f' % (mae_scores.mean(), mae_scores.std()))

In [None]:
# Shuffled 5-fold cross-validation of a default CatBoost regressor on the raw target.
model = CatBoostRegressor()

cv = KFold(n_splits=5, random_state=1, shuffle=True)
n_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')

# n_scores holds negated MAE (sklearn scorer convention); flip the sign so the
# number printed as "MAE mean" is a real MAE. The first label used to read
# "MAE std" although it reports the mean.
mae_scores = -n_scores
print('MAE mean: %.3f, MAE std: %.3f' % (mae_scores.mean(), mae_scores.std()))

What has been done:
- cleaned the data and split it into train, validation and test sets; ran cross-validation;
- compared linear regression, random forest, gradient boosting and CatBoost, including hyperparameter tuning;
- according to the metrics, the best models are CatBoost and the tuned GradientBoosting, so these two are used for prediction on the test data.

#Let's check the prediction on the test

In [None]:


def show_metrics_test(y_true_test, y_pred_test, model, X=None):
    """Print hold-out metrics for a model that was fitted on a log-transformed target.

    MAE, MSE, RMSE and MAPE compare ``y_true_test`` with ``y_pred_test`` on the
    original scale. R2 comes from ``model.score`` on the features against
    ``log(y_true_test)``, i.e. it is measured in log space.

    Parameters
    ----------
    y_true_test : array-like
        True target values on the original scale.
    y_pred_test : array-like
        Predictions on the original scale (already back-transformed).
    model : fitted estimator
        Object exposing ``score(X, y)``.
    X : array-like with ``.shape``, optional
        Feature matrix matching ``y_true_test``. Defaults to the notebook-level
        ``X_test``, which is what the function always used before.
    """
    features = X_test if X is None else X
    n_obs = len(y_true_test)
    n_features = features.shape[1]

    # calculate metrics
    mae_test = mean_absolute_error(y_true_test, y_pred_test)
    mse_test = mean_squared_error(y_true_test, y_pred_test)
    rmse_test = rmse(y_true_test, y_pred_test)
    mape_test = mape(y_true_test, y_pred_test)

    # Use the y passed in rather than the global y_test, so the R2 always
    # refers to the same data as the other metrics.
    r2_test = model.score(features, np.log(y_true_test))

    # Adjusted R2 is undefined when there are not more observations than
    # features + 1; report NaN instead of dividing by zero or a negative count.
    dof = n_obs - n_features - 1
    if dof > 0:
        r2_test_corr = 1 - (1 - r2_test) * (n_obs - 1) / dof
    else:
        r2_test_corr = float('nan')

    # output metrics
    print(f"Test MAE: {mae_test:0.2f}")
    print(f"Test MSE: {mse_test:0.2f}")
    print(f"Test RMSE: {rmse_test:0.2f}")
    print(f"Test MAPE: {mape_test:0.2f}")
    print(f"Test R2: {r2_test:0.3f}")
    print(f"Test R2_corr: {r2_test_corr:0.3f}")

#Prediction based on tuned gradient boosting

In [None]:

# Tuned gradient boosting on the hold-out test split; exp undoes the log target transform.
predict_gb_test = np.exp(gb_t.predict(X_test))
show_metrics_test(y_test, predict_gb_test, gb_t)

#Prediction based on CatBoost

In [None]:
# CatBoost on the hold-out test split; exp undoes the log target transform.
predict_ctb_test = np.exp(ctb.predict(X_test))
show_metrics_test(y_test, predict_ctb_test, ctb)

As you can see, GradientBoosting is better on MAE, while CatBoost is better on MAPE. Given the purpose of the task, MAE is the more relevant metric to compare on.

#Neural network. Model training.

In [None]:
#Function for building a model with input data layers

def our_build_model():
    """Build and compile the baseline fully connected regression network.

    Four ReLU hidden layers (128, 64, 64, 32 units) feed a single linear
    output unit. The input width is read from the notebook-level ``X_train``.
    The model is compiled with the Adam optimizer, MSE loss and an MAE metric.
    """
    layer_stack = [
        L.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        L.Dense(64, activation='relu'),
        L.Dense(64, activation='relu'),
        L.Dense(32, activation='relu'),
        L.Dense(1, activation='linear'),
    ]
    network = M.Sequential(layer_stack)
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

nnr = our_build_model()

# We train
# NOTE(review): the hold-out test split is passed as validation_data, so the
# curves labelled "Validation" below are computed on the test data.
history = nnr.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=20)

history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
# These two hold the MAE metric histories (the model tracks 'mae', not accuracy).
accuracy = history_dict['mae']
val_accuracy = history_dict['val_mae']

epochs = range(1, len(loss_values) + 1)
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: MAE per epoch. This axis was created but left empty before,
# and the MAE histories above were read but never plotted.
ax[0].plot(epochs, accuracy, 'bo', label='Training MAE')
ax[0].plot(epochs, val_accuracy, 'b', label='Validation MAE')
ax[0].set_title('Training & Validation MAE', fontsize=16)
ax[0].set_xlabel('Epochs', fontsize=16)
ax[0].set_ylabel('MAE', fontsize=16)
ax[0].legend()

# Right panel: MSE loss per epoch.
ax[1].plot(epochs, loss_values, 'bo', label='Training loss')
ax[1].plot(epochs, val_loss_values, 'b', label='Validation loss')
ax[1].set_title('Training & Validation Loss', fontsize=16)
ax[1].set_xlabel('Epochs', fontsize=16)
ax[1].set_ylabel('Loss', fontsize=16)
ax[1].legend()

In [None]:
# This figure shows the MAE metric per epoch, so title it accordingly
# (it used to be titled 'Loss'), and draw the legend so the 'train'/'test'
# labels are actually visible.
plt.title('MAE')
plt.plot(history.history['mae'], label='train')
plt.plot(history.history['val_mae'], label='test')
plt.legend()
plt.show();

In [None]:
# Loss (MSE) per epoch; the legend is required for the 'train'/'test' labels to show up.
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
#Let's make lists for class instance parameters and metrics

# Metric accumulators: TryModel.saved_metrics() appends one value per trained network.
r2_test_lst = []
r2_train_lst = []
mae_test_lst = []
mae_train_lst = []
mse_test_lst = []
mse_train_lst = []

# Hidden-layer sizes, appended by TryModel.save_model_param()
l1_lst = []
l2_lst = []
l3_lst = []
l4_lst = []

# Model name
model_name_lst = []

# Initialisation
# `initializers` is not among the keras names imported at the top of the
# notebook (models, layers, backend, optimizers), so bring it into scope here
# to avoid a NameError on a fresh kernel.
from keras import initializers

init_1 = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=12345)
init_2 = initializers.Constant(value=1e-3)
init_3 = initializers.RandomNormal(mean=0.0, stddev=0.05, seed=123456)

In [None]:
class TryModel:
    """Wrapper around a four-hidden-layer dense regressor for comparing network variants.

    The network is built and trained on the notebook-level ``X_train`` /
    ``y_train`` and evaluated on ``X_val`` / ``y_val``. The target is trained in
    log space (``log1p``) and predictions are mapped back with the matching
    inverse (``expm1``) before any metric is computed.

    Parameters
    ----------
    layer1, layer2, layer3, layer4 : int
        Unit counts of the four hidden Dense layers.
    activation1 : str
        Activation used by every hidden layer.
    activation2 : str
        Activation of the single-unit output layer.
    model_name : str
        Label stored alongside the layer sizes.
    simple : int
        Architecture variant:
        0 - plain Dense stack;
        1 - Dropout(0.2) after every hidden layer;
        2 - Dropout(0.2) plus the notebook-level ``init_1`` (kernel) and
            ``init_2`` (bias) initializers on the hidden layers;
        3 - the same initializers without Dropout.
    """

    # variant id -> (use_dropout, use_custom_initializers)
    _VARIANTS = {
        0: (False, False),
        1: (True, False),
        2: (True, True),
        3: (False, True),
    }
    _DROPOUT_RATE = 0.2

    # Layer initialization, activation method, model name
    def __init__(self, layer1, layer2, layer3, layer4, activation1, activation2, model_name, simple):
        self.__layer1 = layer1
        self.__layer2 = layer2
        self.__layer3 = layer3
        self.__layer4 = layer4
        self.__activation1 = activation1
        self.__activation2 = activation2
        self.__model_name = model_name
        self.__simple = simple

    # Save the entered model parameters to the lists
    def save_model_param(self):
        """Append this model's layer sizes and name to the notebook-level lists."""
        l1_lst.append(self.__layer1)
        l2_lst.append(self.__layer2)
        l3_lst.append(self.__layer3)
        l4_lst.append(self.__layer4)
        model_name_lst.append(self.__model_name)
        print('Параметры модели сохранены')

    @staticmethod
    def _to_log(target):
        """Forward target transform used for training."""
        return np.log1p(target)

    @staticmethod
    def _from_log(prediction):
        """Exact inverse of ``_to_log``; maps predictions back to the original scale."""
        return np.expm1(prediction)

    # build and train the model
    def our_build_model(self, epochs=1, batch_size=None):
        """Build the selected variant, train it and store val/train predictions.

        Parameters
        ----------
        epochs : int, default 1
            Passed to ``model.fit``; 1 matches the previous behaviour, where
            ``fit`` was called without an ``epochs`` argument.
        batch_size : int or None, default None
            Passed to ``model.fit``; None leaves the Keras default in place.

        Raises
        ------
        ValueError
            If ``simple`` is not one of the supported variants. Previously this
            only printed a message, and the following metric calls failed with
            an AttributeError on the missing predictions.
        """
        # Validate first, before anything is built.
        if self.__simple not in self._VARIANTS:
            print('Ошибка')
            raise ValueError(
                "Unknown model variant simple={!r}; expected one of {}".format(
                    self.__simple, sorted(self._VARIANTS)
                )
            )
        use_dropout, use_custom_init = self._VARIANTS[self.__simple]

        # The custom initializers apply to the hidden layers only; the output
        # layer keeps the Keras defaults in every variant.
        init_kwargs = {}
        if use_custom_init:
            init_kwargs = dict(kernel_initializer=init_1, bias_initializer=init_2)

        # set the model
        model = M.Sequential()
        hidden_units = (self.__layer1, self.__layer2, self.__layer3, self.__layer4)
        for position, units in enumerate(hidden_units):
            layer_kwargs = dict(init_kwargs)
            if position == 0:
                layer_kwargs['input_shape'] = (X_train.shape[1],)
            model.add(L.Dense(units, activation=self.__activation1, **layer_kwargs))
            if use_dropout:
                model.add(L.Dropout(self._DROPOUT_RATE))
        model.add(L.Dense(1, activation=self.__activation2))
        model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        model.summary()
        self.model = model
        print('Модель создана')

        # Train the model and make predictions.
        # All variants now share one target transform with a matching inverse;
        # before, two variants trained on log(y + 1) but inverted with exp().
        model.fit(X_train, self._to_log(y_train), epochs=epochs, batch_size=batch_size, verbose=1)

        self.__pred_test = self._from_log(model.predict(X_val))
        self.__pred_train = self._from_log(model.predict(X_train))
        K.clear_session()
        print('Модель обучена')

    def _scaled_scores(self):
        """Return (r2_test, r2_train, mae_test, mae_train, mse_test, mse_train).

        "test" refers to the validation split (``y_val``). Every value is
        multiplied by 100 and rounded to 3 decimals.
        """
        # NOTE(review): the x100 factor reads as a percentage for R2 but also
        # inflates MAE and MSE; kept unchanged so the results table stays comparable.
        splits = ((y_val, self.__pred_test), (y_train, self.__pred_train))
        scores = []
        for metric in (r2_score, mean_absolute_error, mean_squared_error):
            for y_true, y_hat in splits:
                scores.append(round(metric(y_true, y_hat) * 100, 3))
        return tuple(scores)

    # metrics output
    def show_metrics(self):
        """Print R2, MAE and MSE for the validation ("test") and train predictions."""
        labels = (
            'r2_score on test:',
            'r2_score on train:',
            'mae on test:',
            'mae on train:',
            'mse on test:',
            'mse on train:',
        )
        for label, value in zip(labels, self._scaled_scores()):
            print(label, value)

    def saved_metrics(self):
        """Append the same six metric values to the notebook-level metric lists."""
        buckets = (
            r2_test_lst,
            r2_train_lst,
            mae_test_lst,
            mae_train_lst,
            mse_test_lst,
            mse_train_lst,
        )
        for bucket, value in zip(buckets, self._scaled_scores()):
            bucket.append(value)
        print('Метрики {r2, mae, mse} добавлены на тесте и трейне')

    # train visualisation
    def plot_history(self, epochs=5, batch_size=20):
        """Continue training the stored model and plot MAE and loss per epoch.

        The model is fitted on the same log-transformed target it was trained
        on in ``our_build_model`` (previously the raw target was used here),
        with the notebook-level test split as validation data.
        """
        history = self.model.fit(
            X_train,
            self._to_log(y_train),
            validation_data=(X_test, self._to_log(y_test)),
            epochs=epochs,
            batch_size=batch_size,
        )
        # First figure shows the MAE metric, so it is titled as such.
        plt.title('MAE')
        plt.plot(history.history['mae'], label='train')
        plt.plot(history.history['val_mae'], label='test')
        plt.legend()
        plt.show()
        plt.title('Loss')
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='test')
        plt.legend()
        plt.show()

In [None]:
#Let's create an instance of the class
# Variant 1 of TryModel: hidden layers of 512/256/256/64 units with Dropout(0.2) after each of them
model1 = TryModel(512,256,256,64,'relu','linear', 'model_1',1)

# Build and train the network, print its metrics, then append the metrics and layer sizes to the shared lists
model1.our_build_model()
model1.show_metrics()
model1.saved_metrics()
model1.save_model_param()


In [None]:
# Variant 2 of TryModel: Dropout(0.2) plus the init_1 / init_2 initializers on the hidden layers
model2 = TryModel(128,256,64,32,'relu','linear', 'model_2',2)  
# Build and train, print metrics, then record metrics and layer sizes
model2.our_build_model()
model2.show_metrics()
model2.saved_metrics()
model2.save_model_param()

In [None]:
# Variant 3 of TryModel: init_1 / init_2 initializers on the hidden layers, no Dropout
model3 = TryModel(128,256,64,32,'relu','linear', 'model_3', 3)  
# Build and train, print metrics, then record metrics and layer sizes
model3.our_build_model()
model3.show_metrics()
model3.saved_metrics()
model3.save_model_param()

In [None]:
# Fourth configuration. TryModel only implements variants 0-3; the value 4 used
# here before built no model, so the metric calls below failed. Variant 0 (plain
# Dense stack, no Dropout, default initializers) is the one not yet exercised.
# NOTE(review): variant 0 is an inference from the unused option — confirm.
# A distinct variable and name are used so that model_3's row is not duplicated.
model4 = TryModel(128, 256, 64, 32, 'relu', 'linear', 'model_4', 0)
model4.our_build_model()
model4.show_metrics()
model4.saved_metrics()
model4.save_model_param()

In [None]:
# Summary table of the neural-network experiments, one row per model name.
# The index list is `model_name_lst` (the list TryModel.save_model_param fills);
# the previous `model_names_lst` is not defined anywhere and raised a NameError.
class_df = pd.DataFrame(index=model_name_lst)

# Hidden-layer sizes
class_df['l1'] = l1_lst
class_df['l2'] = l2_lst
class_df['l3'] = l3_lst
class_df['l4'] = l4_lst

# Metrics; the "test" columns are computed on the validation split inside TryModel
class_df['r2_test'] = r2_test_lst
class_df['r2_train'] = r2_train_lst
class_df['test_MAE'] = mae_test_lst
class_df['train_MAE'] = mae_train_lst
class_df['test_MSE'] = mse_test_lst
class_df['train_MSE'] = mse_train_lst

class_df

#Conclusions on the use of the neural network


- Performed EDA and feature engineering.
- Built several classical ML models for the regression problem: compared the models and collected metrics for each of them.
- Built a neural network for the same problem, searched for the best architecture and compared the resulting models with each other.

As next steps, we can try other approaches such as TabNet or KFold.