In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, truncnorm, randint
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the rental listings that the models below are trained on.
# Forward slashes are used on purpose: the previous literal contained the
# invalid escape sequences "\m", "\R" and "\o" (a SyntaxWarning on recent
# Python versions) and only resolved on Windows. Forward slashes work on
# Windows, Linux and macOS alike.
df = pd.read_csv('./models/RFR/output.csv')
# Summary statistics of the numeric columns (last expression -> cell output).
df.describe()

Unnamed: 0,floor,total_floor_num,rent_price,room_num,area,year,year_renovation,top_floor,oldtown
count,2302.0,2302.0,2302.0,2302.0,2302.0,2302.0,80.0,2302.0,2302.0
mean,3.377932,5.809296,523.900956,2.02563,53.301108,1990.491746,2013.175,0.216334,0.205039
std,2.249265,3.226175,235.495612,0.789294,21.369041,25.717765,7.577072,0.411834,0.403818
min,0.0,1.0,155.0,1.0,16.0,1903.0,1965.0,0.0,0.0
25%,2.0,4.0,360.0,1.0,38.0,1975.0,2011.75,0.0,0.0
50%,3.0,5.0,460.0,2.0,50.0,2000.5,2016.0,0.0,0.0
75%,4.0,7.0,650.0,2.0,65.0,2011.0,2017.25,0.0,0.0
max,20.0,29.0,1480.0,6.0,150.0,2019.0,2019.0,1.0,1.0


In [6]:
# Preview the first five raw rows, including the non-numeric columns.
df.head(n=5)

Unnamed: 0,Adresas,floor,total_floor_num,rent_price,room_num,year_full,house_type,area,heating,year,year_renovation,top_floor,oldtown,district
0,https://www.aruodas.lt/butu-nuoma-vilniuje-sen...,4.0,6.0,890,3.0,2017,Monolitinis,76.0,Centrinis kolektorinis,2017,,0,1.0,senamiestyje
1,https://www.aruodas.lt/butu-nuoma-vilniuje-laz...,2.0,4.0,440,2.0,2008,Mūrinis,68.0,Centrinis kolektorinis,2008,,0,0.0,lazdyneliuose
2,https://www.aruodas.lt/butu-nuoma-vilniuje-sen...,2.0,3.0,811,3.0,1940,Mūrinis,64.0,Dujinis,1940,,0,1.0,senamiestyje
3,https://www.aruodas.lt/butu-nuoma-vilniuje-jus...,4.0,5.0,310,2.0,1985,Blokinis,50.0,Centrinis,1985,,0,0.0,justiniskese
4,https://www.aruodas.lt/butu-nuoma-vilniuje-nau...,4.0,9.0,380,2.0,2017,Mūrinis,34.0,Centrinis kolektorinis,2017,,0,0.0,naujamiestyje


## One-hot encoding of the district

In [7]:
# One-hot encode the "district" column so it can be used as numeric features.
# Reference: http://www.insightsbot.com/python-one-hot-encoding-with-scikit-learn/

# Missing districts become their own "None" category. The filled column is
# assigned back explicitly: calling fillna(inplace=True) on a column selection
# is deprecated in pandas 2.x and does not modify df under copy-on-write.
df["district"] = df["district"].fillna("None")
district_ohe = OneHotEncoder()

# Dense 0/1 matrix with one column per distinct district value.
X = district_ohe.fit_transform(df.district.values.reshape(-1, 1)).toarray()

# Column names come straight from the fitted categories. This avoids
# OneHotEncoder.get_feature_names(), which was removed in scikit-learn 1.2,
# and the manual stripping of the "x0_" prefix it required.
district_columns = [str(category) for category in district_ohe.categories_[0]]

# Reuse df's index so that concat aligns row by row even when df does not
# carry a default RangeIndex.
dfOneHot = pd.DataFrame(X, columns=district_columns, index=df.index)
df = pd.concat([df, dfOneHot], axis=1)

## Pipeline and model

In [8]:
# Random-forest pipeline: impute missing values first, then fit the forest.
# The step names matter: the "rfr__*" keys of the hyper-parameter grid
# address the step called "rfr".
rfr_steps = [
    ("impute", SimpleImputer(missing_values=np.nan)),
    ("rfr", RandomForestRegressor(random_state=42)),
]
Pipeline_rfr = Pipeline(steps=rfr_steps)

## train_test_split

In [9]:
# Keep only the float64/int64 columns; this drops the text columns while
# retaining the one-hot district columns.
numeric_dtypes = ['float64', 'int64']
df_train = df.select_dtypes(include=numeric_dtypes)

# Target is the rent price; every other numeric column is a feature.
df_target = df_train["rent_price"]
df_features = df_train.drop(columns="rent_price")

# Seeded split with scikit-learn's default train/test proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_target,
    random_state=42,
)

In [8]:
# Randomized hyper-parameter search space for the random forest, adapted from
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Every key is prefixed with "rfr__" so it targets the "rfr" step of Pipeline_rfr.

# Number of trees in the forest: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what the 'auto' alias meant for regressors; 'auto' itself is
# deprecated and rejected by recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited).
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample or on the full data.
bootstrap = [True, False]

# Assemble the random grid.
random_grid = {'rfr__n_estimators': n_estimators,
               'rfr__max_features': max_features,
               'rfr__max_depth': max_depth,
               'rfr__min_samples_split': min_samples_split,
               'rfr__min_samples_leaf': min_samples_leaf,
               'rfr__bootstrap': bootstrap}

# 100 sampled configurations, 5-fold CV, ranked by (negated) mean squared error.
# random_state makes the sampled configurations reproducible across runs,
# matching the seeded gradient-boosting search.
rf_random = RandomizedSearchCV(
    Pipeline_rfr,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
)

In [9]:
# Run the randomized search on the training split.
rf_random.fit(X=X_train, y=y_train)
# Show the best pipeline found by the search (last expression -> cell output).
rf_random.best_estimator_

Pipeline(memory=None,
         steps=[('impute',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('rfr',
                 RandomForestRegressor(bootstrap=False, criterion='mse',
                                       max_depth=30, max_features='sqrt',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=1600, n_jobs=None,
                                       oob_score=False, random_state=42,
                                       verbose=0, warm_start=False))],
         verbose=False)

In [10]:
# Hold-out (test split) error of the tuned random forest.
# Predict once and reuse the result for both metrics instead of running the
# forest over X_test twice.
test_predictions = rf_random.predict(X_test)
# int() truncates the metric to whole units of the target.
print("RMSE:", int(mean_squared_error(y_test, test_predictions) ** 0.5))
print("MAE:", int(mean_absolute_error(y_test, test_predictions)))

RSME: 88
MAE: 58


In [11]:
# Training-split error of the tuned random forest; compare with the test
# error to judge how strongly the model overfits.
# Predict once and reuse the result for both metrics.
train_predictions = rf_random.predict(X_train)
# int() truncates the metric to whole units of the target.
print("RMSE:", int(mean_squared_error(y_train, train_predictions) ** 0.5))
print("MAE:", int(mean_absolute_error(y_train, train_predictions)))

RSME: 18
MAE: 4


In [12]:
# Take the fitted forest out of the best pipeline, then read its
# per-feature importances.
model = rf_random.best_estimator_.named_steps['rfr']
feature_importances = model.feature_importances_
# Column labels of the training features, in the order the model saw them.
feature_names = X_train.columns

In [18]:
# One line per feature: "<name> ; <importance rounded to 3 decimals>".
for name, importance in zip(feature_names, feature_importances):
    print(name, ";", round(importance, 3))

floor ; 0.041
total_floor_num ; 0.059
room_num ; 0.164
area ; 0.313
year ; 0.171
year_renovation ; 0.006
top_floor ; 0.01
oldtown ; 0.07
0 ; 0.001
antakalnyje ; 0.004
aukstuosiuose-paneriuose ; 0.0
avizieniuose ; 0.001
bajoruose ; 0.001
balsiuose ; 0.0
baltupiuose ; 0.003
burbiskes ; 0.0
fabijoniskese ; 0.003
filaretuose ; 0.0
grigiskese ; 0.0
jeruzaleje ; 0.001
justiniskese ; 0.003
kalnenuose ; 0.0
karoliniskese ; 0.002
lazdyneliuose ; 0.001
lazdynuose ; 0.002
markuciuose ; 0.001
naujamiestyje ; 0.018
naujininkuose ; 0.002
naujojoje-vilnioje ; 0.003
pasilaiciuose ; 0.006
pavilnyje ; 0.0
pilaiteje ; 0.005
santariskese ; 0.001
sauletekyje ; 0.0
senamiestyje ; 0.058
seskineje ; 0.003
siaures-miestelyje ; 0.001
snipiskese ; 0.007
tarandeje ; 0.0
traku ; 0.003
uzupyje ; 0.019
valakampiuose ; 0.002
verkiuose ; 0.0
vilkpedeje ; 0.0
virsuliskese ; 0.001
visoriuose ; 0.0
zemuosiuose-paneriuose ; 0.001
zirmunuose ; 0.002
zveryne ; 0.008


## Gradient Boosting regression

In [None]:
# Gradient-boosting counterpart of the random-forest pipeline: impute missing
# values, then fit the booster. The "gbrt__*" keys below address the step
# called "gbrt".
Pipeline_gbrt = Pipeline(steps=[
    ("impute", SimpleImputer(missing_values=np.nan)),
    ("gbrt", GradientBoostingRegressor(random_state=42)),
])

# Number of boosting stages: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Shrinkage applied to each stage's contribution.
learn_rates = [0.02, 0.05, 0.06]
# Maximum depth of the individual trees: 10, 24, ..., 150, plus None.
max_depths = [int(x) for x in np.linspace(10, 150, num=11)]
max_depths.append(None)
# Minimum number of samples required at each leaf node.
min_samples_leaf = [5, 10, 15, 25, 50]
# Minimum number of samples required to split a node.
min_samples_split = [5, 10, 25, 50]

params = {'gbrt__n_estimators': n_estimators,
          'gbrt__learning_rate': learn_rates,
          'gbrt__max_depth': max_depths,
          'gbrt__min_samples_leaf': min_samples_leaf,
          'gbrt__min_samples_split': min_samples_split}

# cv and scoring are stated explicitly so this search is selected under the
# same protocol as the random-forest search (5-fold CV, negated MSE) instead
# of the estimator's default score and a library-version-dependent default
# fold count. n_iter is left at the library default.
gbrt_random = RandomizedSearchCV(
    Pipeline_gbrt,
    param_distributions=params,
    cv=5,
    scoring="neg_mean_squared_error",
    random_state=42,
)
gbrt_random.fit(X_train, y_train)
# Show the best pipeline found by the search (last expression -> cell output).
gbrt_random.best_estimator_

In [None]:
# Hold-out (test split) error of the tuned gradient-boosting model.
# Predict once and reuse the result for both metrics.
gbrt_test_predictions = gbrt_random.predict(X_test)
# int() truncates the metric to whole units of the target.
print("RMSE:", int(mean_squared_error(y_test, gbrt_test_predictions) ** 0.5))
print("MAE:", int(mean_absolute_error(y_test, gbrt_test_predictions)))

## export

In [19]:
import joblib

# Persist the fitted random-forest search object to disk; compress=9 is the
# strongest compression level joblib accepts.
# A saved model can be restored later with joblib.load(<path to the .pkl file>).
joblib.dump(value=rf_random, filename='RFR model.pkl', compress=9)

['RFR model.pkl']

In [None]:
# predicted_prices = model.predict(df_features)
# my_submission = pd.DataFrame({'area': df_features.area, 'predicted_prices': predicted_prices})
# my_submission.to_csv('predicted_prices.csv', index=False)

In [None]:
# df_features.to_csv('full.csv', index=False)
# df_target.to_csv('full target.csv', index=False)
# df.to_csv('df.csv', index=False)

In [11]:
# Inspect the first row of the training features (positional index 0) to see
# which columns, and in which order, the models are fitted on.
X_train.iloc[0]

floor                          1.00
total_floor_num                3.00
room_num                       2.00
area                          47.45
year                        1985.00
year_renovation                 NaN
top_floor                      0.00
oldtown                        0.00
0                              0.00
antakalnyje                    0.00
aukstuosiuose-paneriuose       0.00
avizieniuose                   0.00
bajoruose                      0.00
balsiuose                      0.00
baltupiuose                    0.00
burbiskes                      0.00
fabijoniskese                  0.00
filaretuose                    0.00
grigiskese                     0.00
jeruzaleje                     0.00
justiniskese                   0.00
kalnenuose                     0.00
karoliniskese                  0.00
lazdyneliuose                  1.00
lazdynuose                     0.00
markuciuose                    0.00
naujamiestyje                  0.00
naujininkuose               