In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, truncnorm, randint
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [37]:
# Read the listings CSV into `df`, then display summary statistics of its numeric columns.
# NOTE(review): the saved output of this cell shows suspicious extremes
# (year min 0, area max 8000, room_num max 18) — these look like data-entry
# outliers and may be worth filtering before modelling; confirm against the data.
listings_csv = 'output_excluded_senamiestyje.csv'
df = pd.read_csv(listings_csv)
df.describe()

Unnamed: 0,floor,total_floor_num,rent_price,room_num,area,year,year_renovation,floors_from_top,top_floor
count,1610.0,1609.0,1611.0,1609.0,1611.0,1611.0,50.0,1609.0,1611.0
mean,3.477019,6.204475,464.403476,1.992542,60.845481,1991.458101,2014.68,2.725295,0.196151
std,2.31643,3.264246,219.548608,0.926366,280.952099,73.807505,4.537553,2.656376,0.397208
min,0.0,1.0,15.0,1.0,1.0,0.0,2001.0,0.0,0.0
25%,2.0,4.0,329.0,1.0,35.235,1980.0,2014.0,1.0,0.0
50%,3.0,5.0,430.0,2.0,49.0,2002.0,2016.0,2.0,0.0
75%,5.0,9.0,550.0,2.0,63.0,2011.0,2018.0,4.0,0.0
max,20.0,29.0,1480.0,18.0,8000.0,2019.0,2019.0,24.0,1.0


In [38]:
# Peek at the first five rows (all columns, including the non-numeric ones).
df.head(n=5)

Unnamed: 0,Adresas,floor,total_floor_num,Buto numeris:,rent_price,room_num,year_full,Namo numeris:,Nuoroda,Pastato tipas:,...,area,Redaguotas,Įdėtas,Įrengimas:,Šildymas:,year,year_renovation,floors_from_top,top_floor,district
0,https://www.aruodas.lt/butu-nuoma-vilniuje-pas...,3.0,9.0,,450,2.0,2007,,,Mūrinis,...,64.0,,,Įrengtas ...,Centrinis kolektorinis,2007,,6.0,0,pasilaiciuose
1,https://www.aruodas.lt/butu-nuoma-vilniuje-zve...,4.0,4.0,,699,2.0,2019,10.0,,Mūrinis,...,41.47,,,,"Centrinis, elektra",2019,,0.0,1,zveryne
2,https://www.aruodas.lt/butu-nuoma-vilniuje-zve...,2.0,3.0,,400,2.0,1969,,,Mūrinis,...,41.84,,,Įrengtas ...,Centrinis,1969,,1.0,0,zveryne
3,https://www.aruodas.lt/butu-nuoma-vilniuje-laz...,2.0,5.0,,390,1.0,2008,,,Mūrinis,...,36.0,,,Įrengtas ...,Centrinis kolektorinis,2008,,3.0,0,lazdyneliuose
4,https://www.aruodas.lt/butu-nuoma-vilniuje-laz...,2.0,5.0,,700,3.0,1998,,,Mūrinis,...,119.0,,,Įrengtas ...,Centrinis kolektorinis,1998,,3.0,0,lazdynuose


## encoder

In [39]:
# le = preprocessing.LabelEncoder()
# df["district"].fillna("None", inplace = True)
# df["district"] = le.fit_transform(df["district"])

In [40]:
# http://www.insightsbot.com/python-one-hot-encoding-with-scikit-learn/
# One-hot encode `district`; rows with a missing district get the explicit "None" category.
# The filled column is assigned back rather than calling `fillna(..., inplace=True)`
# on a column selection: that chained in-place form is deprecated in recent pandas
# and does not modify `df` once copy-on-write is enabled.
df["district"] = df["district"].fillna("None")
district_ohe = OneHotEncoder()

# The encoder is fed a plain (n_rows, 1) array and its result is densified so it
# can be wrapped in a DataFrame.
X = district_ohe.fit_transform(df["district"].to_numpy().reshape(-1, 1)).toarray()
# Column `district_<i>` is the indicator for the i-th category of `district_ohe`.
# Reusing df's index guarantees the column-wise concat below aligns row by row.
dfOneHot = pd.DataFrame(
    X,
    columns=[f"district_{i}" for i in range(X.shape[1])],
    index=df.index,
)
# Drop indicator columns left by a previous run of this cell so that re-running
# it does not append duplicate `district_*` columns.
df = pd.concat([df.drop(columns=dfOneHot.columns, errors="ignore"), dfOneHot], axis=1)

## Pipeline and model

In [41]:
# Two-step modelling pipeline: impute NaN values first (no strategy is passed,
# so SimpleImputer's default applies), then fit a seeded random forest regressor.
nan_imputer = SimpleImputer(missing_values=np.nan)
forest_regressor = RandomForestRegressor(random_state=42)

Pipeline_rfr = Pipeline(
    steps=[
        ("impute", nan_imputer),
        ("rfr", forest_regressor),
    ]
)

## train_test_split

In [42]:
# Model only on the float64 / int64 columns of `df`; text columns are left out.
df_train = df.select_dtypes(include=['float64', 'int64'])

# `rent_price` is the regression target; every other numeric column is a feature.
df_target = df_train["rent_price"]
df_features = df_train.drop(columns=["rent_price"])

# Seeded split; the train/test proportion is left at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_target,
    random_state=42,
)

In [43]:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Search space for the random forest inside `Pipeline_rfr`. Every key carries the
# `rfr__` prefix so the parameter is routed to the pipeline step named "rfr".

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces the former 'auto' option: 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, and for RandomForestRegressor it meant
# "use all features", which 1.0 expresses on old and new versions alike.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'rfr__n_estimators': n_estimators,
               'rfr__max_features': max_features,
               'rfr__max_depth': max_depth,
               'rfr__min_samples_split': min_samples_split,
               'rfr__min_samples_leaf': min_samples_leaf,
               'rfr__bootstrap': bootstrap}

# 100 random candidates, each scored with 5-fold CV on negative MSE.
# random_state pins which candidates are sampled so the search is reproducible,
# matching the seed used for the forest and the train/test split.
rf_random = RandomizedSearchCV(
    Pipeline_rfr,
    param_distributions=random_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    n_iter=100,
    random_state=42,
)

In [12]:
# Run the randomized search on the training split and print the best pipeline found.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the search
# did not complete in that run; later cells that use `rf_random` rely on a fit
# from another execution — confirm before trusting their saved outputs.
fitted_search = rf_random.fit(X_train, y_train)
print(fitted_search.best_estimator_)

KeyboardInterrupt: 

In [10]:
# Root-mean-squared error of the tuned model on the held-out test split.
# The label is spelled "RMSE" (it was misspelled "RSME"); int() truncates the
# value to a whole number for display.
test_mse = mean_squared_error(y_test, rf_random.predict(X_test))
print("RMSE:", int(test_mse ** 0.5))

RSME: 98


In [24]:
# Root-mean-squared error of the tuned model on the training split, for
# comparison with the test-split figure.
# The label is spelled "RMSE" (it was misspelled "RSME"); int() truncates the
# value to a whole number for display.
train_mse = mean_squared_error(y_train, rf_random.predict(X_train))
print("RMSE:", int(train_mse ** 0.5))

RSME: 40


## Gradient Boosting regression

In [12]:
# Pipeline_gbrt = Pipeline(steps = [
#     ("impute", SimpleImputer(missing_values=np.nan)), 
#     ("gbrt", GradientBoostingRegressor(random_state=42))
# ])

In [13]:
# Pipeline_gbrt = Pipeline(steps = [
#     ("impute", SimpleImputer(missing_values=np.nan)), 
#     ("gbrt", GradientBoostingRegressor(random_state=42))
# ])

# params = {}

# gbrt_random = RandomizedSearchCV(Pipeline_gbrt,param_distributions = params, random_state=42)
# gbrt_random.fit(X_train, y_train)
# print("RSME:", int((mean_squared_error(y_test, gbrt_random.predict(X_test)))**0.5))

In [14]:
# print("RSME:", int((mean_squared_error(y_train, gbrt_random.predict(X_train)))**0.5))

## export

In [1]:
# Reload the previously saved search object from disk instead of re-running the search.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that you produced yourself or otherwise trust.
import joblib
# The dump call that wrote the file is left commented out (presumably so that
# re-running the notebook does not overwrite the saved model — TODO confirm).
# joblib.dump(rf_random, 'my_model_excluded_senamiestyje.pkl', compress=9)
model_clone = joblib.load('my_model_excluded_senamiestyje.pkl')

## feature_importances

In [49]:
# Pull the fitted random forest out of the reloaded search's best pipeline,
# then read its per-feature importances.
model = model_clone.best_estimator_.named_steps['rfr']
feature_importances = model.feature_importances_
# Feature names are taken from the current session's training frame.
# NOTE(review): this assumes the reloaded model was trained on the same columns,
# in the same order, as `X_train` — verify.
feature_names = X_train.columns

In [51]:
# Print each feature name followed by its importance, one pair per line.
# NOTE(review): zip stops at the shorter input, so if the model's feature count
# differs from the number of columns in `X_train` the extra entries are dropped
# silently — worth checking that both lengths match.
for name, importance in zip(feature_names, feature_importances):
    print(name, importance)

floor 0.04407009912029021
total_floor_num 0.05128753840901065
room_num 0.1638635671258875
area 0.33280292422440966
year 0.18706144219131252
year_renovation 0.010902497833878859
floors_from_top 0.03188398650091704
top_floor 0.00895726740357749
district_0 0.0027055679719479326
district_1 0.007170601433815839
district_2 5.6685242327655064e-05
district_3 0.0029841185968190334
district_4 0.0015683781658954001
district_5 0.00016986474683025618
district_6 0.00388717011105483
district_7 0.00036363094243597766
district_8 0.003013032746080555
district_9 0.0006127036022129671
district_10 0.00048405376347218984
district_11 0.0011319803871552036
district_12 0.0035630733538246184
district_13 0.0002339238046712222
district_14 0.002483066815783668
district_15 0.0008092792148871994
district_16 0.0031007947562752343
district_17 0.0008529865241787478
district_18 0.03542451866143863
district_19 0.002440085318951727
district_20 0.004267789517637558
district_21 0.008248974704125688
district_22 0.00107405488

In [58]:
# List each encoded district category next to its position, i.e. the <i> used in
# the `district_<i>` column names.
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; prefer get_feature_names_out() and fall back only on older versions.
if hasattr(district_ohe, "get_feature_names_out"):
    district_names = district_ohe.get_feature_names_out()
else:
    district_names = district_ohe.get_feature_names()

for i, district in enumerate(district_names):
    print(district, i)

x0_None 0
x0_antakalnyje 1
x0_aukstuosiuose-paneriuose 2
x0_avizieniuose 3
x0_bajoruose 4
x0_balsiuose 5
x0_baltupiuose 6
x0_burbiskes 7
x0_fabijoniskese 8
x0_filaretuose 9
x0_grigiskese 10
x0_jeruzaleje 11
x0_justiniskese 12
x0_kalnenuose 13
x0_karoliniskese 14
x0_lazdyneliuose 15
x0_lazdynuose 16
x0_markuciuose 17
x0_naujamiestyje 18
x0_naujininkuose 19
x0_naujojoje-vilnioje 20
x0_pasilaiciuose 21
x0_pavilnyje 22
x0_pilaiteje 23
x0_santariskese 24
x0_seskineje 25
x0_siaures-miestelyje 26
x0_snipiskese 27
x0_tarandeje 28
x0_uzupyje 29
x0_valakampiuose 30
x0_vilkpedeje 31
x0_virsuliskese 32
x0_visoriuose 33
x0_zemuosiuose-paneriuose 34
x0_zirmunuose 35
x0_zveryne 36
