In [1]:
import numpy as np
import pandas as pd
from sklearn import pipeline as sk_pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the rental-listings table from the CSV that sits next to the notebook,
# then display summary statistics for its numeric columns.
DATA_PATH = 'output.csv'
df = pd.read_csv(DATA_PATH)
df.describe()

Unnamed: 0,floor,total_floor_num,rent_price,room_num,area,year,year_renovation,floors_from_top,top_floor
count,2026.0,2025.0,2027.0,2025.0,2027.0,2027.0,76.0,2025.0,2027.0
mean,3.335143,5.78963,514.51258,2.034568,60.68594,1986.372472,2013.105263,2.45284,0.226443
std,2.217933,3.279803,248.492365,0.900587,250.692444,69.370985,7.808677,2.586999,0.418632
min,0.0,1.0,15.0,1.0,1.0,0.0,1965.0,0.0,0.0
25%,2.0,4.0,350.0,1.0,37.0,1972.5,2010.0,1.0,0.0
50%,3.0,5.0,460.0,2.0,50.0,1997.0,2016.0,2.0,0.0
75%,4.0,7.0,650.0,3.0,65.0,2010.0,2018.0,3.0,0.0
max,20.0,29.0,1480.0,18.0,8000.0,2019.0,2019.0,24.0,1.0


In [3]:
# Peek at the first five rows to see the raw (non-numeric) columns as well.
df.head(n=5)

Unnamed: 0,Adresas,floor,total_floor_num,Buto numeris:,rent_price,room_num,year_full,Namo numeris:,Nuoroda,Pastato tipas:,...,area,Redaguotas,Įdėtas,Įrengimas:,Šildymas:,year,year_renovation,floors_from_top,top_floor,district
0,https://www.aruodas.lt/butu-nuoma-vilniuje-pas...,3.0,9.0,,450,2.0,2007,,,Mūrinis,...,64.0,,,Įrengtas ...,Centrinis kolektorinis,2007,,6.0,0,pasilaiciuose
1,https://www.aruodas.lt/butu-nuoma-vilniuje-zve...,4.0,4.0,,699,2.0,2019,10.0,,Mūrinis,...,41.47,,,,"Centrinis, elektra",2019,,0.0,1,zveryne
2,https://www.aruodas.lt/butu-nuoma-vilniuje-sen...,3.0,3.0,,800,3.0,1940,,,Mūrinis,...,71.0,,,Įrengtas ...,Dujinis,1940,,0.0,1,senamiestyje
3,https://www.aruodas.lt/butu-nuoma-vilniuje-zve...,2.0,3.0,,400,2.0,1969,,,Mūrinis,...,41.84,,,Įrengtas ...,Centrinis,1969,,1.0,0,zveryne
4,https://www.aruodas.lt/butu-nuoma-vilniuje-laz...,2.0,5.0,,390,1.0,2008,,,Mūrinis,...,36.0,,,Įrengtas ...,Centrinis kolektorinis,2008,,3.0,0,lazdyneliuose


## encoder

In [4]:
# Disabled alternative: integer label-encoding of the `district` column.
# Nothing in this cell executes; the following cell one-hot encodes the column instead.
# NOTE(review): `preprocessing` is not among the names imported in the first cell, so
# re-enabling this would also need `from sklearn import preprocessing`.
# le = preprocessing.LabelEncoder()
# df["district"].fillna("None", inplace = True)
# df["district"] = le.fit_transform(df["district"])

In [5]:
# http://www.insightsbot.com/python-one-hot-encoding-with-scikit-learn/
# One-hot encode the categorical `district` column and append the indicator
# columns (`district_0`, `district_1`, ...) to `df`.

# Missing districts become their own "None" category. The result is assigned back
# to the column: calling `fillna(..., inplace=True)` on `df["district"]` is chained
# assignment, which pandas 2.x deprecates and which leaves `df` unchanged under
# Copy-on-Write.
df["district"] = df["district"].fillna("None")
district_ohe = OneHotEncoder()

# The encoder expects a 2-D input, hence the reshape to a single column;
# `.toarray()` turns the sparse result into a dense array.
X = district_ohe.fit_transform(df["district"].values.reshape(-1, 1)).toarray()
dfOneHot = pd.DataFrame(
    X,
    columns=["district_" + str(i) for i in range(X.shape[1])],
    index=df.index,  # align on df's own index so concat cannot introduce NaN rows
)

# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not append duplicate `district_*` columns.
df = pd.concat([df.drop(columns=dfOneHot.columns, errors="ignore"), dfOneHot], axis=1)

## pipeline and model

In [6]:
# Model pipeline: impute missing values, then fit a random-forest regressor.
#
# The class is reached through the module alias (`sk_pipeline.Pipeline`) instead of
# the bare name `Pipeline`, because this assignment rebinds `Pipeline` to the pipeline
# *instance*. With the bare name, running this cell a second time fails with
# "TypeError: 'Pipeline' object is not callable". The instance deliberately keeps the
# name `Pipeline`, since the later search and baseline cells refer to it by that name.
Pipeline = sk_pipeline.Pipeline(steps = [
    ("impute", SimpleImputer(missing_values=np.nan)),  # NaNs replaced using the imputer's default strategy
    ("rfr", RandomForestRegressor(random_state=42))    # fixed seed for reproducible forests
])

## train_test_split

In [7]:
# Restrict the frame to float64/int64 columns; every other dtype is left out of the model.
df_train = df.select_dtypes(include=['float64','int64'])

# The rent price is the regression target; all remaining numeric columns are features.
df_target = df_train["rent_price"]
df_features = df_train.drop("rent_price", axis=1)

# Seeded split (default test fraction) so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_target,
    random_state=42,
)

In [8]:

# Search space for the randomized hyper-parameter search over the random forest.
# Every key carries the `rfr__` prefix so the value is routed to the pipeline
# step named "rfr".

# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor; 'auto' itself
# was deprecated in scikit-learn 1.1 and removed in 1.3, so it is spelled out here.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'rfr__n_estimators': n_estimators,
               'rfr__max_features': max_features,
               'rfr__max_depth': max_depth,
               'rfr__min_samples_split': min_samples_split,
               'rfr__min_samples_leaf': min_samples_leaf,
               'rfr__bootstrap': bootstrap}

In [12]:
# https://www.programcreek.com/python/example/91146/sklearn.model_selection.RandomizedSearchCV
rf_random = RandomizedSearchCV(Pipeline, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('impute',
                                              SimpleImputer(add_indicator=False,
                                                            copy=True,
                                                            fill_value=None,
                                                            missing_values=nan,
                                                            strategy='mean',
                                                            verbose=0)),
                                             ('rfr',
                                              RandomForestRegressor(bootstrap=True,
                                                                    criterion='mse',
                                                                    max_depth=None,
                                                                    ma

In [14]:
# Root-mean-squared error of the tuned search (`rf_random`) on the held-out test
# split. The square root of the MSE is truncated to a whole number by `int`.
# The label is spelled RMSE (it previously read "RSME").
print("RMSE:", int((mean_squared_error(y_test, rf_random.predict(X_test)))**0.5))

RSME: 113


In [None]:
# Baseline for comparison: fit the untuned pipeline (random forest with only its
# seed set) on the same training split and report its test RMSE.
# `Pipeline` is the pipeline instance built earlier in this notebook, and `pipe`
# holds whatever its `fit` call returns.
pipe = Pipeline.fit(X_train, y_train)

# Same metric as for the tuned search: square root of the test MSE, truncated by `int`.
# The label is spelled RMSE (it previously read "RSME").
print("RMSE:", int((mean_squared_error(y_test, pipe.predict(X_test)))**0.5))