In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import uniform, truncnorm, randint
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('output.csv')
df.describe()

Unnamed: 0,floor,total_floor_num,rent_price,room_num,area,year,year_renovation,top_floor,oldtown
count,2302.0,2302.0,2302.0,2302.0,2302.0,2302.0,80.0,2302.0,2302.0
mean,3.377932,5.809296,523.900956,2.02563,53.301108,1990.491746,2013.175,0.216334,0.205039
std,2.249265,3.226175,235.495612,0.789294,21.369041,25.717765,7.577072,0.411834,0.403818
min,0.0,1.0,155.0,1.0,16.0,1903.0,1965.0,0.0,0.0
25%,2.0,4.0,360.0,1.0,38.0,1975.0,2011.75,0.0,0.0
50%,3.0,5.0,460.0,2.0,50.0,2000.5,2016.0,0.0,0.0
75%,4.0,7.0,650.0,2.0,65.0,2011.0,2017.25,0.0,0.0
max,20.0,29.0,1480.0,6.0,150.0,2019.0,2019.0,1.0,1.0


In [3]:
df.head()

Unnamed: 0,Adresas,floor,total_floor_num,rent_price,room_num,year_full,house_type,area,heating,year,year_renovation,top_floor,oldtown,district
0,https://www.aruodas.lt/butu-nuoma-vilniuje-sen...,4.0,6.0,890,3.0,2017,Monolitinis,76.0,Centrinis kolektorinis,2017,,0,1.0,senamiestyje
1,https://www.aruodas.lt/butu-nuoma-vilniuje-laz...,2.0,4.0,440,2.0,2008,Mūrinis,68.0,Centrinis kolektorinis,2008,,0,0.0,lazdyneliuose
2,https://www.aruodas.lt/butu-nuoma-vilniuje-sen...,2.0,3.0,811,3.0,1940,Mūrinis,64.0,Dujinis,1940,,0,1.0,senamiestyje
3,https://www.aruodas.lt/butu-nuoma-vilniuje-jus...,4.0,5.0,310,2.0,1985,Blokinis,50.0,Centrinis,1985,,0,0.0,justiniskese
4,https://www.aruodas.lt/butu-nuoma-vilniuje-nau...,4.0,9.0,380,2.0,2017,Mūrinis,34.0,Centrinis kolektorinis,2017,,0,0.0,naujamiestyje


## encoder one-hot

In [4]:
# http://www.insightsbot.com/python-one-hot-encoding-with-scikit-learn/
df["district"].fillna("None", inplace = True)
district_ohe = OneHotEncoder()

X = district_ohe.fit_transform(df.district.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(X, columns = ["district_"+str(int(i)) for i in range(X.shape[1])])
df = pd.concat([df, dfOneHot], axis=1)

In [5]:
# df["heating"].fillna("None", inplace = True)
# Heating_ohe = OneHotEncoder()

# X = Heating_ohe.fit_transform(df.heating.values.reshape(-1,1)).toarray()
# dfOneHot = pd.DataFrame(X, columns = ["heating_"+str(int(i)) for i in range(X.shape[1])])
# df = pd.concat([df, dfOneHot], axis=1)

In [6]:
# df["house_type"].fillna("None", inplace = True)
# House_type_ohe = OneHotEncoder()

# X = House_type_ohe.fit_transform(df.house_type.values.reshape(-1,1)).toarray()
# dfOneHot = pd.DataFrame(X, columns = ["House_type_"+str(int(i)) for i in range(X.shape[1])])
# df = pd.concat([df, dfOneHot], axis=1)

## encoder OrdinalEncoder

In [7]:
# enc = OrdinalEncoder()
# features_to_encode = ["heating", "house_type", "district"]
# def encoderOrdinalEncoder(feature):
#     df[feature] = enc.fit_transform(df[feature].values.reshape(-1, 1))

# for feature in features_to_encode:
#     encoderOrdinalEncoder(feature)

# !!worse RMSE and MAE results compared with one-hot

## pipline and model

In [8]:
Pipeline_rfr = Pipeline(steps = [
    ("impute", SimpleImputer(missing_values=np.nan)), 
    ("rfr", RandomForestRegressor(random_state=42))
])

## train_test_split

In [9]:
df_train = df.select_dtypes(include=['float64','int64'])

df_features = df_train.drop(columns="rent_price")
df_target = df_train["rent_price"]

X_train, X_test, y_train, y_test = train_test_split(df_features, df_target, random_state=42)

In [10]:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'rfr__n_estimators': n_estimators,
               'rfr__max_features': max_features,
               'rfr__max_depth': max_depth,
               'rfr__min_samples_split': min_samples_split,
               'rfr__min_samples_leaf': min_samples_leaf,
               'rfr__bootstrap': bootstrap}


# max_features = ['auto', 'sqrt']
# random_grid = {'rfr__max_features': max_features}


rf_random = RandomizedSearchCV(Pipeline_rfr, param_distributions = random_grid, cv = 5, scoring="neg_mean_squared_error", n_iter=100)

In [11]:
rf_random.fit(X_train, y_train)
rf_random.best_estimator_

Pipeline(memory=None,
         steps=[('impute',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('rfr',
                 RandomForestRegressor(bootstrap=False, criterion='mse',
                                       max_depth=100, max_features='sqrt',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=5,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=800, n_jobs=None,
                                       oob_score=False, random_state=42,
                                       verbose=0, warm_start=False))],
         verbose=False)

In [12]:
print("RSME:", int((mean_squared_error(y_test, rf_random.predict(X_test)))**0.5))
print("MAE:", int((mean_absolute_error(y_test, rf_random.predict(X_test)))))

RSME: 88
MAE: 60


In [13]:
print("RSME:", int((mean_squared_error(y_train, rf_random.predict(X_train)))**0.5))
print("MAE:", int((mean_absolute_error(y_train, rf_random.predict(X_train)))))

RSME: 39
MAE: 25


In [14]:
feature_importances = rf_random.best_estimator_.named_steps['rfr'].feature_importances_
model = rf_random.best_estimator_.named_steps['rfr']
feature_names = X_train.columns

In [15]:
for importances, names in zip(feature_importances, feature_names):
    print(names,";", importances)

floor ; 0.035966042220481644
total_floor_num ; 0.05427207875595878
room_num ; 0.1718692017949321
area ; 0.311622626942823
year ; 0.16422311169062845
year_renovation ; 0.005336215928159226
top_floor ; 0.007978333407415217
oldtown ; 0.07436703715702726
district_0 ; 0.0009752415429052422
district_1 ; 0.003561312160195651
district_2 ; 0.00011643477209739218
district_3 ; 0.0010718198560734046
district_4 ; 0.001170281394150189
district_5 ; 0.00014891848744267482
district_6 ; 0.0031717894966386932
district_7 ; 4.8900153534790285e-05
district_8 ; 0.0029627723671588474
district_9 ; 0.00033269846281714944
district_10 ; 0.00010853925828884863
district_11 ; 0.0007172881979717932
district_12 ; 0.003538276836533443
district_13 ; 0.0001473479439387335
district_14 ; 0.001946633324513017
district_15 ; 0.0006582357037957257
district_16 ; 0.0021758281926523367
district_17 ; 0.0005284859668788197
district_18 ; 0.018639327506480077
district_19 ; 0.0023361252947159293
district_20 ; 0.0031639691160656423
dis

In [16]:
district_ohe.get_feature_names()

array(['x0_0', 'x0_antakalnyje', 'x0_aukstuosiuose-paneriuose',
       'x0_avizieniuose', 'x0_bajoruose', 'x0_balsiuose',
       'x0_baltupiuose', 'x0_burbiskes', 'x0_fabijoniskese',
       'x0_filaretuose', 'x0_grigiskese', 'x0_jeruzaleje',
       'x0_justiniskese', 'x0_kalnenuose', 'x0_karoliniskese',
       'x0_lazdyneliuose', 'x0_lazdynuose', 'x0_markuciuose',
       'x0_naujamiestyje', 'x0_naujininkuose', 'x0_naujojoje-vilnioje',
       'x0_pasilaiciuose', 'x0_pavilnyje', 'x0_pilaiteje',
       'x0_santariskese', 'x0_sauletekyje', 'x0_senamiestyje',
       'x0_seskineje', 'x0_siaures-miestelyje', 'x0_snipiskese',
       'x0_tarandeje', 'x0_traku', 'x0_uzupyje', 'x0_valakampiuose',
       'x0_verkiuose', 'x0_vilkpedeje', 'x0_virsuliskese',
       'x0_visoriuose', 'x0_zemuosiuose-paneriuose', 'x0_zirmunuose',
       'x0_zveryne'], dtype=object)

## Gradient Boosting regression

In [32]:
Pipeline_gbrt = Pipeline(steps = [
    ("impute", SimpleImputer(missing_values=np.nan)), 
    ("gbrt", GradientBoostingRegressor(random_state=42))
])

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
learn_rates = [0.02, 0.05, 0.06]
max_depths = [int(x) for x in np.linspace(10, 150, num = 11)]
max_depths.append(None)
min_samples_leaf = [5,10,15,25,50]
min_samples_split = [5,10,25,50]

params = {'gbrt__n_estimators': n_estimators,
              'gbrt__learning_rate': learn_rates,
              'gbrt__max_depth': max_depths,
              'gbrt__min_samples_leaf': min_samples_leaf,
              'gbrt__min_samples_split': min_samples_split}

gbrt_random = RandomizedSearchCV(Pipeline_gbrt,param_distributions = params, random_state=42)
gbrt_random.fit(X_train, y_train)
gbrt_random.best_estimator_

Pipeline(memory=None,
         steps=[('impute',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('gbrt',
                 GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                           init=None, learning_rate=0.02,
                                           loss='ls', max_depth=66,
                                           max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=10,
                                           min_samples_split=25,
                                           min_weight_fraction_leaf=0.0,
                                           n_e

In [33]:
print("RSME:", int((mean_squared_error(y_test, gbrt_random.predict(X_test)))**0.5))
print("MAE:", int((mean_absolute_error(y_test, gbrt_random.predict(X_test)))))

RSME: 97
MAE: 68


## export

In [24]:
import joblib
joblib.dump(rf_random, 'RFR 88.pkl', compress=9)
# model = joblib.load('RFR 88.pkl')
# X_train

['RFR 88.pkl']

In [25]:
# predicted_prices = model.predict(df_features)

# my_submission = pd.DataFrame({'area': df_features.area, 'predicted_prices': predicted_prices})
# my_submission.to_csv('predicted_prices.csv', index=False)

In [None]:
# df_features.to_csv('full.csv', index=False)
# df_target.to_csv('full target.csv', index=False)
# df.to_csv('df.csv', index=False)