In [368]:
import pandas as pd
import keras
import tensorflow as tf
from keras import layers
from sklearn.model_selection import train_test_split
import time
import plotly.graph_objects as go 
import plotly.express as px
import keras_tuner
from keras import optimizers
from keras.callbacks import EarlyStopping
import numpy as np

df_list = pd.read_csv('df_with_dummies.csv', index_col=0)

df_compare = pd.DataFrame(columns=['Nazwa', 'Czas', 'F. Straty', 'RMSE'])

In [369]:
df_list.shape

(26915, 429)

In [370]:
df_list.head(5)

Unnamed: 0,accommodates,bedrooms,beds,price,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,...,property_type_serviced apartment,property_type_tent,property_type_tower,property_type_townhouse,property_type_vacation home,property_type_villa,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,2,1.0,1.0,299.0,1,21,51,81,356,9,...,0,0,0,0,0,0,0,0,1,0
4,4,2.0,2.0,275.0,1,0,0,0,179,3,...,0,0,0,0,0,0,1,0,0,0
6,2,1.0,1.0,308.0,1,15,42,72,217,42,...,0,0,0,0,0,0,0,1,0,0
7,2,1.0,1.0,68.0,1,2,4,7,160,550,...,0,0,0,0,0,0,0,0,1,0
8,2,1.0,2.0,55.0,1,0,0,0,132,207,...,0,0,0,0,0,0,0,0,1,0


In [371]:
df_list_Y = df_list['price']

In [372]:
df_list = df_list.drop(columns=['price'])
df_list = df_list.select_dtypes(['number'])

In [373]:
X_train, X_test, y_train, y_test = train_test_split(df_list, df_list_Y, test_size=0.3, random_state=42)

In [374]:
val_amount = int(df_list.shape[0]*0.1)
val_amount

2691

In [375]:
X_train.shape

(18840, 428)

In [376]:
#df_shape
val_X = X_train[0:val_amount]
val_y = y_train.iloc[0:val_amount]
X_train_part = X_train[val_amount:-1]
y_train_part = y_train.iloc[val_amount:-1]

In [377]:
from numpy import arange
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from math import sqrt

model = Lasso(alpha=0.01)

start_time = time.time()
results = model.fit(X_train, y_train)
how_long = time.time() - start_time

y_test_preds = model.predict(X_test)
mse = mean_squared_error(y_test, y_test_preds)

rmse = sqrt(mse)
result = [mse, rmse]
print(f'RMSE {rmse}')

RMSE 80.09332196918704


In [378]:
list_with_results = ['Lasso', how_long, result[0], result[1]]
df_compare = df_compare.append(pd.DataFrame([list_with_results], columns=['Nazwa', 'Czas', 'F. Straty', 'RMSE']), ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [379]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

forest = RandomForestRegressor(n_estimators=100, 
                               criterion='squared_error', 
                               random_state=42, 
                               n_jobs=-1,
                               max_depth=50)

start_time = time.time()                       
forest.fit(X_train, y_train)
how_long = time.time() - start_time

y_test_preds = forest.predict(X_test)
mse = mean_squared_error(y_test, y_test_preds)

rmse = sqrt(mse)
result = [mse, rmse]

print(f'RMSE {rmse}')

RMSE 79.61764044480383


In [400]:
sort = forest.feature_importances_.argsort()
fig = px.bar(y=df_list.columns[sort], x=forest.feature_importances_[sort], height=700, title='Wykres miary ważności atrybutów')
fig.update_layout(xaxis_title='wartość',yaxis_title='atrybuty')
fig.show()

In [381]:
list_with_results = ['Las losowy', how_long, result[0], result[1]]
df_compare = df_compare.append(pd.DataFrame([list_with_results], columns=['Nazwa', 'Czas', 'F. Straty', 'RMSE']), ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [382]:
callback = EarlyStopping(monitor='val_loss', patience= 3)

In [383]:
def build_model(hp):
    model = keras.Sequential()
    for i in range(hp.Int('num_layers', 2,6)):
        model.add(layers.Dense(units=hp.Int('units_' + str(i),
                                            min_value=32,
                                            max_value=512,
                                            step=32),
                               activation='relu'))
        if hp.Boolean('dropout_' + str(i)):
            model.add(layers.Dropout(rate=0.25))
    model.add(layers.Dense(1))
    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    model.compile(
        optimizer=optimizers.RMSprop(learning_rate=learning_rate),
        loss="mse",
        metrics=[keras.metrics.RootMeanSquaredError()]
    )
    return model

In [384]:
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective="val_loss",
    max_trials=10,
    executions_per_trial=2,
    directory="my_dir",
    project_name="WUM",
)

tuner.search_space_summary()

INFO:tensorflow:Reloading Oracle from existing project my_dir\WUM\oracle.json
INFO:tensorflow:Reloading Tuner from my_dir\WUM\tuner0.json
Search space summary
Default search space size: 14
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 6, 'step': 1, 'sampling': None}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
dropout_0 (Boolean)
{'default': False, 'conditions': []}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
dropout_1 (Boolean)
{'default': False, 'conditions': []}
lr (Float)
{'default': 0.0001, 'conditions': [], 'min_value': 0.0001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': None}
dropout_2 (Boolean)
{'default': False, 'conditions': []}
units_3 (Int)
{'default': None, 'conditions': [], 'min_valu

In [385]:
df_list.shape[1]

428

In [386]:
models = tuner.get_best_models(num_models=1)
best_model = models[0]
best_model.build(input_shape=(None, df_list.shape[1]))



In [387]:
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 224)               96096     
                                                                 
 dense_1 (Dense)             (None, 64)                14400     
                                                                 
 dense_2 (Dense)             (None, 96)                6240      
                                                                 
 dense_3 (Dense)             (None, 1)                 97        
                                                                 
Total params: 116,833
Trainable params: 116,833
Non-trainable params: 0
_________________________________________________________________


In [388]:
model = keras.models.clone_model(best_model, input_tensors=None, clone_function=None)
model.compile(
        optimizer='rmsprop',
        loss="mse",
        metrics=[keras.metrics.RootMeanSquaredError()]
    )
history = model.fit(X_train_part, y_train_part, epochs = 20,  validation_data= (val_X, val_y), callbacks=[callback])
history = model.history

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [389]:
model = keras.models.clone_model(best_model, input_tensors=None, clone_function=None)
model.compile(
        optimizer='rmsprop',
        loss="mse",
        metrics=[keras.metrics.RootMeanSquaredError()]
    )


start_time = time.time()  
history = model.fit(X_train, y_train, epochs = len(history.history['val_loss']))
how_long = time.time() - start_time

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [390]:
result = model.evaluate(X_test,y_test)



In [391]:
list_with_results = ['Sieć neuronowa', how_long, result[0], result[1]]
df_compare = df_compare.append(pd.DataFrame([list_with_results], columns=['Nazwa', 'Czas', 'F. Straty', 'RMSE']), ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [392]:
predict = model.predict(X_test)



In [393]:
predict = [item for sublist in predict for item in sublist]
df_result = pd.DataFrame({'predict':predict, 'y':y_test.tolist()})
df_result = df_result.sort_values(by=['y'])

In [394]:
fig = go.Figure()

fig.add_scatter(y = df_result['predict'], name='Wartość predykcji')
fig.add_scatter(y = df_result['y'], name='Prawdziwa wartość')
fig.update_layout(
    title='Wykres porównujący prawdziwe wyniki z wartością predykcji',
    xaxis_title='Obiekt',
    yaxis_title='Wartość [$]'
)
fig.show()

In [395]:
start_time = time.time()  
y_test_mean = y_test.mean()
how_long = time.time() - start_time

y_list_mean = np.full(shape=y_test.shape[0], fill_value=y_test_mean, dtype=np.float32)
mse = mean_squared_error(y_test, y_list_mean)

rmse = sqrt(mse)
result = [mse, rmse]

list_with_results = ['Średnia', how_long, result[0], result[1]]
df_compare = df_compare.append(pd.DataFrame([list_with_results], columns=['Nazwa', 'Czas', 'F. Straty', 'RMSE']), ignore_index=True)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [396]:
fig = px.bar(df_compare, x=df_compare["Nazwa"],y=df_compare["Czas"], title='Wykres zależności czasu od modelu', labels={'Czas':'Czas [s]','Nazwa':'Model'})
fig.show()

In [402]:
fig = px.bar(df_compare, x=df_compare["Nazwa"],y=df_compare["RMSE"], height=700, title='Wykres zależności RMSE od modelu', labels={'y':'Wartość RMSE [%]','Nazwa':'Model'})
fig.show()

In [398]:
df_compare

Unnamed: 0,Nazwa,Czas,F. Straty,RMSE
0,Lasso,5.349397,6414.940224,80.093322
1,Las losowy,37.558394,6338.96867,79.61764
2,Sieć neuronowa,36.048809,6186.962402,78.657242
3,Średnia,0.0,16977.47364,130.297635
