# Main Notebook

In [80]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import matplotlib as mtp
from time import sleep
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from keras.wrappers.scikit_learn import KerasRegressor
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import finn_data
import tensorflow as tf
from tensorflow import math
import keras
from keras import layers
from keras.models import Sequential
from keras.activations import relu, elu
from keras.layers import Dense, Dropout
from talos.model import early_stopper
from talos.utils.best_model import activate_model
from talos import Evaluate
import talos
from talos import Reporting
from talos import Deploy
from talos import Restore
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import finn_data
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import sys
sys.path.append('..')

# Record the moment this run started, expressed in the Europe/Oslo time zone,
# and echo it into the cell output so a saved notebook shows when it was run.
time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook initialized execution at {time}.')
#import xgboost as xgb

Notebook initialized execution at 04.15.2020_12.36.55.


## General Methods

In [2]:
def memory_optimization(dfs):
    """Release the objects held in ``dfs`` and run a garbage-collection pass.

    The earlier version executed ``del df`` on the loop variable, which only
    unbinds that local name and frees nothing. This version empties the
    container itself, so the references it holds are really dropped.

    Note that any *other* name still bound to the same objects (for example
    the caller's own ``train_df`` variable) keeps them alive; the caller has
    to ``del`` those names as well for the memory to be reclaimed.

    Args:
        dfs: container of objects to release. When it offers a ``clear()``
            method (list, dict, set, ...) it is emptied in place; other
            iterables are left untouched.

    Returns:
        None.
    """
    clear = getattr(dfs, 'clear', None)
    if callable(clear):
        clear()
    # Also collect reference cycles that plain reference counting cannot free.
    gc.collect()

## Prepare Data

In [110]:
import datasets

# Timestamp taken just before loading; not read again in the cells shown here.
start_time = datetime.now()

# A fresh MinMaxScaler is handed to the loader. The loader returns a scaler as
# its last value, and the name `scaler` is rebound to that returned object,
# which the later inverse-transform cells rely on.
scaler = MinMaxScaler()
train_x, train_y, validation_x, validation_y, test_x, test_y, scaler = datasets.load(
    '../input/hele_norge.csv', scaler)

cleaning removed 1.06 % of the original values


## Optimization Parameters

In [4]:
# Hyperparameter search space: each key maps to the list of candidate values
# for that setting. The dict is passed to talos.Scan as `params`.
parameters = {
    # activation function of each hidden layer
    'activation_1': ['relu', 'elu'],
    'activation_2': ['relu', 'elu'],
    'activation_3': ['relu', 'elu'],

    # training configuration
    'optimizer': ['Adam', 'RMSprop'],
    'loss-functions': ['mse'],

    # hidden-layer widths; a falsy width (None) for the third layer means
    # the model function leaves that layer out
    'neurons_HL1': [50, 100, 200, 400],
    'neurons_HL2': [40, 80, 160, 320],
    'neurons_HL3': [40, 80, 160, 320, None],

    # dropout rates
    'dropout1': [0.1, 0.2, 0.3],
    'dropout2': [0.1, 0.2, 0.3],

    # fit settings
    'batch_size': [100, 250, 500],
    'epochs': [400, 900],
}

In [5]:
def talolos(x_train, y_train, x_val, y_val, parameters):
    """Build, compile and fit one candidate network for a Talos scan.

    The network is a stack of Dense layers: two hidden layers, each followed
    by Dropout, an optional third hidden layer, and a single-unit output.

    Args:
        x_train: training features, a 2-D array whose second dimension is the
            number of input features.
        y_train: training targets.
        x_val: validation features.
        y_val: validation targets.
        parameters: dict with one concrete value per hyperparameter. Keys
            read here: 'neurons_HL1', 'neurons_HL2', 'neurons_HL3',
            'activation_1', 'activation_2', 'activation_3', 'dropout1',
            'dropout2', 'optimizer', 'loss-functions', 'batch_size', 'epochs'.

    Returns:
        Tuple ``(history, model)``: the object returned by ``model.fit`` and
        the fitted model.
    """
    model = Sequential()

    # Take the input width from the data passed in, not from the notebook
    # global `train_x`, so the function works on whatever the caller supplies.
    model.add(Dense(parameters['neurons_HL1'],
                    input_shape=(x_train.shape[1],),
                    activation=parameters['activation_1'],
                    use_bias=True))
    model.add(Dropout(parameters['dropout1']))

    model.add(Dense(parameters['neurons_HL2'],
                    activation=parameters['activation_2'],
                    use_bias=True))
    # The second dropout layer uses its own hyperparameter. It previously
    # reused 'dropout1', so 'dropout2' enlarged the search without any effect.
    model.add(Dropout(parameters['dropout2']))

    # A falsy width (None) means "no third hidden layer".
    if parameters['neurons_HL3']:
        model.add(Dense(parameters['neurons_HL3'],
                        activation=parameters['activation_3'],
                        use_bias=True))

    model.add(Dense(1, activation='relu'))

    model.compile(optimizer=parameters['optimizer'],
                  loss=parameters['loss-functions'],
                  metrics=['mse', 'mae'])

    # NOTE(review): with a named mode such as 'moderate', Talos' early_stopper
    # may derive its own patience and ignore `patience=25` -- confirm against
    # the installed Talos version.
    stopper = early_stopper(epochs=parameters['epochs'],
                            mode='moderate',
                            monitor='val_loss',
                            patience=25)

    history = model.fit(x_train, y_train,
                        batch_size=parameters['batch_size'],
                        epochs=parameters['epochs'],
                        verbose=0,
                        # Keras documents validation_data as a tuple.
                        validation_data=(x_val, y_val),
                        callbacks=[stopper])

    return history, model

## Specify model

In [6]:
# Run the Talos hyperparameter scan: `talolos` is called once per sampled
# parameter combination drawn from `parameters`, with the training and
# validation splits converted to NumPy arrays. The resulting scan object is
# kept in `t` for the evaluation cell further down.
# NOTE(review): round_limit=50 presumably caps the number of combinations
# tried (the progress bar counts to 50) -- confirm in the Talos docs.
t = talos.Scan(x=np.array(train_x),
               y=np.array(train_y),
               x_val=np.array(validation_x),
               y_val=np.array(validation_y),
               model=talolos,
               params=parameters,
               experiment_name='oloo',
               round_limit=50)

  0%|          | 0/50 [00:00<?, ?it/s]




 20%|██        | 10/50 [24:39<1:27:26, 131.17s/it]

KeyboardInterrupt: 

In [127]:
def inverse_transform(scaler, value, column=0):
    """Map scaled value(s) of one column back to the original scale.

    The scaler was fitted on several columns, so each value is placed in a
    zero-filled row of the right width, the whole matrix is passed through
    ``scaler.inverse_transform`` and only the requested column is returned.

    Args:
        scaler: fitted scaler exposing ``scale_`` (one entry per fitted
            column) and ``inverse_transform``.
        value: a single scaled number or an array-like of scaled numbers.
        column: position of the column within the scaler that ``value``
            belongs to. Defaults to 0, the column used before this parameter
            existed.

    Returns:
        1-D NumPy array with one unscaled value per input value; a scalar
        input yields an array of shape ``(1,)``.
    """
    values = np.atleast_1d(np.asarray(value, dtype=float)).ravel()
    mat = np.zeros((values.shape[0], scaler.scale_.shape[0]))
    mat[:, column] = values
    return scaler.inverse_transform(mat)[:, column]

## Evaluate


In [None]:
def evaluate(scan_model, test_x, test_y):
    """Cross-validate the best model of a Talos scan on the test split.

    Args:
        scan_model: completed ``talos.Scan`` object.
        test_x: test features (converted with ``np.array``).
        test_y: test targets (converted with ``np.array``).

    Returns:
        NumPy array with one row per value returned by Talos' evaluation,
        each passed through ``inverse_transform`` with the notebook-level
        ``scaler``.
    """
    eval_model = Evaluate(scan_model)
    results = eval_model.evaluate(
        np.array(test_x),
        np.array(test_y),
        task='continuous',
        folds=10,
        metric='loss',
        # 'loss' is minimised. Talos' default asc=False would pick the run
        # with the HIGHEST loss as the "best" model.
        asc=True,
    )
    # Relies on the global `scaler` produced by the data-loading cell.
    # NOTE(review): inverse_transform applies the scaler's full affine mapping;
    # if the values are error magnitudes, the offset term probably should not
    # be added -- confirm what the per-fold values represent.
    return np.array([inverse_transform(scaler, result) for result in results])


## Results

In [None]:
#t.data
# Evaluate the scan object `t` on the test split and print the values that
# `evaluate` returns (already passed through the inverse scaling).
results = evaluate(t, test_x, test_y)
print(results)

## Retrieve old parameters

In [None]:
# Open the CSV log of an earlier run with Talos' Reporting helper; its `.data`
# attribute is used below to pick the best parameter set.
res = Reporting('../input/2000_kjoring.csv')

In [None]:
# Pick the logged run with the lowest 'val_mae' and keep it as a one-column
# DataFrame whose index holds the column names of the results table.
best = (
    res.data
    .sort_values('val_mae', ascending=True)
    .iloc[0]
    .to_frame()
)


In [None]:
def extract_param(df, key):
    """Return the first value found in the row of ``df`` labelled ``key``.

    Args:
        df: DataFrame indexed by parameter name.
        key: index label of the row to read.

    Returns:
        The first element of that row's values.
    """
    row = df.loc[key]
    first_value = row.values[0]
    return first_value

In [None]:
# Rebind `parameters` to the concrete values of the selected run: one value
# per hyperparameter, read from `best` in the same key order as the search
# space.
parameters = {
    name: extract_param(best, name)
    for name in (
        'activation_1',
        'activation_2',
        'activation_3',
        'optimizer',
        'loss-functions',
        'neurons_HL1',
        'neurons_HL2',
        'neurons_HL3',
        'dropout1',
        'dropout2',
        'batch_size',
        'epochs',
    )
}
def best_model():
    """Build and compile the network described by the selected run ``best``.

    Takes no arguments so it can be used as a ``build_fn``. It reads the
    notebook globals ``best`` (hyperparameters of the chosen run) and
    ``train_x`` (for the input width).

    Returns:
        A compiled, unfitted ``Sequential`` model.
    """
    def _units(value):
        """Return a layer width as int, or None when the layer is disabled."""
        # A missing third layer can surface as None, as NaN or as the text
        # 'None', depending on how the results file was written and parsed.
        if value is None:
            return None
        if isinstance(value, str):
            if value.strip() in ('', 'None'):
                return None
        elif pd.isna(value):
            return None
        # Widths may come back as float or str ('160', 160.0); 0 also means
        # "no layer".
        return int(float(value)) or None

    # Only the settings needed to build the architecture are read here;
    # 'batch_size' and 'epochs' belong to fitting, not to building.
    needed = ('activation_1', 'activation_2', 'activation_3', 'optimizer',
              'loss-functions', 'neurons_HL1', 'neurons_HL2', 'neurons_HL3',
              'dropout1')
    parameters = {key: extract_param(best, key) for key in needed}

    model = Sequential()

    model.add(Dense(_units(parameters['neurons_HL1']),
                    input_shape=(train_x.shape[1],),
                    activation=parameters['activation_1'],
                    use_bias=True))
    model.add(Dropout(parameters['dropout1']))

    model.add(Dense(_units(parameters['neurons_HL2']),
                    activation=parameters['activation_2'],
                    use_bias=True))
    # NOTE(review): the second Dropout layer uses 'dropout1', not 'dropout2'.
    # Kept unchanged so the rebuilt network has the same architecture as
    # before; confirm which rate the stored results were produced with.
    model.add(Dropout(parameters['dropout1']))

    third_width = _units(parameters['neurons_HL3'])
    if third_width is not None:
        model.add(Dense(third_width,
                        activation=parameters['activation_3'],
                        use_bias=True))

    model.add(Dense(1, activation='relu'))

    model.compile(optimizer=parameters['optimizer'],
                  loss=parameters['loss-functions'],
                  metrics=['mse', 'mae'])

    return model

In [None]:
# Build the model once as a quick check that the retrieved parameters are
# usable; as the last expression of the cell, the returned model is displayed.
best_model()

## Train retrieved model

In [None]:
# Wrap the model builder in KerasRegressor and fit it on the training split.
# The batch size comes from the retrieved parameters.
# NOTE(review): epochs is fixed at 300 here rather than taken from
# parameters['epochs'] -- confirm this is intended.
my_model = KerasRegressor(build_fn=best_model,epochs=300, 
    batch_size=parameters['batch_size'], verbose=2)    
my_model.fit(train_x, train_y)

## Find highest contributing features

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of every input column for the fitted model, computed
# on the training split with a fixed random state.
perm = (
    PermutationImportance(my_model, random_state=1)
    .fit(train_x, train_y)
)

# One row per feature, labelled with the training frame's column names.
weight_df = eli5.explain_weights_df(
    perm,
    feature_names=train_x.columns.tolist(),
)

# Names of the ten features with the largest weight.
weight_df.nlargest(10, 'weight')['feature']

In [27]:
# Name of the feature at position 5 (the sixth entry) among the ten
# highest-weighted features.
weight_df.nlargest(10,['weight'])['feature'].iloc[5]

'boligtype_Rekkehus'

## Evaluate retrieved model

In [71]:
# Predict the (still scaled) target for the test split and hold the result in
# a single-column frame named 'pred'; the frame is shown as the cell output.
predictions = pd.DataFrame({'pred': my_model.predict(test_x)})
predictions

Unnamed: 0,pred
0,0.161780
1,0.122347
2,0.144134
3,0.173612
4,0.097343
...,...
2285,0.027951
2286,0.045389
2287,0.050174
2288,0.269568


In [72]:
# Display the true test targets as a frame, for visual comparison with the
# predictions shown in the previous cell.
pd.DataFrame(test_y)

Unnamed: 0,totalpris
13745,0.110419
5325,0.092896
5953,0.137189
13204,0.146693
854,0.062473
...,...
13959,0.025329
1779,0.059920
3727,0.045829
4276,0.320869


In [77]:
def evaluate_prediction(predictions, test_y):
    """Print how the predictions compare with a mean-value benchmark.

    For every test row the prediction error is set against the error of a
    naive benchmark that always predicts the mean of the test targets. The
    mean absolute error and the mean absolute percentage error are printed
    for both.

    Args:
        predictions: DataFrame with a 'pred' column holding the predictions
            for the test set. It is not modified.
        test_y: test-set targets in the same row order as ``predictions``:
            a Series, a single-column DataFrame or any 1-D array-like.

    Returns:
        None. The summary is printed.

    Raises:
        ValueError: if the number of targets differs from the number of
            predictions.
    """
    def _fmt(value):
        """Whole numbers for large errors, 4 significant digits below 1."""
        if np.isnan(value):
            return 'nan'
        # int() alone would print every error on scaled (0..1) data as 0.
        return str(int(value)) if abs(value) >= 1 else f'{value:.4g}'

    # Work on plain 1-D arrays: this ignores the targets' original index and
    # avoids index alignment turning the benchmark column into NaN when
    # test_y is passed as a DataFrame.
    pred = np.asarray(predictions['pred'], dtype=float).ravel()
    target = np.asarray(test_y, dtype=float).ravel()
    if pred.shape[0] != target.shape[0]:
        raise ValueError(
            f'predictions has {pred.shape[0]} rows but test_y has {target.shape[0]} values')

    benchmark = float(target.mean()) if target.size else float('nan')

    # A new frame is built, so the caller's `predictions` stays untouched.
    test_evaluation = pd.DataFrame({'pred': pred, 'benchmark': benchmark, 'target': target})
    test_evaluation['difference'] = test_evaluation['pred'] - test_evaluation['target']
    test_evaluation['bench difference'] = test_evaluation['benchmark'] - test_evaluation['target']
    test_evaluation['abs difference'] = test_evaluation['difference'].abs()
    test_evaluation['abs bench difference'] = test_evaluation['bench difference'].abs()

    # Rows with a zero target have no defined percentage error; they become
    # NaN and are skipped by mean() instead of turning the average into inf.
    safe_target = test_evaluation['target'].replace(0, np.nan)
    test_evaluation['difference %'] = (test_evaluation['pred'] / safe_target - 1) * 100
    test_evaluation['bench difference %'] = ((test_evaluation['benchmark'] / safe_target - 1) * 100).abs()

    mean = float(test_evaluation['abs difference'].mean())
    bench_mean = float(test_evaluation['abs bench difference'].mean())
    mean_perc = round(float(test_evaluation['difference %'].abs().mean()), 2)
    bench_mean_perc = round(float(test_evaluation['bench difference %'].mean()), 2)

    # Print the benchmark value that was actually used (the mean).
    print('Model evaluation compared to mean benchmark:', benchmark)
    print(f'| mean abs.  difference | our model: {_fmt(mean)}  benchmark: {_fmt(bench_mean)}')
    print(f'| mean abs % difference | our model: {mean_perc} %  benchmark: {bench_mean_perc} %')
    print()

    return


In [None]:
# Print the benchmark comparison for the test split; both inputs are wrapped
# in new DataFrame objects before being passed in.
evaluate_prediction(pd.DataFrame(predictions), pd.DataFrame(test_y))

Unnamed: 0,soverom,primaerrom
5771,0.3,0.000568
7485,0.4,0.000837
4001,0.3,0.000646
13377,0.3,0.000479
1059,0.3,0.000700
...,...,...
13261,0.2,0.000550
3296,0.2,0.000413
9952,0.3,0.000658
10915,0.2,0.000389


In [None]:
from matplotlib import pyplot

# Overlay the distribution of the true test targets with the distribution of
# the model's predictions for the same rows.
# The second positional argument of pd.Series is the index, so the earlier
# pd.Series(train_y, predictions['pred']) used the predictions as labels for
# the training targets instead of plotting the test targets.
pd_y = pd.Series(np.asarray(test_y, dtype=float).ravel())

pyplot.hist(pd_y, bins=50, rwidth=0.9, alpha=0.5,
            label='totalpris')
pyplot.hist(predictions['pred'], bins=50, rwidth=0.9, alpha=0.5,
            label='prediksjon')
# The labels above only become visible once a legend is drawn.
pyplot.legend()
pyplot.show()



In [None]:
# Convert predictions and true targets from the scaled range back to prices.
# Each list element is whatever inverse_transform returns for one value.
price_pred = [inverse_transform(scaler, prediction) for prediction in predictions['pred']]
# Iterate over the flattened values: looping over test_y directly would yield
# column labels rather than numbers if test_y is a DataFrame.
price_tot = [inverse_transform(scaler, price) for price in np.asarray(test_y).ravel()]

In [None]:
# Show the back-transformed predictions as a Series. The commented-out lines
# below are a disabled histogram comparing true and predicted prices.
pd.Series(price_pred)
#pyplot.hist(pd.Series(price_tot), bins=50,alpha=0.5,
#             label='totalpris')
#pyplot.hist(pd.Series(price_pred), bins=50, rwidth=0.9,alpha=0.5,
 #                    label='prediksjon')
#pyplot.show()