# Main Notebook

In [1]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import matplotlib as mtp
from time import sleep
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from keras.wrappers.scikit_learn import KerasRegressor
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import finn_data
import tensorflow as tf
from tensorflow import math
import keras
from keras import layers
from keras.models import Sequential
from keras.activations import relu, elu
from keras.layers import Dense, Dropout
from talos.model import early_stopper
from talos.utils.best_model import activate_model
from talos import Evaluate
import talos
from talos import Reporting
from talos import Deploy
from talos import Restore
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import finn_data
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import sys
sys.path.append('..')

time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook initialized execution at {time}.')
#import xgboost as xgb

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Notebook initialized execution at 04.15.2020_13.55.47.


## General Methods

In [2]:
def memory_optimization(dfs):
    for df in dfs:
        del df
    gc.collect()

## Prepare Data

In [3]:
import datasets

start_time = datetime.now()
scaler = MinMaxScaler()
train_x, train_y, validation_x, validation_y, test_x, test_y, scaler = datasets.load(f'../input/hele_norge.csv', scaler)

cleaning removed 1.06 % of the original values


# Optimization Parameters

In [4]:
parameters = {'activation_1':['relu', 'elu'],
     'activation_2':['relu', 'elu'],
     'activation_3':['relu', 'elu'],
     'optimizer': ['Adam', "RMSprop"],
     'loss-functions': ['mse'],
     'neurons_HL1': [50, 100, 200, 400],
     'neurons_HL2': [40, 80, 160, 320],
     'neurons_HL3': [40, 80, 160, 320, None],
     'dropout1': [0.1, 0.2, 0.3],
     'dropout2': [0.1, 0.2, 0.3],
     'batch_size': [100, 250, 500],
     'epochs': [400, 900]
}

In [5]:
def talolos(x_train, y_train, x_val, y_val, parameters):
    model = Sequential()

    model.add(Dense(parameters['neurons_HL1'], 
    input_shape=(train_x.shape[1],), 
    activation=parameters['activation_1'],use_bias=True))

    model.add(Dropout(parameters['dropout1']))

    model.add(Dense(parameters['neurons_HL2'], 
    activation=parameters['activation_2'], use_bias=True))

    model.add(Dropout(parameters['dropout1']))
    
    if parameters['neurons_HL3']:
        model.add(Dense(parameters['neurons_HL3'], 
        activation=parameters['activation_3'], use_bias=True))


    model.add(Dense(1, activation='relu'))

    model.compile(optimizer=parameters['optimizer'], loss=parameters['loss-functions'], 
    metrics=['mse', 'mae'])

    history = model.fit(x_train, y_train,
            batch_size=parameters['batch_size'],epochs=parameters['epochs'],
            verbose=0,validation_data=[x_val, y_val],
            callbacks=[early_stopper(epochs=parameters['epochs'], 
            mode='moderate',monitor='val_loss', patience=25)])
    
    return history, model

## Specify model

In [6]:
#t = talos.Scan(x=np.array(train_x),
#               y=np.array(train_y),
#               x_val=np.array(validation_x),
#               y_val=np.array(validation_y),
#               model=talolos,
#               params=parameters,
#               experiment_name='oloo',
#             round_limit=50)

  0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def inverse_transform(scaler, value):
    mat = np.zeros((1, scaler.scale_.shape[0]))
    mat[0, 0] = value
    return scaler.inverse_transform(mat)[:,0]

## Evaluate


In [None]:
def evaluate(scan_model, test_x, test_y):
    eval_model = Evaluate(scan_model)
    results = eval_model.evaluate(np.array(test_x), np.array(test_y), task='continuous',folds=10, metric='loss')
    return np.array([inverse_transform(scaler,result) for result in results])


## Results

In [None]:
#t.data
results = evaluate(t, test_x, test_y)
print(results)

## Retrieve old parameters

In [7]:
#Ligger på drive:
res = Reporting('../input/2000_kjoring.csv')

In [8]:
best = res.data.sort_values('val_mae', ascending=True).iloc[0]
best = pd.DataFrame(best)


In [9]:
def extract_param(df, key):
    return df.loc[key].values[0]

In [10]:
parameters = {'activation_1': extract_param(best,'activation_1'),
 'activation_2': extract_param(best,'activation_2'),
 'activation_3': extract_param(best,'activation_3'),
 'optimizer': extract_param(best,'optimizer'),
 'loss-functions': extract_param(best,'loss-functions'),
 'neurons_HL1': extract_param(best,'neurons_HL1'),
 'neurons_HL2': extract_param(best,'neurons_HL2'),
 'neurons_HL3': extract_param(best,'neurons_HL3'),
 'dropout1': extract_param(best,'dropout1'),
 'dropout2': extract_param(best,'dropout2'),
 'batch_size': extract_param(best,'batch_size'),
 'epochs': extract_param(best,'epochs')}
def best_model():
    parameters = {'activation_1': extract_param(best,'activation_1'),
     'activation_2': extract_param(best,'activation_2'),
     'activation_3': extract_param(best,'activation_3'),
     'optimizer': extract_param(best,'optimizer'),
     'loss-functions': extract_param(best,'loss-functions'),
     'neurons_HL1': extract_param(best,'neurons_HL1'),
     'neurons_HL2': extract_param(best,'neurons_HL2'),
     'neurons_HL3': extract_param(best,'neurons_HL3'),
     'dropout1': extract_param(best,'dropout1'),
     'dropout2': extract_param(best,'dropout2'),
     'batch_size': extract_param(best,'batch_size'),
     'epochs': extract_param(best,'epochs')
    }
    model = Sequential()

    model.add(Dense(parameters['neurons_HL1'], 
    input_shape=(train_x.shape[1],), 
    activation=parameters['activation_1'],use_bias=True))

    model.add(Dropout(parameters['dropout1']))

    model.add(Dense(parameters['neurons_HL2'], 
    activation=parameters['activation_2'], use_bias=True))

    model.add(Dropout(parameters['dropout1']))
    
    if parameters['neurons_HL3'] != 'None':
        model.add(Dense(parameters['neurons_HL3'], 
        activation=parameters['activation_3'], use_bias=True))


    model.add(Dense(1, activation='relu'))

    model.compile(optimizer=parameters['optimizer'], loss=parameters['loss-functions'], 
    metrics=['mse', 'mae'])
    
    return model

## Train retrieved model

In [12]:
my_model = KerasRegressor(build_fn=best_model,epochs=300, 
    batch_size=parameters['batch_size'], verbose=2)    
my_model.fit(train_x, train_y)


Epoch 1/300
 - 2s - loss: 0.0233 - mse: 0.0233 - mae: 0.1205
Epoch 2/300
 - 1s - loss: 0.0141 - mse: 0.0141 - mae: 0.0930
Epoch 3/300
 - 1s - loss: 0.0100 - mse: 0.0100 - mae: 0.0747
Epoch 4/300
 - 1s - loss: 0.0077 - mse: 0.0077 - mae: 0.0647
Epoch 5/300
 - 1s - loss: 0.0068 - mse: 0.0068 - mae: 0.0598
Epoch 6/300
 - 1s - loss: 0.0059 - mse: 0.0059 - mae: 0.0547
Epoch 7/300
 - 1s - loss: 0.0056 - mse: 0.0056 - mae: 0.0524
Epoch 8/300
 - 1s - loss: 0.0052 - mse: 0.0052 - mae: 0.0498
Epoch 9/300
 - 1s - loss: 0.0050 - mse: 0.0050 - mae: 0.0484
Epoch 10/300
 - 1s - loss: 0.0048 - mse: 0.0048 - mae: 0.0471
Epoch 11/300
 - 1s - loss: 0.0046 - mse: 0.0046 - mae: 0.0461
Epoch 12/300
 - 1s - loss: 0.0045 - mse: 0.0045 - mae: 0.0453
Epoch 13/300
 - 1s - loss: 0.0044 - mse: 0.0044 - mae: 0.0443
Epoch 14/300
 - 1s - loss: 0.0043 - mse: 0.0043 - mae: 0.0440
Epoch 15/300
 - 1s - loss: 0.0043 - mse: 0.0043 - mae: 0.0434
Epoch 16/300
 - 1s - loss: 0.0042 - mse: 0.0042 - mae: 0.0427
Epoch 17/300
 - 

Epoch 130/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0345
Epoch 131/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0350
Epoch 132/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0348
Epoch 133/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0346
Epoch 134/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0347
Epoch 135/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0346
Epoch 136/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0343
Epoch 137/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0344
Epoch 138/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0346
Epoch 139/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0343
Epoch 140/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0346
Epoch 141/300
 - 1s - loss: 0.0028 - mse: 0.0028 - mae: 0.0343
Epoch 142/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0344
Epoch 143/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae: 0.0346
Epoch 144/300
 - 1s - loss: 0.0028 - mse: 0.0028 - mae: 0.0344
Epoch 145/300
 - 1s - loss: 0.0029 - mse: 0.0029 - mae:

 - 0s - loss: 0.0027 - mse: 0.0027 - mae: 0.0332
Epoch 261/300
 - 1s - loss: 0.0026 - mse: 0.0026 - mae: 0.0331
Epoch 262/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0330
Epoch 263/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0331
Epoch 264/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0329
Epoch 265/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0328
Epoch 266/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0332
Epoch 267/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0329
Epoch 268/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0331
Epoch 269/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0329
Epoch 270/300
 - 0s - loss: 0.0025 - mse: 0.0025 - mae: 0.0328
Epoch 271/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0330
Epoch 272/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0330
Epoch 273/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0329
Epoch 274/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0330
Epoch 275/300
 - 0s - loss: 0.0026 - mse: 0.0026 - mae: 0.0332
Epoch 

<keras.callbacks.callbacks.History at 0x1b3cc1149c8>

## Find highest contributing features

In [None]:
from eli5.sklearn import PermutationImportance
import eli5

perm = PermutationImportance(my_model, random_state=1).fit(train_x,train_y)
weight_df = eli5.explain_weights_df(perm, feature_names = train_x.columns.tolist(), )
weight_df.nlargest(10,['weight'])['feature']

In [None]:
weight_df.nlargest(10,['weight'])['feature'].iloc[5]

## Evaluate

In [15]:
predictions = pd.DataFrame()
predictions['pred'] = my_model.predict(test_x)
predictions

Unnamed: 0,pred
0,0.141483
1,0.114634
2,0.145221
3,0.157579
4,0.095200
...,...
2285,0.042025
2286,0.034815
2287,0.050583
2288,0.364720


In [16]:
pd.DataFrame(test_y)

Unnamed: 0,totalpris
13745,0.110419
5325,0.092896
5953,0.137189
13204,0.146693
854,0.062473
...,...
13959,0.025329
1779,0.059920
3727,0.045829
4276,0.320869


In [17]:
def evaluate_prediction(predictions, test_y): 
    """ prints the importances of features 
    Args:
        predictions: predictions for tast set
        test_y:      test set labels
    Returns:
    """
    test_evaluation = predictions
    test_evaluation['benchmark'] = test_y.mean()
    test_evaluation['target'] = test_y.reset_index(drop=True)
    test_evaluation['difference'] = test_evaluation['pred'] - test_evaluation['target']
    test_evaluation['bench difference'] = test_evaluation['benchmark'] - test_evaluation['target']
    test_evaluation['abs difference'] = abs(test_evaluation['difference'])
    test_evaluation['abs bench difference'] = abs(test_evaluation['bench difference'])
    test_evaluation['difference %'] = (test_evaluation['pred'] / test_evaluation['target'] - 1) * 100
    test_evaluation['bench difference %'] = abs((test_evaluation['benchmark'] / test_evaluation['target'] - 1) * 100)
    
    mean = int(test_evaluation['abs difference'].mean())
    bench_mean = int(test_evaluation['abs bench difference'].mean())
    mean_perc = round(abs(test_evaluation['difference %']).mean(), 2)
    bench_mean_perc = round(abs(test_evaluation['bench difference %']).mean(), 2)
    print('Model evaluation campared to mean benchmark:', test_y.median())
    print(f'| mean abs.  difference | our model: {mean}  benchmark: {bench_mean}')
    print(f'| mean abs % difference | our model: {mean_perc} %  benchmark: {bench_mean_perc} %')
    print()
    
    return


In [18]:
evaluate_prediction(pd.DataFrame(predictions), pd.DataFrame(test_y))

ValueError: cannot convert float NaN to integer

## Performance plot

In [None]:
from matplotlib import pyplot
pd_y = pd.Series(train_y, predictions['pred'])

pyplot.hist(pd_y, bins=50, rwidth=0.9,alpha=0.5,
             label='totalpris')
pyplot.hist(predictions['pred'], bins=50, rwidth=0.9,alpha=0.5,
                     label='prediksjon')
pyplot.show()



In [None]:
price_pred = [inverse_transform(scaler, prediction) for prediction in predictions['pred']]
price_tot = [inverse_transform(scaler, price) for price in test_y]

In [None]:
pd.Series(price_pred)
#pyplot.hist(pd.Series(price_tot), bins=50,alpha=0.5,
#             label='totalpris')
#pyplot.hist(pd.Series(price_pred), bins=50, rwidth=0.9,alpha=0.5,
 #                    label='prediksjon')
#pyplot.show()