# XGBoost Training Notebook for Our Own Data

In [38]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import xgboost as xgb
from time import sleep
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import datasets

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import sys
sys.path.append('..')

# Record when this run started, expressed in Europe/Oslo local time.
time = (
    datetime
    .now(pytz.timezone('Europe/Oslo'))
    .strftime('%m.%d.%Y_%H.%M.%S')
)
print(f'Notebook initialized execution at {time}.')

Notebook initialized execution at 03.04.2020_13.13.13.


## General Methods

In [39]:
def memory_optimization(dfs):
    """Drop the objects held in ``dfs`` and run a garbage-collection pass.

    Parameters
    ----------
    dfs : list, dict, set, other iterable, or None
        Container of objects (typically DataFrames) that are no longer needed.
        A list, dict or set is emptied in place so the container itself stops
        keeping its contents alive. Any other value is left untouched and only
        the collection pass runs.

    Returns
    -------
    None

    Notes
    -----
    Names that the caller still has bound to the same objects keep them alive;
    ``del`` those names in the calling cell before (or after) calling this.
    """
    # `del df` inside a `for df in dfs` loop only unbinds the loop variable and
    # frees nothing, because the container still references every element.
    # Emptying the container is what actually lets go of the references.
    if isinstance(dfs, (list, dict, set)):
        dfs.clear()
    gc.collect()

## XGB Training

In [40]:
def xgb_train( train_X, train_y, validation_X, validation_y, model_path = '../models/model_finn.hdf5'):
    """Fit an XGBoost regressor with early stopping and save it to disk.

    Parameters
    ----------
    train_X, train_y
        Training features and targets handed to ``fit``.
    validation_X, validation_y
        Hold-out features and targets used as the only ``eval_set`` entry;
        training stops once its MAE has not improved for 32 rounds.
    model_path : str, optional
        File that ``save_model`` writes the fitted model to. Defaults to the
        location this notebook has always used.

    Returns
    -------
    xgb.XGBRegressor
        The fitted model.
    """
    # NOTE(review): the file is produced by XGBoost's own save_model; the
    # '.hdf5' suffix of the default path does not appear to make it an HDF5
    # file. Kept as-is because other code may load this exact path.

    # Create the output directory up front so a missing folder fails (or is
    # fixed) before the training run instead of at the final save.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok = True)

    xgb_model = xgb.XGBRegressor(base_score = 0.5, booster = 'gbtree', colsample_bylevel = 1,
                                 colsample_bytree = 1, gamma = 0, importance_type = 'gain',
                                 learning_rate = 0.1, max_delta_step = 0, max_depth = 9,
                                 min_child_weight = 1, missing = None, n_estimators = 10000, n_jobs = -1,
                                 nthread = None, objective = 'reg:squarederror', random_state = 101, reg_alpha = 2,
                                 reg_lambda = 0.2, scale_pos_weight = 1, seed = None, silent = False, subsample = 1)

    # n_estimators is only an upper bound; early stopping on validation MAE
    # decides how many rounds are actually trained.
    xgb_model.fit(train_X, train_y, eval_set = [(validation_X, validation_y)], eval_metric = 'mae',
                  early_stopping_rounds = 32, verbose = True)

    xgb_model.save_model(model_path)

    return xgb_model

In [41]:
def importance(xgb_model, train_X):
    """Print every input feature with its importance, most important first.

    Parameters
    ----------
    xgb_model
        Fitted model exposing ``feature_importances_``.
    train_X
        Frame whose ``columns`` name the features, in the same order as the
        importances.

    Returns
    -------
    None
        One ``(importance, feature_name)`` tuple is printed per feature.

    Raises
    ------
    ValueError
        If the number of importances differs from the number of columns.
    """
    input_features = train_X.columns.values
    feat_imp = xgb_model.feature_importances_

    if len(feat_imp) != len(input_features):
        raise ValueError(
            f'Got {len(feat_imp)} importances for {len(input_features)} features.'
        )

    # Pair values by position instead of keying a dict on the importance:
    # features sharing the same importance (e.g. several at 0.0) would
    # otherwise overwrite each other and silently disappear.
    ranked = sorted(zip(feat_imp, input_features), key = operator.itemgetter(0), reverse = True)

    # Print all entries, including the least important one.
    for pair in ranked:
        print(pair)

## Prepare Data

In [42]:
# Wall-clock reference taken just before the data is loaded.
start_time = datetime.now()

# datasets.load yields six objects, unpacked here as the
# train / validation / test feature-target pairs.
(train_X, train_y,
 validation_X, validation_y,
 test_X, test_y) = datasets.load(f'../input/finn.csv')

    Totalpris  Soverom  Primærrom  Bruksareal  Byggeår   Rom  parkering  \
0     1015800      4.0      133.0       138.0     1915   5.0       True   
1     1630920      7.0      201.0       205.0     1953  10.0       True   
2     2553472      5.0      147.0       248.0     1930   7.0       True   
3     1938622      3.0       99.0       145.0     1953   4.0       True   
4     2360739      3.0      106.0       106.0     2001   4.0       True   
5     4610320      6.0      103.0       103.0     1932   7.0       True   
6     3225747      2.0       83.0        83.0     2011   3.0       True   
7     2988142      4.0      198.0       224.0     1977   5.0       True   
8     2003755      1.0       60.0        68.0     1972   2.0       True   
9     1191072      2.0      116.0       155.0     1954   3.0      False   
10    2860920      5.0      166.0       205.0     1973   7.0       True   
11    2522742      1.0       35.0        35.0     1992   2.0      False   
12    4108522      2.0   

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Train and Predict

In [43]:
# Fit on the training split; the validation split is passed along for evaluation.
xgb_model = xgb_train(
    train_X = train_X,
    train_y = train_y,
    validation_X = validation_X,
    validation_y = validation_y,
)

    

[0]	validation_0-mae:2.6283e+06
Will train until validation_0-mae hasn't improved in 32 rounds.
[1]	validation_0-mae:2.35584e+06
[2]	validation_0-mae:2.12194e+06
[3]	validation_0-mae:1.91163e+06
[4]	validation_0-mae:1.72409e+06
[5]	validation_0-mae:1.55751e+06
[6]	validation_0-mae:1.40524e+06
[7]	validation_0-mae:1.28029e+06
[8]	validation_0-mae:1.17659e+06
[9]	validation_0-mae:1.08787e+06
[10]	validation_0-mae:1.01054e+06
[11]	validation_0-mae:946553
[12]	validation_0-mae:886867
[13]	validation_0-mae:838697
[14]	validation_0-mae:799305
[15]	validation_0-mae:769476
[16]	validation_0-mae:747111
[17]	validation_0-mae:728650
[18]	validation_0-mae:712024
[19]	validation_0-mae:698067
[20]	validation_0-mae:687250
[21]	validation_0-mae:678846
[22]	validation_0-mae:673324
[23]	validation_0-mae:667167
[24]	validation_0-mae:663638
[25]	validation_0-mae:659376
[26]	validation_0-mae:656145
[27]	validation_0-mae:655829
[28]	validation_0-mae:653880
[29]	validation_0-mae:654250
[30]	validation_0-mae:

## Test

In [50]:
# Put the model's predictions for test_X next to the values in test_y.
df = pd.DataFrame()
df['pred'] = xgb_model.predict(test_X)
df['target'] = test_y.reset_index(drop = True)  # fresh 0..n-1 index so rows line up with 'pred'

# Signed error (prediction minus target), then the same error as a percentage of the target.
df['difference'] = df['pred'].sub(df['target'])
df['difference %'] = df['pred'].div(df['target']).sub(1).mul(100)

print(df)

             pred    target    difference  difference %
0    1.706208e+06   4009844 -2.303636e+06    -57.449507
1    1.179106e+06    667502  5.116035e+05     76.644489
2    3.507506e+06   3680920 -1.734142e+05     -4.711166
3    2.581384e+06   2997271 -4.158870e+05    -13.875522
4    4.265617e+06   4990000 -7.243830e+05    -14.516693
5    4.575671e+06   3942262  6.334090e+05     16.067146
6    2.242098e+06   2769872 -5.277740e+05    -19.054093
7    2.763369e+06   2384494  3.788750e+05     15.889115
8    3.854082e+06   3486170  3.679120e+05     10.553473
9    1.966260e+06   2222632 -2.563715e+05    -11.534591
10   2.439904e+06   3602772 -1.162868e+06    -32.277049
11   1.696622e+06   1938420 -2.417985e+05    -12.473999
12   1.159519e+07  11185170  4.100160e+05      3.665711
13   2.038506e+06   2420170 -3.816642e+05    -15.770142
14   1.947334e+06   2348420 -4.010862e+05    -17.078983
15   3.530136e+06   3755152 -2.250162e+05     -5.992201
16   2.709698e+06   2363402  3.462955e+05     14