# Notebook for own Data

In [31]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import xgboost as xgb
from time import sleep
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Silence warnings so they do not clutter the cell outputs.
# NOTE(review): the blanket 'ignore' filter on the first line appears to make
# the two category-specific filters below redundant — confirm before removing.
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

# Allow up to 500 rows / 500 columns when pandas renders a frame.
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Put the parent directory on the import path.
import sys
sys.path.append('..')

# Current Europe/Oslo wall-clock time formatted as month.day.year_hour.minute.second.
time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook initialized execution at {time}.')

Notebook initialized execution at 02.19.2020_15.13.12.


## General Methods

In [32]:
def memory_optimization(dfs):
    """Walk over ``dfs`` and then run a garbage-collection pass.

    Every element is bound to a local name which is deleted straight away.
    Only that function-local binding is removed: references held by the
    caller, or by ``dfs`` itself, are left untouched.

    Parameters
    ----------
    dfs : iterable
        Objects to iterate over. An iterator passed here is consumed.

    Returns
    -------
    None
    """
    for frame in dfs:
        # Unbinds the local name only; the object survives while others refer to it.
        del frame
    gc.collect()

## Clean, Encode, and Split into Training, Validation, and Test Sets

In [33]:
def clean_and_encode(df):
    """Fill gaps, drop unusable columns and one-hot encode the categoricals.

    Steps, in order:

    1. Every missing value is replaced with 0.
    2. The columns listed in ``unusable_cols`` are dropped when present.
    3. Each column listed in ``cat_cols`` that is present is replaced by its
       ``pd.get_dummies`` indicator columns (prefixed with the column name),
       appended on the right-hand side of the frame.

    The first row is printed before and after the encoding step.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned and encoded frame.
    """
    df = df.fillna(value=0)

    unusable_cols = ['url', 'Kommunale avg.', 'Energimerking', 'Tomt', 'Utleiedel', 'Postadresse']
    df = df.drop([col for col in unusable_cols if col in df.columns], axis=1)

    print(df.head(1))

    cat_cols = ['Boligtype', 'Eieform']
    for col in cat_cols:
        # Categorical columns are optional, exactly like the dropped columns
        # above; a frame lacking one of them is passed through unchanged.
        if col not in df.columns:
            continue
        df_dummies = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df.drop([col], axis=1), df_dummies], axis=1)

    print(df.head(1))
    return df



In [34]:
def split(df_train, target='Totalpris', holdout_size=0.2, random_state=0):
    """Split a frame into train / validation / test features and targets.

    ``df_train`` is first divided into a training part and a hold-out part
    (``holdout_size`` of the rows). The hold-out part is then halved: the
    first half becomes the test set and the second half the validation set.
    With the defaults this reproduces the original 80 / 10 / 10 split.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default 'Totalpris'
        Name of the column to predict.
    holdout_size : float, default 0.2
        Fraction of rows set aside for validation and test together.
    random_state : int, default 0
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(train_X, train_y, validation_X, validation_y, test_X, test_y)``.
        Every element is a DataFrame with a fresh 0..n-1 index; each ``*_y``
        is a single-column DataFrame holding ``target``.
    """
    train_part, holdout_part = train_test_split(
        df_train, test_size=holdout_size, random_state=random_state)
    # The first half of the hold-out rows is the test set, the second half
    # the validation set (same assignment as before).
    test_part, validation_part = train_test_split(
        holdout_part, test_size=0.5, random_state=random_state)

    def _features_and_target(part):
        # drop=True discards the shuffled index instead of materialising it as
        # an 'index' column that must be removed again; that round trip failed
        # when the index was named or a column was already called 'index'.
        part = part.reset_index(drop=True)
        return part.drop(columns=[target]), part[[target]]

    train_X, train_y = _features_and_target(train_part)
    validation_X, validation_y = _features_and_target(validation_part)
    test_X, test_y = _features_and_target(test_part)

    return train_X, train_y, validation_X, validation_y, test_X, test_y

## XGB Training

In [35]:
def xgb_train(train_X, train_y, validation_X, validation_y):
    """Fit an XGBoost regressor with early stopping and save it to disk.

    The model is evaluated on the validation set with MAE after every
    boosting round; training stops once that metric has not improved for
    32 rounds (or after 10000 rounds). The fitted model is written to
    ``../models/model_finn.hdf5``.

    Parameters
    ----------
    train_X, train_y
        Training features and target.
    validation_X, validation_y
        Validation features and target, used only for early stopping.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    # NOTE(review): the file is written by XGBoost's own save_model, so the
    # '.hdf5' extension presumably does not reflect the actual format — the
    # path is kept unchanged in case other code loads it by this name.
    model_name_wrt = '../models/model_finn.hdf5'

    # Create the output directory before training so that a missing folder
    # fails (or is fixed) up front rather than after a long fit.
    os.makedirs(os.path.dirname(model_name_wrt), exist_ok=True)

    # NOTE(review): several keyword arguments below (silent, nthread, seed, and
    # eval_metric / early_stopping_rounds passed to fit) look like they target
    # an older xgboost release — confirm against the installed version.
    xgb_model = xgb.XGBRegressor(base_score = 0.5, booster = 'gbtree', colsample_bylevel = 1,
                                 colsample_bytree = 1, gamma = 0, importance_type = 'gain',
                                 learning_rate = 0.1, max_delta_step = 0, max_depth = 9,
                                 min_child_weight = 1, missing = None, n_estimators = 10000, n_jobs = -1,
                                 nthread = None, objective = 'reg:squarederror', random_state = 101, reg_alpha = 2,
                                 reg_lambda = 0.2, scale_pos_weight = 1, seed = None, silent = False, subsample = 1)

    xgb_model.fit(train_X, train_y, eval_set = [(validation_X, validation_y)], eval_metric = 'mae',
                  early_stopping_rounds = 32, verbose = True)

    xgb_model.save_model(model_name_wrt)

    return xgb_model

In [36]:
def importance(xgb_model, train_X):
    """Print every feature with its importance, most important first.

    One ``(importance, feature_name)`` tuple is printed per feature. Features
    with equal importance keep their column order relative to each other.

    Parameters
    ----------
    xgb_model
        Fitted model exposing ``feature_importances_``.
    train_X : pandas.DataFrame
        Frame whose columns correspond, in order, to the importances.

    Raises
    ------
    ValueError
        If the number of importances differs from the number of columns.
    """
    input_features = train_X.columns.values
    feat_imp = xgb_model.feature_importances_

    if len(feat_imp) != len(input_features):
        raise ValueError(
            f'got {len(feat_imp)} importances for {len(input_features)} features')

    # Pair values positionally rather than keying a dict by importance: tied
    # importances (e.g. several features at 0.0) would overwrite one another.
    ranked = sorted(zip(feat_imp, input_features),
                    key=operator.itemgetter(0), reverse=True)

    # Print all entries, including the least important one.
    for entry in ranked:
        print(entry)

## Prepare Data

In [37]:
start_time = datetime.now()

# Read the raw listings and clean / one-hot encode them in one step.
df = clean_and_encode(pd.read_csv('../input/trondheimv2.csv'))

# Split into feature / target pairs for training, validation and test.
(train_X, train_y,
 validation_X, validation_y,
 test_X, test_y) = split(df)

Omkostninger  Totalpris  Felleskost/mnd.  Boligtype          Eieform  \
0         80320  2970320.0             2390  Leilighet  Eier (Selveier)   

   Soverom  Primærrom  Bruksareal  Etasje  Byggeår  parkering  fiber  \
0      1.0       47.0        51.0     2.0     2017       True  False   

   kabel-tv  tg 0   tg 1   tg 2  vedovn  varmepumpe  fjernvarme  terasse  \
0      True  True  False  False   False       False        True     True   

   utsikt  kjøkkenøy   hage  garderobe  oppusset  oppussingsobjekt   bod  \
0   False      False  False      False      True             False  True   

         lat       lon  Rom  Fellesgjeld  Fellesformue  Energikarakter  \
0  62.647635  8.713596  2.0            0             0               1   

   Oppvarmingskarakter  Telefon  Felleskost/mnd. etter avdragsfri periode  \
0                    0      0.0                                       0.0   

   Festeavgift  Grunnflate  
0          0.0         0.0  
   Omkostninger  Totalpris  Felleskost/

## Train and predict

In [38]:
# Fit the regressor; the validation pair is passed alongside the training pair.
xgb_model = xgb_train(
    train_X=train_X,
    train_y=train_y,
    validation_X=validation_X,
    validation_y=validation_y,
)

    

[0]	validation_0-mae:3.92888e+06
Will train until validation_0-mae hasn't improved in 32 rounds.
[1]	validation_0-mae:3.55379e+06
[2]	validation_0-mae:3.2043e+06
[3]	validation_0-mae:2.88295e+06
[4]	validation_0-mae:2.5998e+06
[5]	validation_0-mae:2.35004e+06
[6]	validation_0-mae:2.12714e+06
[7]	validation_0-mae:1.92373e+06
[8]	validation_0-mae:1.74485e+06
[9]	validation_0-mae:1.57884e+06
[10]	validation_0-mae:1.43472e+06
[11]	validation_0-mae:1.30944e+06
[12]	validation_0-mae:1.19819e+06
[13]	validation_0-mae:1.09962e+06
[14]	validation_0-mae:1.01572e+06
[15]	validation_0-mae:942348
[16]	validation_0-mae:881095
[17]	validation_0-mae:823888
[18]	validation_0-mae:771406
[19]	validation_0-mae:724181
[20]	validation_0-mae:686850
[21]	validation_0-mae:651044
[22]	validation_0-mae:618816
[23]	validation_0-mae:590444
[24]	validation_0-mae:564758
[25]	validation_0-mae:543318
[26]	validation_0-mae:525616
[27]	validation_0-mae:514752
[28]	validation_0-mae:506132
[29]	validation_0-mae:499821
[30

## Test

In [41]:

# Predict on the held-out test features and wrap the result in a frame.
preds = pd.DataFrame(xgb_model.predict(test_X))

# Both shapes, one per line.
print(preds.shape, test_y.shape, sep='\n')

# Predictions next to the true target values, joined on the row index.
print(preds.join(test_y))

(60, 1)
(60, 1)
              0   Totalpris
0    5336260.50   5307520.0
1    3184265.75   3140805.0
2    2824209.75   2565092.0
3    3166265.75   2872184.0
4    2981417.25   3270920.0
5    5358591.50   5130020.0
6    4740355.00   4412920.0
7    2658457.25   3003988.0
8    2444631.75   2332915.0
9    4398177.00   4569090.0
10   4312013.00   4303320.0
11   2655082.50   2447023.0
12   7736399.50   7996020.0
13   3093909.00   3373420.0
14   3561390.25   3302170.0
15   2874805.25   3087574.0
16   6095255.00   5642920.0
17   2404192.75   2281528.0
18   2428185.00   2455902.0
19   2842862.00   2748320.0
20   2352141.25   2245920.0
21   6058564.00   6111072.0
22   3221440.50   3569727.0
23   8743200.00   8720570.0
24   5073911.50   5020512.0
25   2824908.75   2339557.0
26   6393089.00   6356320.0
27   1295174.25   1096880.0
28   3435502.50   3038822.0
29  14296896.00  14351170.0
30   3036343.50   3188474.0
31   3175524.00   3175492.0
32   6119909.50   5639992.0
33   5627772.50   5628592.0
34  