# Main Notebook

In [1]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import xgboost as xgb
from time import sleep
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import math
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import sys
sys.path.append('..')

time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook initialized execution at {time}.')
#import xgboost as xgb

Notebook initialized execution at 03.04.2020_10.20.01.


## General Methods

In [2]:
def memory_optimization(dfs):
    for df in dfs:
        del df
    gc.collect()

## Split Training and Validation

In [18]:
def clean_and_encode(df):

    df = df.fillna(value = 0)

    fucked_cols = ['url', 'Kommunale avg.', 'Energimerking', 'Tomt', 'Utleiedel', 'Postadresse'] 
    fucked_cols = [col for col in fucked_cols if col in df.columns]
    df = df.drop(fucked_cols, axis=1)
    
    print(df.head(1))
    cat_col = ['Boligtype', 'Eieform']

    for col in cat_col:
        if(col in df.columns):
            df_dummies = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, df_dummies], axis=1).drop([col], axis=1)

    print(df.head(1))
    return df




In [19]:
def split(df_train):
    train_X, validation_X = train_test_split(df_train, test_size = 0.2, random_state = 0)
    test_X, validation_X = train_test_split(validation_X, test_size = 0.5, random_state = 0)

    train_X = train_X.reset_index()
    validation_X = validation_X.reset_index()
    test_X = test_X.reset_index()
    target = 'Totalpris'

    train_y = train_X['Totalpris']
    #train_y = train_y.replace([np.inf, -np.inf], np.nan)
    train_y = train_y.reset_index()
    train_y = train_y.drop(['index'], axis = 1)
    validation_y = validation_X[target]
    #validation_y = validation_y.replace([np.inf, -np.inf], np.nan)
    validation_y = validation_y.reset_index()
    validation_y = validation_y.drop(['index'], axis = 1)
    test_y = test_X[target]
    #test_y = test_y.replace([np.inf, -np.inf], np.nan)
    test_y = test_y.reset_index()
    test_y = test_y.drop(['index'], axis = 1)

    train_X = train_X.drop(target, axis = 1)
    validation_X = validation_X.drop(target, axis = 1)
    test_X = test_X.drop(target, axis = 1)
    
    train_X = train_X.drop(['index'], axis = 1)
    validation_X = validation_X.drop(['index'], axis = 1)
    test_X = test_X.drop(['index'], axis = 1)

    
    return train_X, train_y, validation_X, validation_y, test_X, test_y

## Prepare Data

In [24]:
start_time = datetime.now()

df = pd.read_csv(f'../input/trondheimv3.csv')

df = clean_and_encode(df)

train_x, train_y, validation_x, validation_y, test_x, test_y= split(df)



   Omkostninger  Totalpris  Felleskost/mnd.  Soverom  Primærrom  Bruksareal  \
0         80320  2970320.0             2390      1.0       47.0        51.0   

   Etasje  Byggeår  parkering  fiber  kabel-tv  tg 0   tg 1   tg 2  vedovn  \
0     2.0     2017       True  False      True  True  False  False   False   

   varmepumpe  fjernvarme  terasse  utsikt  kjøkkenøy   hage  garderobe  \
0       False        True     True   False      False  False      False   

   oppusset  oppussingsobjekt   bod        lat       lon  Rom  Fellesgjeld  \
0      True             False  True  62.647635  8.713596  2.0            0   

   Fellesformue  Energikarakter  Oppvarmingskarakter  Telefon  \
0             0               1                    0      0.0   

   Felleskost/mnd. etter avdragsfri periode  Festeavgift  Grunnflate  \
0                                       0.0          0.0         0.0   

   Boligtype_Andre  Boligtype_Enebolig  Boligtype_Gårdsbruk/Småbruk  \
0                0           

## Normalize data

In [25]:
def normalize_dataset(data):
    x = data.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_scaled)

In [26]:
train_y = normalize_dataset(train_y)
train_x = normalize_dataset(train_x)
validation_x = normalize_dataset(validation_x)
validation_y = normalize_dataset(validation_y)

## Specify model

In [30]:
inputs = keras.Input(shape=(44,), name='digits')
x = layers.Dense(44, activation='relu', name='dense_1')(inputs)
x = layers.Dense(44, activation='relu', name='dense_2')(x)
outputs = layers.Dense(1, name='predictions')(x)

model = keras.Model(inputs=inputs, outputs=outputs)



In [31]:
model.compile(loss='mean_squared_error', optimizer='sgd')

## Train model

In [32]:
print('# Fit model on training data')
history = model.fit(train_x, train_y,
                    batch_size=64,
                    epochs=3,
                    validation_data=(validation_x, validation_y))

print('\nhistory dict:', history.history)

# Fit model on training data
Train on 478 samples, validate on 60 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

history dict: {'loss': [0.04702644987678677, 0.03637864001905818, 0.03275918257598099], 'val_loss': [0.05215558037161827, 0.04388635978102684, 0.039815742522478104]}


## Results

In [33]:
results = model.evaluate(validation_x, validation_y, batch_size=64)
print(results.shape)

()
