# Main Notebook

In [17]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import xgboost as xgb
from time import sleep
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import math
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import sys
sys.path.append('..')

time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook initialized execution at {time}.')
#import xgboost as xgb

Notebook initialized execution at 03.01.2020_02.05.59.


## General Methods

In [18]:
def memory_optimization(dfs):
    for df in dfs:
        del df
    gc.collect()

## Split Training and Validation

In [19]:
def clean_and_encode(df_train, df_test):

    cols_with_missing_train = [col for col in df_train.columns 
                                    if df_train[col].isnull().any()]
    cols_with_missing_test = [col for col in df_test.columns 
                                    if df_test[col].isnull().any()]

    list_of_missing_cols = list(set(cols_with_missing_train) | set(cols_with_missing_test)) 

    df_train = df_train.drop(list_of_missing_cols, axis=1)
    df_test = df_test.drop(list_of_missing_cols, axis=1)

    s = (df_train.dtypes == 'object')
    object_cols_train = list(s[s].index)
    s = (df_test.dtypes == 'object')
    object_cols_test = list(s[s].index)

    object_list = list(set(object_cols_train) | set(object_cols_test)) 

    label_encoder = LabelEncoder()

    for col in object_list:
        df_train[col] = label_encoder.fit_transform(df_train[col])
        df_test[col] = label_encoder.fit_transform(df_test[col])

    return df_train, df_test



In [20]:
def split(df_train):
    train_X, validation_X = train_test_split(df_train, test_size = 0.1, random_state = 0)

    train_X = train_X.reset_index()
    validation_X = validation_X.reset_index()

    train_y = train_X['SalePrice']
    train_y = train_y.replace([np.inf, -np.inf], np.nan)
    train_y = train_y.reset_index()
    train_y = train_y.drop(['index'], axis = 1)
    validation_y = validation_X['SalePrice']
    validation_y = validation_y.replace([np.inf, -np.inf], np.nan)
    validation_y = validation_y.reset_index()
    validation_y = validation_y.drop(['index'], axis = 1)

    train_X = train_X.drop('SalePrice', axis = 1)
    validation_X = validation_X.drop('SalePrice', axis = 1)
    
    train_X = train_X.drop(['index'], axis = 1)
    validation_X = validation_X.drop(['index'], axis = 1)
    return train_X, train_y, validation_X, validation_y

## Prepare Data

In [27]:
start_time = datetime.now()

df_train = pd.read_csv(f"../input/train.csv")
df_test  = pd.read_csv(f"../input/test.csv")

train_X, test_X = clean_and_encode(df_train, df_test)

train_x, train_y, validation_x, validation_y = split(train_X)


print(train_y.shape)


(1314, 1)


## Normalize data

In [28]:
def normalize_dataset(data):
    x = data.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_scaled)

In [30]:
train_y = normalize_dataset(train_y)
train_x = normalize_dataset(train_x)
validation_x = normalize_dataset(validation_x)
validation_y = normalize_dataset(validation_y)

Unnamed: 0,0
0,0.182005
1,0.080391
2,0.04583
3,0.169046
4,0.012772
5,0.008264
6,0.305668
7,0.092412
8,1.0
9,0.104132


## Specify model

In [32]:
inputs = keras.Input(shape=(46,), name='digits')
x = layers.Dense(46, activation='relu', name='dense_1')(inputs)
x = layers.Dense(46, activation='relu', name='dense_2')(x)
outputs = layers.Dense(1, name='predictions')(x)

model = keras.Model(inputs=inputs, outputs=outputs)



In [33]:
model.compile(loss='mean_squared_error', optimizer='sgd')

## Train model

In [34]:
print('# Fit model on training data')
history = model.fit(train_x, train_y,
                    batch_size=64,
                    epochs=3,
                    validation_data=(validation_x, validation_y))

print('\nhistory dict:', history.history)

# Fit model on training data
Train on 1314 samples, validate on 146 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

history dict: {'loss': [0.016630966168129462, 0.014822705433373672, 0.013583099115157963], 'val_loss': [0.01978886581651152, 0.0180108747763993, 0.01650419033910722]}


## Results

In [40]:
results = model.evaluate(validation_x, validation_y, batch_size=64)
print(results.shape)

()


0.00014098243204213373
