# Main Notebook

In [13]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import xgboost as xgb
from time import sleep
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

"""
import tensorflow as tf
from tensorflow.python.keras import callbacks
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model, load_model
from tensorflow.python.keras.losses import mean_absolute_error
from tensorflow.python.keras.layers import Dense, Input, Activation
from tensorflow.python.keras.layers import BatchNormalization, Add, Dropout
from tensorflow.python.keras.layers.advanced_activations import LeakyReLU
from tensorflow.python.keras.optimizers import Adam, Adadelta, SGD
"""

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import sys
sys.path.append('..')

# Wall-clock timestamp in the Europe/Oslo timezone, taken once when this cell runs.
# NOTE: this binds the module-level name `time`; any later cell that reads `time`
# sees this start-of-notebook value, not the current time.
time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook initialized execution at {time}.')

Notebook initialized execution at 02.05.2020_14.46.18.


In [14]:
import xgboost as xgb

## General Methods

In [15]:
def memory_optimization(dfs):
    """Iterate over ``dfs`` and then force a garbage-collection pass.

    Parameters
    ----------
    dfs : iterable
        Objects the caller no longer needs.

    Notes
    -----
    ``del`` inside the loop only unbinds the loop-local name; references held
    by the caller (or by ``dfs`` itself) are untouched, so the objects are only
    reclaimed once the caller drops its own references as well.
    """
    for frame in dfs:
        # Removes just the local binding created by the loop.
        del frame

    # Explicit collection pass.
    gc.collect()

## Clean, Encode and Split into Training and Validation

In [16]:
def clean_and_encode(df_train, df_test):
    """Drop columns with missing values and integer-encode object columns.

    A column is dropped from both frames if it contains at least one missing
    value in either frame. Every remaining column of dtype ``object`` (in
    either frame) is then label-encoded.

    The encoder for a column is fitted once on the values of that column from
    both frames together, so a given category maps to the same integer in the
    training and the test data. (Fitting separately per frame would assign
    codes independently and make the encoded features inconsistent.)

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data.
    df_test : pandas.DataFrame
        Test data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` as new frames; the inputs are not modified.
    """
    missing_in_train = {col for col in df_train.columns
                        if df_train[col].isnull().any()}
    missing_in_test = {col for col in df_test.columns
                       if df_test[col].isnull().any()}
    cols_to_drop = list(missing_in_train | missing_in_test)

    # errors='ignore': a column flagged in one frame may not exist in the other
    # (e.g. a column that is only present in the training data).
    df_train = df_train.drop(columns=cols_to_drop, errors='ignore')
    df_test = df_test.drop(columns=cols_to_drop, errors='ignore')

    is_object_train = (df_train.dtypes == 'object')
    is_object_test = (df_test.dtypes == 'object')
    object_cols = set(is_object_train[is_object_train].index) \
        | set(is_object_test[is_object_test].index)

    for col in object_cols:
        # Only touch the frames that actually contain this column.
        holders = [frame for frame in (df_train, df_test) if col in frame.columns]

        # One encoder per column, fitted on the combined values so that the
        # category -> integer mapping is shared by every frame.
        encoder = LabelEncoder()
        encoder.fit([value for frame in holders for value in frame[col]])

        for frame in holders:
            frame[col] = encoder.transform(frame[col])

    return df_train, df_test



In [17]:
def split(df_train, target = 'SalePrice', test_size = 0.1, random_state = 0):
    """Split a frame into training/validation features and targets.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str, default 'SalePrice'
        Name of the column to predict.
    test_size : float, default 0.1
        Passed to ``train_test_split`` as the validation share.
    random_state : int, default 0
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_X, train_y, validation_X, validation_y)``. The ``*_X`` frames
        hold every column except ``target``; the ``*_y`` values are
        single-column DataFrames holding ``target`` with +/-inf replaced by
        NaN. All four have a fresh 0..n-1 index.
    """
    train_part, validation_part = train_test_split(
        df_train, test_size = test_size, random_state = random_state)

    def _features_and_target(part):
        # Separate one partition into (features, target frame).
        # drop=True discards the shuffled index directly, so this works even
        # when the index is named or a column called 'index' already exists.
        part = part.reset_index(drop = True)
        y = part[[target]].replace([np.inf, -np.inf], np.nan)
        X = part.drop(columns = [target])
        return X, y

    train_X, train_y = _features_and_target(train_part)
    validation_X, validation_y = _features_and_target(validation_part)

    return train_X, train_y, validation_X, validation_y

## XGB Training

In [18]:
def xgb_train(train_X, train_y, validation_X, validation_y):
    """Fit an XGBoost regressor with early stopping and save it to disk.

    Parameters
    ----------
    train_X, train_y
        Training features and targets passed to ``fit``.
    validation_X, validation_y
        Validation features and targets, used as the single ``eval_set`` entry
        (metric: 'mae', ``early_stopping_rounds = 32``).

    Returns
    -------
    xgb.XGBRegressor
        The fitted model. It is also written to '../models/model.hdf5' via
        ``save_model``.

    NOTE(review): the file is produced by XGBoost's own ``save_model``; the
    '.hdf5' suffix is just part of the file name - confirm the intended format.
    """
    regressor_params = dict(
        base_score = 0.5,
        booster = 'gbtree',
        colsample_bylevel = 1,
        colsample_bytree = 1,
        gamma = 0,
        importance_type = 'gain',
        learning_rate = 0.1,
        max_delta_step = 0,
        max_depth = 9,
        min_child_weight = 1,
        missing = None,
        n_estimators = 10000,
        n_jobs = -1,
        nthread = None,
        objective = 'reg:squarederror',
        random_state = 101,
        reg_alpha = 2,
        reg_lambda = 0.2,
        scale_pos_weight = 1,
        seed = None,
        silent = False,
        subsample = 1,
    )
    xgb_model = xgb.XGBRegressor(**regressor_params)

    # The validation split is the only evaluation set watched during fitting.
    eval_pairs = [(validation_X, validation_y)]
    xgb_model.fit(train_X, train_y, eval_set = eval_pairs, eval_metric = 'mae',
                  early_stopping_rounds = 32, verbose = True)

    xgb_model.save_model('../models/model.hdf5')

    return xgb_model

In [19]:
def importance(xgb_model, train_X):
    """Print the features of ``train_X`` ranked by model importance.

    Each feature is printed as an ``(importance, feature_name)`` tuple, from
    the most to the least important. Every feature is listed, including the
    least important one and features that share the same importance value
    (ties keep their column order).

    Parameters
    ----------
    xgb_model
        Fitted model exposing ``feature_importances_``.
    train_X : pandas.DataFrame
        Frame whose columns correspond, in order, to the importances.

    Returns
    -------
    list of tuple
        The ranked ``(importance, feature_name)`` pairs that were printed.

    Raises
    ------
    ValueError
        If the number of importances differs from the number of columns.
    """
    input_features = train_X.columns.values
    feat_imp = xgb_model.feature_importances_

    if len(feat_imp) != len(input_features):
        raise ValueError(
            f'Got {len(feat_imp)} importances for {len(input_features)} features.')

    # Pair values with names instead of keying a dict by importance: equal
    # importances (e.g. several zeros) would otherwise overwrite each other.
    pairs = list(zip(feat_imp, input_features))

    # sorted() is stable, also with reverse=True, so ties keep column order.
    ranked = sorted(pairs, key = operator.itemgetter(0), reverse = True)

    for entry in ranked:
        print(entry)

    return ranked

## Prepare Data

In [20]:
# Timestamp taken before loading; not read by any of the visible cells.
start_time = datetime.now()

# Raw training and test data, read relative to the notebook's directory.
df_train = pd.read_csv(f'../input/train.csv')
df_test  = pd.read_csv(f'../input/test.csv')

# Drop columns with missing values and label-encode object columns in both frames.
train_X, test_X = clean_and_encode(df_train, df_test)

# Re-binds train_X: from here on it holds only the training partition's features;
# the targets and the validation partition are returned separately.
train_X, train_y, validation_X, validation_y = split(train_X)

## Train and predict

In [21]:
# Fit the regressor; the validation split is passed along as the evaluation set
# and the fitted model is also saved to disk inside xgb_train.
xgb_model = xgb_train(train_X, train_y, validation_X, validation_y)   

# Predict on the cleaned/encoded test frame produced by clean_and_encode.
print('Predicting')
test_predict = xgb_model.predict(test_X)
print('Predicted')
    

[0]	validation_0-mae:162195
Will train until validation_0-mae hasn't improved in 32 rounds.
[1]	validation_0-mae:146314
[2]	validation_0-mae:131912
[3]	validation_0-mae:119048
[4]	validation_0-mae:107330
[5]	validation_0-mae:96978.7
[6]	validation_0-mae:87810.2
[7]	validation_0-mae:79563
[8]	validation_0-mae:72097.9
[9]	validation_0-mae:65078.4
[10]	validation_0-mae:58966
[11]	validation_0-mae:53678
[12]	validation_0-mae:48882.4
[13]	validation_0-mae:44535.5
[14]	validation_0-mae:40856.5
[15]	validation_0-mae:37699.7
[16]	validation_0-mae:34819.1
[17]	validation_0-mae:32369.8
[18]	validation_0-mae:30140.5
[19]	validation_0-mae:28294.3
[20]	validation_0-mae:26664.8
[21]	validation_0-mae:25251.4
[22]	validation_0-mae:23923.2
[23]	validation_0-mae:22865.1
[24]	validation_0-mae:21902.3
[25]	validation_0-mae:21025.6
[26]	validation_0-mae:20394.2
[27]	validation_0-mae:19784.7
[28]	validation_0-mae:19167.4
[29]	validation_0-mae:18691.7
[30]	validation_0-mae:18262.3
[31]	validation_0-mae:17888

## Make submission csv

In [22]:
def submit(predictions):
    """Write ``predictions`` into a timestamped copy of the submission file.

    Reads '../submissions/submission.csv', replaces (or adds) its 'SalePrice'
    column with ``predictions`` and saves the result next to it as
    'submission_<timestamp>.csv', where the timestamp is the current
    Europe/Oslo time formatted as month.day.year_hour.minute.second.

    Parameters
    ----------
    predictions
        Values assigned to the 'SalePrice' column.
    """
    submission = pd.read_csv('../submissions/submission.csv')
    submission['SalePrice'] = predictions

    oslo_now = datetime.now(pytz.timezone('Europe/Oslo'))
    stamp = oslo_now.strftime('%m.%d.%Y_%H.%M.%S')

    submission.to_csv(f'../submissions/submission_{stamp}.csv', index = False)

In [23]:
# Write the predictions to a timestamped submission file.
submit(test_predict)

# Take a fresh timestamp for the closing message: the global `time` was set when
# the notebook started, so printing it here would report the start time instead.
end_time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook EoF reached at {end_time} and submission saved.')

Notebook EoF reached at 02.05.2020_14.46.18 and submission saved.
