# Main Notebook

In [171]:
import os
import gc
import pytz
import operator
import numpy as np
import pickle as pkl
import xgboost as xgb
from time import sleep
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import warnings
# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')
# The two category-specific filters below are already covered by the blanket 'ignore' above.
warnings.filterwarnings(action = 'ignore', category = FutureWarning)
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning)

import pandas as pd
# Allow up to 500 rows / 500 columns in a DataFrame display before pandas truncates it.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import sys
# Add the parent directory to the module search path.
sys.path.append('..')

# Start-of-run timestamp in the Europe/Oslo timezone, formatted month.day.year_hour.minute.second.
# NOTE(review): the global name `time` would shadow the stdlib module if `import time` were ever added.
time = datetime.now(pytz.timezone('Europe/Oslo')).strftime('%m.%d.%Y_%H.%M.%S')
print(f'Notebook initialized execution at {time}.')

Notebook initialized execution at 02.12.2020_14.38.28.


In [172]:
import xgboost as xgb

## General Methods

In [173]:
def memory_optimization(dfs):
    """Release the objects held by ``dfs`` and run a garbage-collection pass.

    The previous implementation executed ``del df`` inside a ``for`` loop,
    which only unbinds the loop variable and frees nothing. This version
    empties the container in place when it supports ``clear()`` (list, dict,
    set, deque, ...), so the container itself no longer keeps the objects
    alive.

    Objects are only reclaimed once *no* reference to them remains: names that
    the caller still has bound to the same objects must be deleted by the
    caller (``del df_train``) for the memory to actually be returned.

    Parameters
    ----------
    dfs : iterable
        Collection of objects (typically DataFrames) to release. Mutable
        containers are emptied in place; other iterables are simply consumed,
        as before.

    Returns
    -------
    int
        Number of unreachable objects found by ``gc.collect()``.
    """
    clear = getattr(dfs, 'clear', None)
    if callable(clear):
        # Drop the references held by the container itself.
        clear()
    else:
        # Tuples / generators cannot be emptied; consume them like the
        # original loop did so one-shot iterators end up in the same state.
        for _ in dfs:
            pass

    return gc.collect()

## Clean, Encode and Split into Training and Validation

In [174]:
def clean_and_encode(df_train, df_test):
    """Fill missing values, drop unusable columns and label-encode both frames.

    Steps applied to both ``df_train`` and ``df_test``:

    1. Replace every NaN with ``0``.
    2. Drop the columns listed in ``dropped_cols``.
    3. Print the first 30 cleaned training rows.
    4. Replace every training column (and the matching test column, when it
       exists) with integer codes from a ``LabelEncoder``.

    The encoder for a column is fitted once on the *combined* train and test
    values, so a given raw value maps to the same integer in both frames.
    Fitting separately on each frame (the previous behaviour) could assign
    different codes to the same value. When both frames contain the same
    values the resulting codes are unchanged.

    NOTE(review): every column is encoded, including numeric ones and the
    ``Totalpris`` column that ``split`` later uses as the target, so their
    magnitudes are replaced by rank codes -- confirm this is intended.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Raw frames; both must contain the columns in ``dropped_cols``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The cleaned, encoded train and test frames (new objects; the inputs
        are not modified).
    """
    dropped_cols = ['url', 'Kommunale avg.', 'Energimerking', 'Tomt', 'Utleiedel']

    df_train = df_train.fillna(value = 0).drop(dropped_cols, axis = 1)
    df_test = df_test.fillna(value = 0).drop(dropped_cols, axis = 1)

    print(df_train.head(30))

    for col in df_train.columns:
        # A fresh encoder per column keeps each column's vocabulary independent.
        label_encoder = LabelEncoder()

        if col in df_test.columns:
            # Shared vocabulary => consistent codes across train and test.
            label_encoder.fit(pd.concat([df_train[col], df_test[col]], ignore_index = True))
            df_test[col] = label_encoder.transform(df_test[col])
        else:
            # Column absent from the test frame (e.g. the target): train only.
            label_encoder.fit(df_train[col])

        df_train[col] = label_encoder.transform(df_train[col])

    return df_train, df_test



In [175]:
def split(df_train, target = 'Totalpris', test_size = 0.1, random_state = 0):
    """Split a frame into train/validation features and targets.

    The rows are partitioned with ``train_test_split``; each partition gets a
    fresh 0..n-1 index and is then separated into a feature frame and a
    single-column target frame. Infinite target values are replaced by NaN.

    The index is reset with ``drop = True`` instead of materialising it as an
    ``index`` column and dropping that column afterwards; the old sequence
    failed when the input had a named index or already contained a column
    called ``index``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame containing the feature columns and the ``target`` column.
    target : str, default 'Totalpris'
        Name of the column to predict.
    test_size : float, default 0.1
        Share of rows held out for validation.
    random_state : int, default 0
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    train_X, train_y, validation_X, validation_y : pandas.DataFrame
        Feature frames without ``target`` and one-column target frames, all
        with a fresh default index.
    """
    train_part, validation_part = train_test_split(
        df_train, test_size = test_size, random_state = random_state
    )

    def _features_and_target(part):
        # Separate one partition into (features, one-column target frame).
        part = part.reset_index(drop = True)
        y = part[[target]].replace([np.inf, -np.inf], np.nan)
        X = part.drop(columns = [target])
        return X, y

    train_X, train_y = _features_and_target(train_part)
    validation_X, validation_y = _features_and_target(validation_part)

    return train_X, train_y, validation_X, validation_y

## XGB Training

In [176]:
def xgb_train(train_X, train_y, validation_X, validation_y, model_path = '../models/model.hdf5'):
    """Fit an XGBoost regressor with early stopping and save it to disk.

    The model is evaluated on the validation set with MAE after every boosting
    round; training stops once the metric has not improved for 32 rounds (or
    after ``n_estimators`` rounds).

    The output directory is created *before* training starts. Previously a
    missing directory was only discovered by ``save_model`` after the whole
    fit had finished.

    NOTE(review): ``silent``, ``nthread``, ``seed`` and passing
    ``eval_metric`` / ``early_stopping_rounds`` to ``fit`` are accepted by the
    xgboost release this notebook was written against, but appear to be
    deprecated or removed in newer releases -- confirm against the pinned
    version before upgrading.
    NOTE(review): the ``.hdf5`` extension is only a file name; presumably
    ``save_model`` writes xgboost's own format, not HDF5 -- confirm.

    Parameters
    ----------
    train_X, train_y
        Training features and target.
    validation_X, validation_y
        Hold-out features and target used for early stopping.
    model_path : str, default '../models/model.hdf5'
        Where the fitted model is written.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    # Fail fast on an unwritable location instead of after training.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok = True)

    xgb_model = xgb.XGBRegressor(base_score = 0.5, booster = 'gbtree', colsample_bylevel = 1,
                                 colsample_bytree = 1, gamma = 0, importance_type = 'gain',
                                 learning_rate = 0.1, max_delta_step = 0, max_depth = 9,
                                 min_child_weight = 1, missing = None, n_estimators = 10000, n_jobs = -1,
                                 nthread = None, objective = 'reg:squarederror', random_state = 101, reg_alpha = 2,
                                 reg_lambda = 0.2, scale_pos_weight = 1, seed = None, silent = False, subsample = 1)

    xgb_model.fit(train_X, train_y, eval_set = [(validation_X, validation_y)], eval_metric = 'mae',
                  early_stopping_rounds = 32, verbose = True)

    xgb_model.save_model(model_path)

    return xgb_model

In [177]:
def importance(xgb_model, train_X):
    """Print every feature with its importance, most important first.

    Each printed line is an ``(importance, feature_name)`` tuple.

    Fixes over the previous implementation:

    * Importances were used as dictionary keys, so features sharing the same
      importance (commonly several features at 0.0) overwrote one another and
      all but one of them vanished from the listing.
    * The printing loop stopped before index 0, so the least important
      feature was never printed.
    * A ``np.split`` call whose result was discarded has been removed.

    Parameters
    ----------
    xgb_model
        Fitted model exposing ``feature_importances_``.
    train_X : pandas.DataFrame
        Frame whose columns are the model's input features, in training order.

    Returns
    -------
    list of tuple
        ``(importance, feature_name)`` pairs in descending importance; ties
        keep their original column order.
    """
    input_features = train_X.columns.values
    feat_imp = xgb_model.feature_importances_

    # Sort on the importance only, so tied entries never compare feature names.
    ranked = sorted(zip(feat_imp, input_features), key = operator.itemgetter(0), reverse = True)

    for pair in ranked:
        print(pair)

    return ranked

## Prepare Data

In [178]:
# Reference timestamp for this stage; it is not read again within this cell.
start_time = datetime.now()

# NOTE(review): train and test are both read from the same file (finn_30_rows.csv) --
# confirm whether a separate test file was intended.
df_train = pd.read_csv(f'../input_finn/finn_30_rows.csv')
df_test  = pd.read_csv(f'../input_finn/finn_30_rows.csv')

# Fill NaNs, drop the unusable columns and label-encode; also prints the first 30 training rows.
train_X, test_X = clean_and_encode(df_train, df_test)

# Hold out part of the encoded training frame for validation and separate features from the target.
train_X, train_y, validation_X, validation_y = split(train_X)

Unnamed: 0                                      Postadresse  Omkostninger  \
0            0                 Langveien 3, 6509 Kristiansund N       22250.0   
1            1     Fridtjof Nansens gate 9, 6509 Kristiansund N       76092.0   
2            2  Kaptein Bødtkers gate 15 B, 6508 Kristiansund N       22420.0   
3            3                Skolegata 12, 6509 Kristiansund N           0.0   
4            4           Marstrands gate 2, 6508 Kristiansund N        8148.0   
5            5                 Hauggata 19, 6509 Kristiansund N       54972.0   
6            6        Helge Barmans gate 5, 6508 Kristiansund N       85902.0   
7            7                 Langveien 4, 6509 Kristiansund N        1162.0   
8            8  Kaptein Bødtkers gate 15 B, 6508 Kristiansund N       20920.0   
9            9                  Hauggata 7, 6509 Kristiansund N       54472.0   
10          10                 Fløyveien 5, 6508 Kristiansund N       80920.0   
11          11                 S

## Train and predict

In [179]:
# Fit the regressor using the validation split for early stopping; xgb_train also writes the model to disk.
xgb_model = xgb_train(train_X, train_y, validation_X, validation_y)   

    

[0]	validation_0-mae:9.98533
Will train until validation_0-mae hasn't improved in 32 rounds.
[1]	validation_0-mae:9.059
[2]	validation_0-mae:8.31076
[3]	validation_0-mae:7.62097
[4]	validation_0-mae:7.09435
[5]	validation_0-mae:6.47367
[6]	validation_0-mae:5.84762
[7]	validation_0-mae:5.45829
[8]	validation_0-mae:4.99031
[9]	validation_0-mae:4.67186
[10]	validation_0-mae:4.16321
[11]	validation_0-mae:3.72379
[12]	validation_0-mae:3.47449
[13]	validation_0-mae:3.29503
[14]	validation_0-mae:3.07264
[15]	validation_0-mae:2.88977
[16]	validation_0-mae:2.76909
[17]	validation_0-mae:2.61625
[18]	validation_0-mae:2.47381
[19]	validation_0-mae:2.34716
[20]	validation_0-mae:2.21945
[21]	validation_0-mae:2.08237
[22]	validation_0-mae:2.03209
[23]	validation_0-mae:1.94237
[24]	validation_0-mae:1.84399
[25]	validation_0-mae:1.75343
[26]	validation_0-mae:1.68317
[27]	validation_0-mae:1.61342
[28]	validation_0-mae:1.5706
[29]	validation_0-mae:1.5706
[30]	validation_0-mae:1.55954
[31]	validation_0-ma

## Make submission CSV