# Model v2.2 - Keras Version

In [1]:
import tensorflow as tf

In [2]:
tf.logging.set_verbosity(tf.logging.INFO)

In [3]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.externals import joblib

In [5]:
import pandas as pd
import numpy as np
import os
import math

### Load dataset

In [6]:
LABEL = "price_doc"

In [7]:
kaggle_test = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/test.csv/test.csv")
macro_df = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/macro.csv/macro.csv")
final_kaggle_df = pd.merge(kaggle_test, macro_df, on='timestamp')

In [8]:
final_kaggle_df[LABEL] = 0.0

In [9]:
final_train_df = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/final_training_dataset.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
del final_train_df['Unnamed: 0']

In [11]:
for cl in [c for c in final_train_df.columns.tolist() if c not in final_kaggle_df.columns.tolist()]:
    del final_train_df[cl]

In [12]:
assert final_train_df.shape[1] == final_kaggle_df.shape[1]

In [13]:
final_mega_df = pd.concat([final_train_df, final_kaggle_df], ignore_index=True)

In [14]:
assert final_mega_df.shape[1] == final_train_df.shape[1] == final_kaggle_df.shape[1]

In [15]:
assert final_mega_df.shape[0] == final_train_df.shape[0] + final_kaggle_df.shape[0]

## Data prep

### Notes
- ID_* columns should be embedded!
- *_1line columns hold no/yes values, so they should be label-encoded.
- "ecology" takes the values ['good', 'excellent', 'poor', 'satisfactory', 'no data']: should it be label-encoded, one-hot encoded, or embedded?
- church_count_500 and the other _count columns hold integer values: bucketize them?
- Derive timestamp_day, timestamp_month and timestamp_year from the timestamp column.

In [16]:
class SberbankData():
    
    def __init__(self, csv_path, save_dir, use_cols=[], blacklist_cols=['id'], load_serializers_from_disk=False, prepare_dataset=True, predict_dataset=False):
        self.csv_path = csv_path
        self.save_dir = save_dir
        self.use_cols = use_cols
        self.blacklist_cols = blacklist_cols
        self.predict_dataset = predict_dataset
        
        self.load_dataset()
        
        if prepare_dataset:
            self.prepare_dataset(load_serializers_from_disk)
            
    def clear_dataset(self):
        """Drop the reference to the loaded DataFrame by setting ``self.df`` to None."""
        self.df = None
        
    def load_dataset(self, path=None):
        
        if type(path) == pd.core.frame.DataFrame:
            self.df = path
        else:
            if not path:
                path = self.csv_path
                if type(path) == pd.core.frame.DataFrame:
                    self.df = path
                else:
                    self.df = pd.read_csv(path)
            else:
                self.df = pd.read_csv(path)
        
    def _prepare_dataset(self, predict=False, trunc_fields=False):
        global LABEL
        t_fields = self.use_cols or ['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor','material', 'build_year', 'num_room', 'kitch_sq', 'state', 'product_type', 'sub_area', LABEL]
    
        if not predict:
            if trunc_fields:
                self.df = self.df[t_fields + LABEL]
            else:
                self.df = self.df
        else:
            if trunc_fields:
                self.df = self.df[t_fields]
            else:
                self.df = self.df
            
        fix_cols = [c for c in self.df.columns.tolist() if '+' in c]
        
        for c in fix_cols:
            self.df[c.replace("+","_")] = self.df[c]
            del self.df[c]
            
        
        self.df.fillna(0, inplace=True)
        self.df.replace('#!',0, inplace=True)
    
        self.df['timestamp'] = pd.to_datetime(self.df['timestamp'])
        self.df['timestamp_day'] = self.df.apply(lambda row: row['timestamp'].day, axis=1)
        self.df['timestamp_month'] = self.df.apply(lambda row: row['timestamp'].month, axis=1)
        self.df['timestamp_year'] = self.df.apply(lambda row: row['timestamp'].year, axis=1)
        self.df['state'] = self.df.apply( lambda row: 5 if row['state'] == 33 else row['state'] , axis = 1).astype(int)
    
        
        self.df['material'] = self.df['material'].astype(int) 
        
        if 'Unnamed: 0' in self.df.columns.tolist():
            del self.df['Unnamed: 0']
            
    def get_bin_boundaries(self, df, col):
        """Return bucket boundaries for ``df[col]`` as a list of floats.

        The step is the integer part of the column's standard deviation and the
        boundaries run from ``int(min)`` up to (excluding) ``int(max)``. When
        the step is zero, or the standard deviation is undefined (fewer than
        two rows), the boundaries are just ``[min, max]``.
        """
        spread = df[col].std()
        lowest = df[col].min()
        highest = df[col].max()

        # int(NaN) raises, so treat an undefined deviation like a zero step.
        step = 0 if np.isnan(spread) else int(spread)
        if step == 0:
            return [float(lowest), float(highest)]
        return [float(edge) for edge in range(int(lowest), int(highest), step)]
        
    def bucketize_column(self, df, col, bins=[]):
        if not bins:
            bins = self.get_bin_boundaries(df, col)
        return np.digitize(df[col].values, bins, right=True)
            
    def dedup(self, listy):
        """Return the unique items of ``listy``, keeping first-seen order.

        A stable order matters because the result defines the order of the
        feature columns; a set-based order depends on string hashing and can
        differ from one interpreter session to the next.
        """
        seen = set()
        unique = []
        for item in listy:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique
            
    def load_normalizers(self, force=False):
        if force:
            self.continuous_scalers = joblib.load(os.path.join(self.save_dir, 'normalizers', 'continuous_scalers.pkl'))
            self.label_scalers = joblib.load(os.path.join(self.save_dir, 'normalizers', 'label_scalers.pkl'))
            self.one_hot_scalers = joblib.load(os.path.join(self.save_dir, 'normalizers', 'onehot_scalers.pkl'))
        else:
            # Only load if there aren't already initialized.
            try:
                self.continuous_scalers
                self.label_scalers
                self.one_hot_scalers
            except AttributeError:
                # They don't exist, so let's load them.
                self.continuous_scalers = joblib.load(os.path.join(self.save_dir, 'normalizers', 'continuous_scalers.pkl'))
                self.label_scalers = joblib.load(os.path.join(self.save_dir, 'normalizers', 'label_scalers.pkl'))
                self.one_hot_scalers = joblib.load(os.path.join(self.save_dir, 'normalizers', 'onehot_scalers.pkl'))           
    
    def save_normalizers(self):
        """Pickle the three scaler dictionaries into ``<save_dir>/normalizers``."""
        pickles = (
            ('continuous_scalers', 'continuous_scalers.pkl'),
            ('label_scalers', 'label_scalers.pkl'),
            ('one_hot_scalers', 'onehot_scalers.pkl'),
        )
        for attr, filename in pickles:
            joblib.dump(getattr(self, attr), os.path.join(self.save_dir, 'normalizers', filename))
    
    def _prepare_normalizers(self):
        """Fit every scaler/encoder on ``self.df`` and save them to disk.

        Populates ``continuous_scalers`` (one StandardScaler per normalized
        column), ``label_scalers`` (one LabelEncoder per label column) and
        ``one_hot_scalers`` (``(LabelEncoder or None, OneHotEncoder)`` per
        one-hot column). It also rewrites the continuous columns of ``self.df``
        as floats and adds the bucketized columns.

        NOTE(review): bucket boundaries are derived from whatever DataFrame is
        loaded when ``_transform_bucketized_columns`` runs, so bucket ids are
        only comparable between datasets with the same min/max/std.
        """
        # First, prepare the continuous columns scaler
        self.continuous_scalers = {}
        for col in self.all_continuous_columns:
            # Values may be strings with thousands separators ("1,234").
            self.df[col] = [float(str(c).replace(",", "")) for c in self.df[col].values.ravel().tolist()]
            if col in self.normalize_continuous_columns:
                scaler = preprocessing.StandardScaler()
                # Scalers expect a 2-D (n_samples, n_features) array; a single
                # column is therefore passed as shape (n, 1).
                scaler.fit(self.df[col].values.reshape(-1, 1))
                self.continuous_scalers[col] = scaler

        # Second, prepare the label columns scaler
        self.label_scalers = {}
        for col in self.label_columns:
            scaler = preprocessing.LabelEncoder()
            # The unknown tag is added so unseen values can be encoded later.
            scaler.fit(self.df[col].values.tolist() + [self._unknown_tag])
            self.label_scalers[col] = scaler

        # Before we create the one-hot scalers, we need to transform the bucketized features
        self._transform_bucketized_columns()

        # Third, prepare the one-hot columns scaler
        self.one_hot_scalers = {}
        for col in self.one_hot_columns:
            if col in self.bucketized_columns:
                num_bins = len(self.get_bin_boundaries(self.df, col)) + 1  # possible bucket ids are range(num_bins)
                oh_scaler = preprocessing.OneHotEncoder(n_values=num_bins, handle_unknown='ignore')
                oh_scaler.fit(np.array(self.df[col + self._bucketized_tag].values).reshape(-1, 1))
                self.one_hot_scalers[col] = (None, oh_scaler)
            else:
                l_scaler = preprocessing.LabelEncoder()
                out = l_scaler.fit_transform(self.df[col].values.tolist() + [self._unknown_tag])
                oh_scaler = preprocessing.OneHotEncoder(handle_unknown='ignore')
                oh_scaler.fit(np.array(out).reshape(-1, 1))
                self.one_hot_scalers[col] = (l_scaler, oh_scaler)

        self.save_normalizers()
    
    def _transform_bucketized_columns(self):
        for col in self.bucketized_columns:
            self.df[col + self._bucketized_tag] = self.bucketize_column(self.df, col)
            
    def _transform_continuous_columns(self):
        for col in self.all_continuous_columns:
            if col in self.normalize_continuous_columns:
                self.df[col + self._processed_tag] = self.continuous_scalers[col].transform([float(str(c).replace(",","")) for c in self.df[col].values.ravel().tolist()])
            else:
                # If we're not supposed to normalize this continuous column, then we simply keep the original values
                self.df[col + self._processed_tag] = [float(str(c).replace(",","")) for c in self.df[col].values.ravel().tolist()]
    
    
    def _clean_label_scaler_input(self, label_scaler, values):
        vals = []
        for val in values:
            if type(val) == str:
                if val in label_scaler.classes_:
                    vals.append(val)
                else:
                    vals.append(self._unknown_tag)
            elif type(val) == int:
                classes = [int(c) for c in label_scaler.classes_ if c != self._unknown_tag]
                if val in classes:
                    vals.append(val)
                else:
                    vals.append(self._unknown_tag)
        return vals
        
    def _transform_label_columns(self):
        for col in self.label_columns:
            l_scaler = self.label_scalers[col]
            # Replace unknown values with the unknown tag, in order to prevent problems with the label encoder
            # when it sees new values
            vals = self._clean_label_scaler_input(l_scaler, self.df[col].values.tolist())
            self.df[col + self._processed_tag] = l_scaler.transform(vals)
                
    def _transform_onehot_columns(self):
        for col in self.one_hot_columns:
            if col in self.bucketized_columns:
                # This is a bucketized column
                oh_scaler = self.one_hot_scalers[col][1]
                # We've already bucketized this column by this time,
                # so all we have to do now is to transform these bucketized values to a one-hot encoding
                self.df[col + self._bucketized_tag] = oh_scaler.transform(self.df[col + self._bucketized_tag].values.reshape(-1, 1)).toarray().tolist()
            else:
                # This is an ordinary one-hot column
                l_scaler, oh_scaler = self.one_hot_scalers[col]
                vals = self._clean_label_scaler_input(l_scaler, self.df[col].values.tolist())
                out = l_scaler.transform(vals)
                self.df[col + self._processed_tag] = oh_scaler.transform(np.array(out).reshape(-1, 1)).toarray().tolist()
                
    
    def transform_dataset(self):
        """Run all column transformations on ``self.df``.

        Order matters: the bucket ids must exist before the one-hot step turns
        them into one-hot vectors.
        """
        self._transform_bucketized_columns()     
        self._transform_continuous_columns()
        self._transform_label_columns()
        self._transform_onehot_columns()
        
    def _extract_processed_feature(self, df, feature_col):
        feature_col = feature_col + self._processed_tag
        vals = np.expand_dims(df[feature_col].values, axis=1)
        return vals
        
    def extract_model_input_data(self, df = None, predict=False, params={}):
        # Create an x_train, y_train pair of numpy arrays, that can be feed into a machine learning model (eg. deep learning, etc.)
        global LABEL
        
        # Can be set to either 'bucketized', 'continuous', or 'both
        bucketized_input_type = params.get("bucketized_input_type", 'both').lower().strip()
        
        inputs = []
        one_hot_inputs = {}
        embed_inputs = {}
        
        if isinstance(df, type(None)):
            df = self.df
        
        for col in self.all_feature_columns:
            
            if col not in self.embedding_columns + self.one_hot_columns_to_be_labeled:
                # This isn't an embedding column or one-hot column
                if col not in self.bucketized_columns:
                    val = np.expand_dims(self._extract_processed_feature(df, col), axis=1)
                    inputs.append(val) 
                    
                elif col in self.bucketized_columns:
                    
                    if bucketized_input_type == 'both':
                        
                        # If this column is also a bucketized column, also add the bucketized feature to this input list.
                        val = np.expand_dims(self._extract_processed_feature(df, col + self._bucketized_tag_header), axis=1)
                        if col in self.one_hot_columns:
                            val = np.squeeze(val, axis=2)
                            # Convert from object array to numeric array
                            val = np.array(list(val[:, 0]), dtype=np.float)
                            one_hot_inputs[col] = val # Store this one-hot representation in a separately maintained dictionary
                            
                            # Also add the original value for this bucketized column, since bucketized input type is 'both'
                            val = np.expand_dims(self._extract_processed_feature(df, col), axis=1)
                            inputs.append(val)
                        else:
                            inputs.append(val) # Not a one-hot represented column, so just add as is...
                            
                        
                    elif bucketized_input_type == 'continuous':
                        
                        val = np.expand_dims(self._extract_processed_feature(df, col), axis=1)
                        inputs.append(val)
                        
                    elif bucketized_input_type == 'bucketized':
                        
                        val = np.expand_dims(self._extract_processed_feature(df, col + self._bucketized_tag_header), axis=1)
                        if col in self.one_hot_columns:
                            val = np.squeeze(val, axis=2)
                            # Convert from object array to numeric array
                            val = np.array(list(val[:, 0]), dtype=np.float)
                            one_hot_inputs[col] = val # Store this one-hot representation in a separately maintained dictionary
                        else:
                            inputs.append(val)
                            
            else:
                if col in self.embedding_columns:
                    
                    # This is an embedding column
                    val = np.expand_dims(self._extract_processed_feature(df, col), axis=1)
                    val = np.squeeze(val, axis=2)
                    embed_inputs[col] = val
                    
                elif col in self.one_hot_columns_to_be_labeled:
                    
                    # This is a one-hot column
                    val = np.expand_dims(self._extract_processed_feature(df, col), axis=1)
                    val = np.squeeze(val, axis=2)
                    # Convert from object array to numeric array
                    val = np.array(list(val[:, 0]), dtype=np.float)
                    one_hot_inputs[col] = val
                    
                else:
                    raise Exception("Unimplemented feature column: " + col)
        
        # Combine all inputs
        inputs = np.hstack(tuple(inputs))
        inputs = np.squeeze(inputs, axis=2)
        
        if predict:
            return inputs, one_hot_inputs, embed_inputs
        
        outputs = np.expand_dims(df[LABEL].values, axis=1)
        return (inputs, one_hot_inputs, embed_inputs), outputs
            
        
    def make_train_validate_test_split(self, train_percent=.6, validate_percent=.2, seed=None, transform=False):
        
        if transform:
            self.transform_dataset()
            
        np.random.seed(seed)
        perm = np.random.permutation(self.df.index)
        m = len(self.df)
        train_end = int(train_percent * m)
        validate_end = int(validate_percent * m) + train_end
        train = self.df.ix[perm[:train_end]]
        validate = self.df.ix[perm[train_end:validate_end]]
        test = self.df.ix[perm[validate_end:]]
        return train, validate, test
    
    def make_train_test_split(self, train_percent=.8, test_percent=.2, seed=None, transform=False):
        """Randomly split ``self.df`` into a train and a test frame.

        Args:
            train_percent: fraction of rows for the training frame.
            test_percent: fraction of rows for the test frame.
            seed: random state of the split; ``None`` gives a different split on
                every call.
            transform: when true, run ``transform_dataset`` first.

        Returns:
            ``(train_df, test_df)``.
        """
        if transform:
            self.transform_dataset()

        # seed used to be accepted but ignored; forward it so splits are reproducible.
        train_df, test_df = train_test_split(
            self.df, train_size=train_percent, test_size=test_percent, random_state=seed)
        return train_df, test_df
    
            
    def _prepare_feature_columns(self, load_serializers_from_disk=False):
        """
        Notes
        ID_* columns should be embedded!
        _1line are no/yes columns, so they should be labeled...
        "ecology" has values of ['good', 'excellent', 'poor', 'satisfactory', 'no data'],... labelled/one-hot encoded/or embedded?
        church_count_500 and other _count values integer values - bucketize them?
        timestamp_day, timestamp_month, timestamp_year
        """
        def get_subcolumns_by_tag(tag):
            fts = []
            for f in self.all_feature_columns:
                if tag in f:
                    fts.append(f)
            return fts

        def get_raion_columns(return_yesno=True, return_cont=False):
            yesno_cols = []
            continuous_cols = []
            for c in get_subcolumns_by_tag(self._raion_tag):
                if any(w in self.df[c].unique().tolist() for w in ('yes', 'no')):
                    yesno_cols.append(c)
                else:
                    continuous_cols.append(c)
            if return_yesno:
                return yesno_cols
            elif return_cont:
                return continuous_cols
            else:
                return yesno_cols, continuous_cols
        
        self._id_tag = "ID_"
        self._1line_tag = "_1line"
        self._raion_tag = "_raion"
        self._count_tag = "_count"
        self._km_tag = "_km"
        self._sq_tag = "_sq"
        
        self._unknown_tag = "_unknown_tag"
        self._processed_tag = "_processed_feature"
        self._bucketized_tag_header = "_bucketized"
        self._bucketized_tag = self._bucketized_tag_header + self._processed_tag
        
        self.all_feature_columns = self.dedup([c for c in self.df.columns.tolist() if c not in self.blacklist_cols and self._processed_tag not in c])        
        
        # Ignore the _ID fields, as they seem to have little to no correlation with price, and thus, degrade model performance.
        self.blacklist_cols += get_subcolumns_by_tag(self._id_tag)
        
        self.all_feature_columns = self.dedup([c for c in self.df.columns.tolist() if c not in self.blacklist_cols and self._processed_tag not in c])        

        self.integerized_embedding_columns = []
        self.unlabeled_embedding_columns = ["sub_area"]

        self.embedding_columns = self.integerized_embedding_columns + self.unlabeled_embedding_columns

        # Note: This will not modify the existing feature column(s).
        # Rather, it will create a new supplemental feature column with bucketization
        # Also, bucketized columns will be one-hot encoded, after being label encoded...
        self.bucketized_columns = [
            # Specific columns to bucketize...
        ] + get_subcolumns_by_tag(self._count_tag) + get_subcolumns_by_tag(self._km_tag) + get_subcolumns_by_tag(self._sq_tag)

        self.label_columns = [
            # Specific label columns to include...    
        ] + get_raion_columns(return_yesno=True) + get_subcolumns_by_tag(self._1line_tag) + self.embedding_columns + [
            'product_type', 'culture_objects_top_25']
        
        # One-hot columns that need to be label encoded
        self.one_hot_columns_to_be_labeled = ['material','state',"ecology", ]
        
        self.one_hot_columns = self.one_hot_columns_to_be_labeled  + self.bucketized_columns
        
        # Don't normalize one-hot, label, and integerized embedding columns, although they contain numbers...
        self.exclude_normalize_continuous_columns = [
            # Specific columns to exclude from normalization...
            # TODO: Should we exclude timestamp_month, day, and year??
        ] + self.one_hot_columns + self.label_columns + self.integerized_embedding_columns
        
        self.normalize_continuous_columns = [c for c in self.all_feature_columns if c not in self.exclude_normalize_continuous_columns]
        self.all_continuous_columns = [c for c in self.all_feature_columns if c not in self.one_hot_columns_to_be_labeled + self.label_columns + self.embedding_columns]
        
        for int_emb_col in self.integerized_embedding_columns:
            self.df[int_emb_col] = self.df[int_emb_col].astype(int) # Convert all integer id embedding columns to int
            # as, some may be a float...
        
        # Only prepare the normalizers if this is a main training set (and NOT a predict set...)
        if not self.predict_dataset:
            if load_serializers_from_disk:
                self.load_normalizers(force=True)
            else:
                self._prepare_normalizers()
        else:
            self.load_normalizers(force=load_serializers_from_disk)
         
        
    def prepare_dataset(self, load_serializers_from_disk):
        """Clean the loaded DataFrame, then classify its columns and set up the normalizers.

        Args:
            load_serializers_from_disk: forwarded to ``_prepare_feature_columns``.
        """
        self._prepare_dataset()
        self._prepare_feature_columns(load_serializers_from_disk) 

In [17]:
def determine_dimensions(num_unique, r = 0, k = 1):
    """Pick an embedding size for a vocabulary of ``num_unique`` entries.

    Args:
        num_unique: vocabulary size (must be positive).
        r: 0 selects the ``log2`` rule; any other value selects the
            fourth-root rule.
        k: multiplier applied to the fourth-root rule only.

    Returns:
        ``int(log2(num_unique))`` when ``r == 0``, else
        ``k * int(num_unique ** 0.25)``.
    """
    if r == 0:
        return int(math.log(num_unique, 2))
    # Parenthesised on purpose: `n ** 1./4.` parses as `(n ** 1.) / 4.`.
    return k * int(num_unique ** (1. / 4.))

In [18]:
sbankdata = SberbankData(final_mega_df, '/mnt/h/Kaggle/Competitions/Russian Bank/code/data')



In [19]:
sbankdata.load_dataset(final_train_df)

In [20]:
sbankdata.prepare_dataset(True)

In [21]:
sbankdata.transform_dataset()



In [22]:
train_set, test_set = sbankdata.make_train_test_split()

In [23]:
((train_inputs, train_onehot_inputs, train_embeddings_input), train_outputs) = sbankdata.extract_model_input_data(train_set, params = {
    'bucketized_input_type' : 'continuous'
})

In [24]:
train_inputs.shape, train_outputs.shape

((24376, 382), (24376, 1))

In [25]:
train_onehot_inputs.keys()

['state', 'material', 'ecology']

In [26]:
train_embeddings_input.keys()

['sub_area']

In [27]:
((test_inputs, test_onehot_inputs, test_embeddings_input), test_outputs) = sbankdata.extract_model_input_data(test_set, params = {
    'bucketized_input_type' : 'continuous'
})

In [28]:
test_inputs.shape, test_outputs.shape

((6095, 382), (6095, 1))

In [29]:
test_onehot_inputs.keys()

['state', 'material', 'ecology']

In [30]:
test_embeddings_input.keys()

['sub_area']

In [31]:
np.unique(train_embeddings_input['sub_area']).size

146

In [32]:
del final_mega_df, final_kaggle_df, final_train_df, kaggle_test

In [33]:
import gc
gc.collect()

23

In [34]:
# Clear some memory
sbankdata.clear_dataset()

## Model

In [35]:
version = 2.2
version_str = "v" + str(version)

In [36]:
from keras.layers import Dense, Input, Dropout, Concatenate, Embedding, Flatten
from keras.models import Model
from keras import backend as K
from keras.utils import plot_model

Using TensorFlow backend.


In [37]:
SPARE_CAPACITY = 1

In [38]:
def prepare_embeddings_header(embeddings_input_names, label_scalers):
    """Map every embedding input name to ``(vocab_size, embedding_dimensions)``.

    The vocabulary size is the number of classes known to the column's label
    encoder plus ``SPARE_CAPACITY``.
    """
    vocab_sizes = [
        (name, label_scalers[name].classes_.size + SPARE_CAPACITY)
        for name in embeddings_input_names
    ]
    return dict((name, (size, determine_dimensions(size))) for name, size in vocab_sizes)

In [39]:
def prepare_categorical_header(categorical_input_names, one_hot_scalers, use_label_scalers=True, use_spare_capacity=False):
    """Map every one-hot input name to its number of classes.

    ``one_hot_scalers`` maps a name to a ``(label encoder, one-hot encoder)``
    pair. With ``use_label_scalers`` the size comes from the label encoder's
    ``classes_`` (plus ``SPARE_CAPACITY`` when ``use_spare_capacity`` is set);
    otherwise it is the one-hot encoder's ``n_values``.
    """
    def dimension_of(name):
        labeler, encoder = one_hot_scalers[name][0], one_hot_scalers[name][1]
        if not use_label_scalers:
            # The one-hot encoder already knows its total number of classes.
            return encoder.n_values
        size = labeler.classes_.size
        return size + SPARE_CAPACITY if use_spare_capacity else size

    return dict((name, dimension_of(name)) for name in categorical_input_names)

In [40]:
def build_model(num_continuous_inputs, embeddings_header, categorical_header, hidden_units=[1024, 512, 256, 128], hidden_dropout=0.4, final_dropout=0.4):
    """Assemble the input and output tensors of the price regression network.

    Args:
        num_continuous_inputs: width of the continuous input vector.
        embeddings_header: dict mapping embedding input name ->
            ``(vocab_size, embedding_dimensions)``.
        categorical_header: dict mapping one-hot input name -> dimension
            (ie. number of possible categorical classes).
        hidden_units: sizes of the fully connected layers; may be empty, in
            which case the output layer sits directly on the merged inputs.
        hidden_dropout: dropout rate after every hidden layer but the last.
        final_dropout: dropout rate after the last hidden layer.

    Returns:
        ``(inputs, output)``: the list of input tensors (continuous input
        first, then embedding inputs, then one-hot inputs) and the output
        tensor, ready to be passed to ``Model``.
    """
    with tf.name_scope("realty_housing_price_model"):
        with tf.name_scope("inputs"):
            continuous_input = Input(shape=(num_continuous_inputs,), name="continuous_input")

            embedding_inputs = []
            embedding_container = {}

            for embedding_input_name, (vocab_size, embedding) in embeddings_header.items():
                e_input = Input(shape=(1,), name=embedding_input_name)
                embedding_inputs.append(e_input)
                embedding_container[embedding_input_name] = (e_input, vocab_size, embedding)

            one_hot_inputs = []

            for onehot_input_name, dimension in categorical_header.items():
                oh_input = Input(shape=(dimension,), name=onehot_input_name)
                one_hot_inputs.append(oh_input)

        with tf.name_scope("embeddings"):
            embeddings = []
            for e_input_name, (e_input, vocab_size, embedding) in embedding_container.items():
                emb = Embedding(output_dim=embedding, input_dim=vocab_size, input_length=1, name=e_input_name + "_embedding")(e_input)
                emb = Flatten()(emb)
                embeddings.append(emb)

        with tf.name_scope("merge_layer"):
            merged_input_vector = Concatenate()([continuous_input] + embeddings + one_hot_inputs)
            merged_input_vector = Dropout(0.25)(merged_input_vector)

        with tf.name_scope("fully_connected"):
            # Starting from the merged vector removes the first-layer special
            # case and keeps `x` defined even when hidden_units is empty.
            x = merged_input_vector
            last_idx = len(hidden_units) - 1

            for idx, hidden_unit in enumerate(hidden_units):
                x = Dense(hidden_unit, name="fc_%d_%d" % (idx, hidden_unit), activation='elu')(x)
                # The last hidden layer gets its own dropout rate.
                rate = final_dropout if idx == last_idx else hidden_dropout
                x = Dropout(rate, name="dropout_%d_%d" % (idx, rate))(x)

            output = Dense(1, activation='relu', name='output')(x)

        return [continuous_input] + embedding_inputs + one_hot_inputs, output

In [41]:
embeddings_header = prepare_embeddings_header(train_embeddings_input.keys(), sbankdata.label_scalers)
categorical_header = prepare_categorical_header(train_onehot_inputs.keys(), sbankdata.one_hot_scalers)
num_continuous_inputs = train_inputs.shape[1]

In [42]:
embeddings_header

{'sub_area': (148, 7)}

In [43]:
categorical_header

{'ecology': 6, 'material': 8, 'state': 7}

In [44]:
inputs, outputs = build_model(num_continuous_inputs=num_continuous_inputs, embeddings_header=embeddings_header, categorical_header=categorical_header)
realty_price_model = Model(inputs=inputs, outputs=outputs)

In [45]:
def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Keras loss: RMSLE computed along the last axis.

    Both tensors are clipped from below at ``K.epsilon()`` before taking
    ``log(value + 1)``.
    """
    def log_plus_one(tensor):
        return K.log(K.clip(tensor, K.epsilon(), None) + 1.)

    squared_log_diff = K.square(log_plus_one(y_pred) - log_plus_one(y_true))
    return K.sqrt(K.mean(squared_log_diff, axis = -1))

In [46]:
realty_price_model.compile(optimizer = "adam", loss = root_mean_squared_logarithmic_error)

In [47]:
realty_price_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
sub_area (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
sub_area_embedding (Embedding)   (None, 1, 7)          1036        sub_area[0][0]                   
____________________________________________________________________________________________________
continuous_input (InputLayer)    (None, 382)           0                                            
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 7)             0           sub_area_embedding[0][0]         
___________________________________________________________________________________________

In [62]:
plot_model(realty_price_model, to_file='/mnt/h/Kaggle/Competitions/Russian Bank/models/' + version_str + '/model_' + version_str + '.png')

OSError: [Errno 12] Cannot allocate memory

## Train the model

In [48]:
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau
from keras_tqdm import TQDMNotebookCallback

In [49]:
# Keep the best model (by validation loss) of this version on disk.
model_checkpoint = ModelCheckpoint(
    '/mnt/h/Kaggle/Competitions/Russian Bank/models/' + version_str + '/weights.{epoch:02d}-{val_loss:.3f}.hdf5', 
    monitor='val_loss', 
    save_best_only=True, 
    save_weights_only=False, 
    mode='auto')
# Stop once the validation loss has not improved for 10 epochs.
early_stopping = EarlyStopping(monitor='val_loss',  patience=10, verbose=0, mode='auto')
tensorboard = TensorBoard(log_dir='/mnt/h/Kaggle/Competitions/Russian Bank/models/' + version_str + '/logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=5, embeddings_layer_names=None, embeddings_metadata=None)
# Lower the learning rate after 5 stagnant epochs (before early stopping kicks in).
# min_lr must be below the optimizer's starting rate (Adam defaults to 0.001),
# otherwise the callback can never reduce anything.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
              patience=5, min_lr=0.00001) 
# reduce_lr used to be created but never registered.
callbacks = [model_checkpoint, early_stopping, reduce_lr, tensorboard, TQDMNotebookCallback()]

In [50]:
epochs = 50
val_split = 0.3

In [51]:
training_history = realty_price_model.fit(x=dict({'continuous_input': train_inputs}.items() + train_embeddings_input.items() + train_onehot_inputs.items()),
          y=train_outputs,
          validation_split=val_split,
          shuffle = True,
          verbose = 0,
          epochs=epochs, callbacks = callbacks)

          17056/|/[loss: 0.726] 100%|| 17056/17063 [02:15<00:00, 278.44it/s]




## Evaluate model

In [73]:
realty_price_model.load_weights('/mnt/h/Kaggle/Competitions/Russian Bank/models/v2.1/keras_model/weights.08-0.388.hdf5')

In [68]:
testing_loss = realty_price_model.evaluate(x=dict({'continuous_input': test_inputs}.items() + test_embeddings_input.items() + test_onehot_inputs.items()), 
                                           y=test_outputs,
                                           verbose = 0)
print 'Testing loss: ', testing_loss

Testing loss:  0.385473721408


## Kaggle submission

In [32]:
kaggle_test = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/test.csv/test.csv")

In [33]:
kaggle_test.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,30474,2015-07-01,39.0,20.7,2,9,1,1998.0,1,8.9,...,8,0,0,0,1,10,1,0,14,1
1,30475,2015-07-01,79.2,,8,17,1,0.0,3,1.0,...,4,1,1,0,2,11,0,1,12,1
2,30476,2015-07-01,40.5,25.1,3,5,2,1960.0,2,4.8,...,42,11,4,0,10,21,0,10,71,11
3,30477,2015-07-01,62.8,36.0,17,17,1,2016.0,2,62.8,...,1,1,2,0,0,10,0,0,2,0
4,30478,2015-07-01,40.0,40.0,17,17,1,0.0,1,1.0,...,5,1,1,0,2,12,0,1,11,1


In [34]:
macro_df = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/macro.csv/macro.csv")

In [35]:
kaggle_test = pd.merge(kaggle_test, macro_df, on='timestamp')

In [40]:
default_product_type = "Investment"
kaggle_test['product_type'] = kaggle_test['product_type'].fillna(default_product_type)

In [42]:
sbankdata.load_dataset(kaggle_test)

In [43]:
sbankdata.prepare_dataset(load_serializers_from_disk=True)

In [44]:
sbankdata.transform_dataset()



In [45]:
# The extraction method is `extract_model_input_data`. The same
# `bucketized_input_type` as for the training inputs is used, so the arrays
# match the inputs the model was built with.
(kaggle_test_inputs, kaggle_test_onehot_inputs, kaggle_test_embeddings_input) = sbankdata.extract_model_input_data(kaggle_test, predict=True, params = {
    'bucketized_input_type' : 'continuous'
})

In [46]:
# Free the Kaggle arrays (the last name used to be misspelled, which raised NameError).
del kaggle_test, kaggle_test_embeddings_input, kaggle_test_inputs, kaggle_test_onehot_inputs

In [345]:
# NOTE(review): `real_estate_price_model`, `kaggle_split_df_combined_inputs` and
# `kaggle_split_df_embedding_features_final` are not defined anywhere in the
# visible notebook, and the model built above names its continuous input
# 'continuous_input', not 'main_input' - this cell presumably predates the
# current model and needs updating before it can run; confirm.
kaggle_preds = real_estate_price_model.predict(x=dict({
                    'main_input' : kaggle_split_df_combined_inputs
                }.items() + kaggle_split_df_embedding_features_final.items()))

In [346]:
kaggle_preds

array([[  3.07068355e+10],
       [  3.07081196e+10],
       [  3.07076506e+10],
       ..., 
       [  9.80176000e+06],
       [  1.45478860e+07],
       [  1.71274320e+07]], dtype=float32)

In [191]:
final_kaggle_df['price_doc'] = kaggle_preds

In [192]:
final_kaggle_df['price_doc']

0       4.093042e+10
1       4.093284e+10
2       4.093333e+10
3       4.093235e+10
4       4.092968e+10
5       4.093546e+10
6       4.093052e+10
7       4.092911e+10
8       4.093080e+10
9       4.093024e+10
10      4.093242e+10
11      4.092887e+10
12      4.092848e+10
13      4.092789e+10
14      4.093016e+10
15      4.093108e+10
16      4.093927e+10
17      4.093849e+10
18      4.093026e+10
19      4.093700e+10
20      4.093147e+10
21      4.093658e+10
22      4.093368e+10
23      4.093388e+10
24      4.093059e+10
25      4.093547e+10
26      4.093610e+10
27      4.093445e+10
28      4.092772e+10
29      4.093438e+10
            ...     
7632    1.344982e+07
7633    8.391098e+06
7634    1.654414e+07
7635    1.180624e+07
7636    6.630701e+06
7637    9.822717e+06
7638    7.509546e+06
7639    1.351762e+07
7640    1.002495e+07
7641    7.301856e+06
7642    1.362228e+07
7643    1.310399e+07
7644    1.443113e+07
7645    7.393820e+06
7646    1.267712e+07
7647    1.206970e+07
7648    1.271

In [193]:
4.093042e10

40930420000.0

In [271]:
final_kaggle_df[['id', 'price_doc']]

Unnamed: 0,id,price_doc
0,30474,5880385.00
1,30475,8747043.00
2,30476,6298690.00
3,30477,6858484.00
4,30478,5549552.00
5,30479,8727317.00
6,30480,4832307.00
7,30481,4594226.00
8,30482,5736204.00
9,30483,5148752.00


In [272]:
final_kaggle_df[['id', 'price_doc']].to_csv("/mnt/h/Kaggle/Competitions/Russian Bank/models/v" + str(version) + "/submission.csv", index=False) # output submission csv file