# Model v2.1

In [3]:
from keras.layers import Input, LSTM, Dense, Dropout, Embedding, TimeDistributed, Reshape, Flatten
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers.noise import GaussianDropout, GaussianNoise

In [4]:
from keras.models import Model, Sequential
from keras.layers.merge import Concatenate

In [5]:
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

In [6]:
import tensorflow as tf
from keras import backend as K

In [7]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [8]:
from sklearn_pandas import DataFrameMapper

In [9]:
import pandas as pd
import numpy as np

In [10]:
from keras.utils.np_utils import to_categorical

In [11]:
version = 2.1
version_str = "v" + str(version)

In [12]:
LABEL = "price_doc"

In [13]:
def prepare_dataset(df, test=False, trunc_fields=False):
    global LABEL
    t_fields = ['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor','material', 'build_year', 'num_room', 'kitch_sq', 'state', 'product_type', 'sub_area', LABEL]
    
    if not test:
        if trunc_fields:
            df = df[t_fields + LABEL]
        else:
            df = df
    else:
        if trunc_fields:
            df = df[t_fields]
        else:
            df = df
    
    df['timestamp_converted'] = pd.to_datetime(df['timestamp'])
    df['timestamp_day'] = df.apply(lambda row: row['timestamp_converted'].day, axis=1)
    df['timestamp_month'] = df.apply(lambda row: row['timestamp_converted'].month, axis=1)
    df['timestamp_year'] = df.apply(lambda row: row['timestamp_converted'].year, axis=1)
    df['state_refractored'] = df.apply( lambda row: 5 if row['state'] == 33 else row['state'] , axis = 1)
    
    df.fillna(0, inplace=True)
    
    df['state_refractored'] = df['state_refractored'].astype(int)    
    df['material'] = df['material'].astype(int)    
    return df

In [247]:
final_train_df = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/final_training_dataset.csv")

In [248]:
final_train_df = prepare_dataset(final_train_df)

In [249]:
del final_train_df['Unnamed: 0']

In [250]:
final_train_df.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,provision_retail_space_modern_sqm,turnover_catering_per_cap,theaters_viewers_per_1000_cap,seats_theather_rfmin_per_100000_cap,museum_visitis_per_100_cap,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm
0,1,2011-08-20,43,27.0,4.0,0.0,0,0.0,0.0,0.0,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
1,2,2011-08-23,34,19.0,3.0,0.0,0,0.0,0.0,0.0,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
2,3,2011-08-27,43,29.0,2.0,0.0,0,0.0,0.0,0.0,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
3,4,2011-09-01,89,50.0,9.0,0.0,0,0.0,0.0,0.0,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0
4,5,2011-09-05,77,77.0,4.0,0.0,0,0.0,0.0,0.0,...,271.0,6943.0,565.0,0.45356,1240.0,269768.0,22.37,64.12,23587.0,230310.0


## Data prep

In [251]:
def get_bin_boundaries(df, col):
    return np.arange(df[col].min(), df[col].max(), df[col].std())

In [252]:
def bucketize_column(df, col, bins=[]):
    if not bins:
        bins = get_bin_boundaries(df, col)
        if bins.size == 0:
            bins = [df[col].min(), df[col].max()]
    return np.digitize(df[col].values, bins, right=True)

### Notes
- ID_* columns should be embedded!
- _1line are no/yes columns, so they should be labeled...
- "ecology" has values of ['good', 'excellent', 'poor', 'satisfactory', 'no data'],... labelled/one-hot encoded/or embedded?
- church_count_500 and other _count values integer values - bucketize them?
- timestamp_day, timestamp_month, timestamp_year

In [253]:
import math
def determine_dimensions(num_unique, r = 0, k = 1):
    if r == 0:
        return int(math.log(num_unique, 2))
    else:
        return k * int(num_unique ** 1./4.)

In [254]:
def dedup(listy):
    return list(set(listy))

In [255]:
all_feature_columns = [
 'full_sq',
 'life_sq',
 'floor',
 'max_floor',
 'material',
 'build_year',
 'num_room',
 'kitch_sq',
 'state',
 'product_type',
 'sub_area',
 'area_m',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'preschool_quota',
 'preschool_education_centers_raion',
 'children_school',
 'school_quota',
 'school_education_centers_raion',
 'school_education_centers_top_20_raion',
 'hospital_beds_raion',
 'healthcare_centers_raion',
 'university_top_20_raion',
 'sport_objects_raion',
 'additional_education_raion',
 'culture_objects_top_25',
 'culture_objects_top_25_raion',
 'shopping_centers_raion',
 'office_raion',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'full_all',
 'male_f',
 'female_f',
 'young_all',
 'young_male',
 'young_female',
 'work_all',
 'work_male',
 'work_female',
 'ekder_all',
 'ekder_male',
 'ekder_female',
 '0_6_all',
 '0_6_male',
 '0_6_female',
 '7_14_all',
 '7_14_male',
 '7_14_female',
 '0_17_all',
 '0_17_male',
 '0_17_female',
 '16_29_all',
 '16_29_male',
 '16_29_female',
 '0_13_all',
 '0_13_male',
 '0_13_female',
 'raion_build_count_with_material_info',
 'build_count_block',
 'build_count_wood',
 'build_count_frame',
 'build_count_brick',
 'build_count_monolith',
 'build_count_panel',
 'build_count_foam',
 'build_count_slag',
 'build_count_mix',
 'raion_build_count_with_builddate_info',
 'build_count_before_1920',
 'build_count_1921-1945',
 'build_count_1946-1970',
 'build_count_1971-1995',
 'build_count_after_1995',
 'ID_metro',
 'metro_min_avto',
 'metro_km_avto',
 'metro_min_walk',
 'metro_km_walk',
 'kindergarten_km',
 'school_km',
 'park_km',
 'green_zone_km',
 'industrial_km',
 'water_treatment_km',
 'cemetery_km',
 'incineration_km',
 'railroad_station_walk_km',
 'railroad_station_walk_min',
 'ID_railroad_station_walk',
 'railroad_station_avto_km',
 'railroad_station_avto_min',
 'ID_railroad_station_avto',
 'public_transport_station_km',
 'public_transport_station_min_walk',
 'water_km',
 'water_1line',
 'mkad_km',
 'ttk_km',
 'sadovoe_km',
 'bulvar_ring_km',
 'kremlin_km',
 'big_road1_km',
 'ID_big_road1',
 'big_road1_1line',
 'big_road2_km',
 'ID_big_road2',
 'railroad_km',
 'railroad_1line',
 'zd_vokzaly_avto_km',
 'ID_railroad_terminal',
 'bus_terminal_avto_km',
 'ID_bus_terminal',
 'oil_chemistry_km',
 'nuclear_reactor_km',
 'radiation_km',
 'power_transmission_line_km',
 'thermal_power_plant_km',
 'ts_km',
 'big_market_km',
 'market_shop_km',
 'fitness_km',
 'swim_pool_km',
 'ice_rink_km',
 'stadium_km',
 'basketball_km',
 'hospice_morgue_km',
 'detention_facility_km',
 'public_healthcare_km',
 'university_km',
 'workplaces_km',
 'shopping_centers_km',
 'office_km',
 'additional_education_km',
 'preschool_km',
 'big_church_km',
 'church_synagogue_km',
 'mosque_km',
 'theater_km',
 'museum_km',
 'exhibition_km',
 'catering_km',
 'ecology',
 'green_part_500',
 'prom_part_500',
 'office_count_500',
 'office_sqm_500',
 'trc_count_500',
 'trc_sqm_500',
 'cafe_count_500',
 'cafe_sum_500_min_price_avg',
 'cafe_sum_500_max_price_avg',
 'cafe_avg_price_500',
 'cafe_count_500_na_price',
 'cafe_count_500_price_500',
 'cafe_count_500_price_1000',
 'cafe_count_500_price_1500',
 'cafe_count_500_price_2500',
 'cafe_count_500_price_4000',
 'cafe_count_500_price_high',
 'big_church_count_500',
 'church_count_500',
 'mosque_count_500',
 'leisure_count_500',
 'sport_count_500',
 'market_count_500',
 'green_part_1000',
 'prom_part_1000',
 'office_count_1000',
 'office_sqm_1000',
 'trc_count_1000',
 'trc_sqm_1000',
 'cafe_count_1000',
 'cafe_sum_1000_min_price_avg',
 'cafe_sum_1000_max_price_avg',
 'cafe_avg_price_1000',
 'cafe_count_1000_na_price',
 'cafe_count_1000_price_500',
 'cafe_count_1000_price_1000',
 'cafe_count_1000_price_1500',
 'cafe_count_1000_price_2500',
 'cafe_count_1000_price_4000',
 'cafe_count_1000_price_high',
 'big_church_count_1000',
 'church_count_1000',
 'mosque_count_1000',
 'leisure_count_1000',
 'sport_count_1000',
 'market_count_1000',
 'green_part_1500',
 'prom_part_1500',
 'office_count_1500',
 'office_sqm_1500',
 'trc_count_1500',
 'trc_sqm_1500',
 'cafe_count_1500',
 'cafe_sum_1500_min_price_avg',
 'cafe_sum_1500_max_price_avg',
 'cafe_avg_price_1500',
 'cafe_count_1500_na_price',
 'cafe_count_1500_price_500',
 'cafe_count_1500_price_1000',
 'cafe_count_1500_price_1500',
 'cafe_count_1500_price_2500',
 'cafe_count_1500_price_4000',
 'cafe_count_1500_price_high',
 'big_church_count_1500',
 'church_count_1500',
 'mosque_count_1500',
 'leisure_count_1500',
 'sport_count_1500',
 'market_count_1500',
 'green_part_2000',
 'prom_part_2000',
 'office_count_2000',
 'office_sqm_2000',
 'trc_count_2000',
 'trc_sqm_2000',
 'cafe_count_2000',
 'cafe_sum_2000_min_price_avg',
 'cafe_sum_2000_max_price_avg',
 'cafe_avg_price_2000',
 'cafe_count_2000_na_price',
 'cafe_count_2000_price_500',
 'cafe_count_2000_price_1000',
 'cafe_count_2000_price_1500',
 'cafe_count_2000_price_2500',
 'cafe_count_2000_price_4000',
 'cafe_count_2000_price_high',
 'big_church_count_2000',
 'church_count_2000',
 'mosque_count_2000',
 'leisure_count_2000',
 'sport_count_2000',
 'market_count_2000',
 'green_part_3000',
 'prom_part_3000',
 'office_count_3000',
 'office_sqm_3000',
 'trc_count_3000',
 'trc_sqm_3000',
 'cafe_count_3000',
 'cafe_sum_3000_min_price_avg',
 'cafe_sum_3000_max_price_avg',
 'cafe_avg_price_3000',
 'cafe_count_3000_na_price',
 'cafe_count_3000_price_500',
 'cafe_count_3000_price_1000',
 'cafe_count_3000_price_1500',
 'cafe_count_3000_price_2500',
 'cafe_count_3000_price_4000',
 'cafe_count_3000_price_high',
 'big_church_count_3000',
 'church_count_3000',
 'mosque_count_3000',
 'leisure_count_3000',
 'sport_count_3000',
 'market_count_3000',
 'green_part_5000',
 'prom_part_5000',
 'office_count_5000',
 'office_sqm_5000',
 'trc_count_5000',
 'trc_sqm_5000',
 'cafe_count_5000',
 'cafe_sum_5000_min_price_avg',
 'cafe_sum_5000_max_price_avg',
 'cafe_avg_price_5000',
 'cafe_count_5000_na_price',
 'cafe_count_5000_price_500',
 'cafe_count_5000_price_1000',
 'cafe_count_5000_price_1500',
 'cafe_count_5000_price_2500',
 'cafe_count_5000_price_4000',
 'cafe_count_5000_price_high',
 'big_church_count_5000',
 'church_count_5000',
 'mosque_count_5000',
 'leisure_count_5000',
 'sport_count_5000',
 'market_count_5000',
 'timestamp_day',
 'timestamp_month',
 'timestamp_year',
 'state_refractored',
 'oil_urals',
 'gdp_quart',
 'gdp_quart_growth',
 'cpi',
 'ppi',
 'gdp_deflator',
 'balance_trade',
 'balance_trade_growth',
 'usdrub',
 'eurrub',
 'brent',
 'net_capital_export',
 'gdp_annual',
 'gdp_annual_growth',
 'average_provision_of_build_contract',
 'average_provision_of_build_contract_moscow',
 'rts',
 'micex',
 'micex_rgbi_tr',
 'micex_cbi_tr',
 'deposits_value',
 'deposits_growth',
 'deposits_rate',
 'mortgage_value',
 'mortgage_growth',
 'mortgage_rate',
 'grp',
 'grp_growth',
 'income_per_cap',
 'real_dispos_income_per_cap_growth',
 'salary',
 'salary_growth',
 'fixed_basket',
 'retail_trade_turnover',
 'retail_trade_turnover_per_cap',
 'retail_trade_turnover_growth',
 'labor_force',
 'unemployment',
 'employment',
 'invest_fixed_capital_per_cap',
 'invest_fixed_assets',
 'profitable_enterpr_share',
 'unprofitable_enterpr_share',
 'share_own_revenues',
 'overdue_wages_per_cap',
 'fin_res_per_cap',
 'marriages_per_1000_cap',
 'divorce_rate',
 'construction_value',
 'invest_fixed_assets_phys',
 'pop_natural_increase',
 'pop_migration',
 'pop_total_inc',
 'childbirth',
 'mortality',
 'housing_fund_sqm',
 'lodging_sqm_per_cap',
 'water_pipes_share',
 'baths_share',
 'sewerage_share',
 'gas_share',
 'hot_water_share',
 'electric_stove_share',
 'heating_share',
 'old_house_share',
 'average_life_exp',
 'infant_mortarity_per_1000_cap',
 'perinatal_mort_per_1000_cap',
 'incidence_population',
 'rent_price_4+room_bus',
 'rent_price_3room_bus',
 'rent_price_2room_bus',
 'rent_price_1room_bus',
 'rent_price_3room_eco',
 'rent_price_2room_eco',
 'rent_price_1room_eco',
 'load_of_teachers_preschool_per_teacher',
 'child_on_acc_pre_school',
 'load_of_teachers_school_per_teacher',
 'students_state_oneshift',
 'modern_education_share',
 'old_education_build_share',
 'provision_doctors',
 'provision_nurse',
 'load_on_doctors',
 'power_clinics',
 'hospital_beds_available_per_cap',
 'hospital_bed_occupancy_per_year',
 'provision_retail_space_sqm',
 'provision_retail_space_modern_sqm',
 'turnover_catering_per_cap',
 'theaters_viewers_per_1000_cap',
 'seats_theather_rfmin_per_100000_cap',
 'museum_visitis_per_100_cap',
 'bandwidth_sports',
 'population_reg_sports_share',
 'students_reg_sports_share',
 'apartment_build',
 'apartment_fund_sqm']

"""
Notes
ID_* columns should be embedded!
_1line are no/yes columns, so they should be labeled...
"ecology" has values of ['good', 'excellent', 'poor', 'satisfactory', 'no data'],... labelled/one-hot encoded/or embedded?
church_count_500 and other _count values integer values - bucketize them?
timestamp_day, timestamp_month, timestamp_year
"""

all_feature_columns = dedup(all_feature_columns)

_id_tag = "ID_"
_1line_tag = "_1line"
_raion_tag = "_raion"
_count_tag = "_count"
_km_tag = "_km"
_sq_tag = "_sq"

def get_subcolumns_by_tag(tag):
    global all_feature_columns
    fts = []
    for f in all_feature_columns:
        if tag in f:
            fts.append(f)
    return fts

def get_raion_columns(return_yesno=True, return_cont=False):
    global _raion_tag
    yesno_cols = []
    continuous_cols = []
    for c in get_subcolumns_by_tag(_raion_tag):
        if any(w in final_train_df[c].unique().tolist() for w in ['yes', 'no']):
            yesno_cols.append(c)
        else:
            continuous_cols.append(c)
    if return_yesno:
        return yesno_cols
    elif return_cont:
        return continuous_cols
    else:
        return yesno_cols, continuous_cols
    

integerized_embedding_columns = get_subcolumns_by_tag(_id_tag)
unlabeled_embedding_columns = ["ecology", "sub_area"]

embedding_columns = integerized_embedding_columns + unlabeled_embedding_columns


# Note: This will not modify the existing feature column(s).
# Rather, it will create a new supplemental feature column with bucketization
# Also, bucketized columns will be one-hot encoded, after being label encoded...
bucketize_columns = [
    # Specific columns to bucketize
]# + get_subcolumns_by_tag(_count_tag) + get_subcolumns_by_tag(_km_tag) + get_subcolumns_by_tag(_sq_tag)

label_columns = [
    # Specific label columns to include...    
] + get_raion_columns(return_yesno=True) + get_subcolumns_by_tag(_1line_tag) + unlabeled_embedding_columns

processed_feature_tag = "_normalized_feature"
bucketized_feature_tag = "_bucketized" + processed_feature_tag

def get_bucketized_tagged_feature_columns(cols):
    fts = []
    for c in cols:
        fts.append(c + bucketized_feature_tag)
    return fts

one_hot_columns = [
    'material',
    'state_refractored',
    'product_type',
    'culture_objects_top_25',
    
] + get_bucketized_tagged_feature_columns(bucketize_columns)

unknown_label = "_unknown_val"

exclude_normalize_continuous_columns = [
    # Specific columns to exclude from normalization...
    # TODO: Should we exclude timestamp_month, day, and year??
] + one_hot_columns + label_columns + integerized_embedding_columns


normalize_continuous_columns = [c for c in all_feature_columns if c not in exclude_normalize_continuous_columns]
all_continuous_columns = [c for c in all_feature_columns if c not in one_hot_columns and c not in label_columns]

In [256]:
final_train_df = final_train_df[all_feature_columns + [LABEL]]

In [257]:
final_train_df.head()

Unnamed: 0,sport_count_5000,railroad_station_avto_min,mosque_count_500,cafe_count_5000_price_2500,trc_sqm_3000,big_road1_1line,build_count_brick,office_count_1500,healthcare_centers_raion,housing_fund_sqm,...,cafe_count_3000_price_500,perinatal_mort_per_1000_cap,load_of_teachers_preschool_per_teacher,cpi,church_count_5000,product_type,rent_price_3room_bus,cafe_count_5000_price_500,fin_res_per_cap,price_doc
0,52,6.905893,0,9,1419204,no,0.0,3,1,218.0,...,21,5.53,793.319561,354.0,22,Investment,77.93,39,226.214157,5850000
1,66,4.679745,0,15,491565,no,67.0,3,1,218.0,...,11,5.53,793.319561,354.0,29,Investment,77.93,49,226.214157,6000000
2,67,1.70142,0,10,52550,no,206.0,0,1,218.0,...,9,5.53,793.319561,354.0,27,Investment,77.93,29,226.214157,5700000
3,26,5.271136,0,11,205756,no,124.0,2,1,218.0,...,5,5.53,793.319561,353.2,4,Investment,94.02,7,226.214157,13100000
4,195,2.156284,0,319,2296870,no,643.0,93,4,218.0,...,266,5.53,793.319561,353.2,236,Investment,94.02,566,226.214157,16331452


In [258]:
final_train_df = final_train_df.convert_objects(convert_numeric=True)

  if __name__ == '__main__':


In [259]:
final_train_df.fillna(0, inplace=True)

## Prepare numeric data scalers & normalizers

## TODO: Convert all embedding columns to integers (some are float labels). Decide when/where to do this, and how!

### Bucketize columns

In [260]:
for column in bucketize_columns:
    final_train_df[column + "_bucketized" + processed_feature_tag] = bucketize_column(final_train_df, column)

### Scale continuous columns

In [261]:
norm_scalers = {column : preprocessing.StandardScaler() for column in normalize_continuous_columns}

In [262]:
len(norm_scalers)

368

### One-hot encoding columns

In [263]:
ont_hot_scalers = {column : (preprocessing.LabelEncoder(), preprocessing.OneHotEncoder(sparse=False, handle_unknown = 'ignore')) for column in one_hot_columns}

In [264]:
len(ont_hot_scalers)

4

### Label encoding columns

In [265]:
label_scalers = {column : preprocessing.LabelEncoder() for column in label_columns}

In [266]:
len(label_scalers)

13

## Apply scaling and normalization to the data

In [267]:
known_values = {}

In [268]:
for column in normalize_continuous_columns:
    c_scaler = norm_scalers[column]
    try:
        c_scaler.fit(final_train_df[[column]].values)
    except:
        print 'failed: ', column

In [269]:
for column in one_hot_columns:
    label_enc, one_hot_enc = ont_hot_scalers[column]
    known_values[column] = final_train_df[column].unique().tolist()
    if column not in bucketize_columns:
        label_enc.fit(final_train_df[column].values.tolist() + [unknown_label])
        r = label_enc.transform(final_train_df[column].values.tolist())
        r = np.expand_dims(r, axis=1)
    else:
        r = np.expand_dims(final_train_df[[column]].values, axis=1)
    one_hot_enc.fit(r)

In [270]:
for column in label_columns:
    known_values[column] = final_train_df[column].unique().tolist()
    label_enc = label_scalers[column]
    label_enc.fit(final_train_df[column].values.tolist() + [unknown_label])

### Test the scalers...

In [271]:
norm_scalers['timestamp_day'].transform([final_train_df.ix[0]['timestamp_day']])



array([ 0.40933319])

In [272]:
ont_hot_scalers['material'][1].transform([final_train_df.ix[70]['material']])



array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [273]:
label_scalers['sub_area'].transform([final_train_df.ix[0]['sub_area']])

array([9])

In [274]:
# According to https://www.tensorflow.org/get_started/embedding_viz#metadata,
# if the file only has a single column, then they DON'T assume a header row.
# Instead they assume that each line is one vocabulary...
sub_area_metadata = open('/mnt/h/Kaggle/Competitions/Russian Bank/models/metadata/sub_area_metadata.tsv', 'w+')
sub_area_metadata.write("\n".join([label_scalers['sub_area'].inverse_transform(i) for i in range(146)]))
sub_area_metadata.close()

### Save them to disk

In [275]:
from sklearn.externals import joblib

In [276]:
joblib.dump(norm_scalers, '/mnt/h/Kaggle/Competitions/Russian Bank/code/data/norm_scalers_without_bucketized.pkl') 

['/mnt/h/Kaggle/Competitions/Russian Bank/code/data/norm_scalers_without_bucketized.pkl']

In [277]:
joblib.dump(ont_hot_scalers, '/mnt/h/Kaggle/Competitions/Russian Bank/code/data/one_hot_scalers_without_bucketized.pkl') 

['/mnt/h/Kaggle/Competitions/Russian Bank/code/data/one_hot_scalers_without_bucketized.pkl']

In [281]:
joblib.dump(label_scalers, '/mnt/h/Kaggle/Competitions/Russian Bank/code/data/label_scalers_without_bucketized.pkl') 

['/mnt/h/Kaggle/Competitions/Russian Bank/code/data/label_scalers_without_bucketized.pkl']

### Transform the whole training data

In [282]:
len(known_values['sub_area'])

146

In [283]:
def transform_bucketized_values(df):
    global bucketize_columns, processed_feature_tag
    for column in bucketize_columns:
        df[column + "_bucketized" + processed_feature_tag] = bucketize_column(df, column)
    return df

In [284]:
def transform_continuous_values(df):
    global normalize_continuous_columns, processed_feature_tag, norm_scalers
    for continuous_norm_col in normalize_continuous_columns:
        scaler = norm_scalers[continuous_norm_col]
        scaled_values = scaler.transform(df[[continuous_norm_col]].values)
        df[continuous_norm_col + processed_feature_tag] = scaled_values
    return df

In [285]:
def transform_onehot_values(df):
    global one_hot_columns, one_hot_scalers, bucketize_columns, unknown_label, processed_feature_tag, known_values
    for one_hot_col in one_hot_columns:
        l_scaler = ont_hot_scalers[one_hot_col][0]
        scaler = ont_hot_scalers[one_hot_col][1]
        if one_hot_col not in bucketize_columns:
            vals = [c if c in known_values[one_hot_col] else unknown_label for c in df[[one_hot_col]].values.ravel().tolist() ]
            r = l_scaler.transform(vals)
        else:
            r = df[[one_hot_col]].values
        r = np.expand_dims(r, axis=1)
        scaled_values = scaler.transform(r)
        df[one_hot_col + processed_feature_tag] = list(scaled_values)
    return df

In [286]:
def transform_label_values(df):
    global label_columns, label_scalers, processed_feature_tag, unknown_label
    for label_col in label_columns:
        label_enc = label_scalers[label_col]
        vals = [c if c in known_values[label_col] else unknown_label for c in df[[label_col]].values.ravel().tolist() ] 
        encoded_values = label_enc.transform(vals)
        df[label_col + processed_feature_tag] = encoded_values
    return df

In [287]:
def transform_dataset(df):
    df = transform_bucketized_values(df)
    df = transform_continuous_values(df)
    df = transform_onehot_values(df)
    df = transform_label_values(df)
    return df

In [288]:
final_train_df = transform_dataset(final_train_df)

In [289]:
def extract_normalized_feature(df, feature_name):
    global integerized_embedding_columns
    if feature_name in integerized_embedding_columns:
        name = feature_name
    else:
        name = feature_name + processed_feature_tag
    return np.array(df[name].values.tolist())

In [50]:
# To get a bucketized column, do this
extract_normalized_feature(final_train_df, get_bucketized_tagged_feature_columns(["full_sq"])[0])

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [290]:
extract_normalized_feature(final_train_df, "life_sq")

array([-0.00404841, -0.16882931,  0.03714682, ..., -0.56018395,
        0.09893965,  0.0165492 ])

In [291]:
extract_normalized_feature(final_train_df, "sub_area")

array([  9,  70, 129, ..., 103,  77,  75])

In [292]:
def extract_categorical_features(df, include_embedding_cols = True):
    global one_hot_columns, label_columns
    vals = {}
    for c in one_hot_columns + label_columns:
        if c in embedding_columns:
            if include_embedding_cols:
                vals[c]  = np.expand_dims(extract_normalized_feature(df, c), 1)
        else:
            if c not in one_hot_columns:
                vals[c]  = np.expand_dims(extract_normalized_feature(df, c), 1)
            else:
                vals[c]  = extract_normalized_feature(df, c)
    return vals

In [293]:
def extract_embedding_features(df):
    global embedding_columns
    vals = {}
    for c in embedding_columns:
        vals[c]  = np.expand_dims(extract_normalized_feature(df, c), axis=1)
    return vals

In [294]:
def extract_normalized_features(df):
    fts = []
    for col in normalize_continuous_columns:
        fts.append(np.expand_dims(extract_normalized_feature(df, col), axis=1))
    return np.concatenate(tuple(fts), axis=1)

## Prepare training and testing datasets

In [295]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2):
    perm = np.random.permutation(df.index)
    m = len(df)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.ix[perm[:train_end]]
    validate = df.ix[perm[train_end:validate_end]]
    test = df.ix[perm[validate_end:]]
    return train, validate, test

In [296]:
train_split_df, test_split_df = train_test_split(final_train_df, test_size = 0.1)

In [297]:
train_split_df_continuous_features = extract_normalized_features(train_split_df)
test_split_df_continuous_features = extract_normalized_features(test_split_df)

In [298]:
train_split_df_continuous_features.shape, test_split_df_continuous_features.shape

((27423, 368), (3048, 368))

In [299]:
train_split_df_categorical_features = extract_categorical_features(train_split_df, include_embedding_cols=False)
test_split_df_categorical_features = extract_categorical_features(test_split_df, include_embedding_cols=False)

In [300]:
train_split_df_embedding_features = extract_embedding_features(train_split_df)
test_split_df_embedding_features = extract_embedding_features(test_split_df)

In [301]:
train_split_df_embedding_features

{'ID_big_road1': array([[14],
        [29],
        [ 5],
        ..., 
        [11],
        [13],
        [ 1]]), 'ID_big_road2': array([[ 8],
        [35],
        [30],
        ..., 
        [55],
        [27],
        [23]]), 'ID_bus_terminal': array([[ 5],
        [ 1],
        [ 5],
        ..., 
        [14],
        [ 8],
        [ 6]]), 'ID_metro': array([[ 77],
        [122],
        [ 90],
        ..., 
        [108],
        [ 21],
        [ 36]]), 'ID_railroad_station_avto': array([[12],
        [26],
        [64],
        ..., 
        [75],
        [24],
        [16]]), 'ID_railroad_station_walk': array([[ 12.],
        [ 26.],
        [ 64.],
        ..., 
        [ 75.],
        [ 24.],
        [ 16.]]), 'ID_railroad_terminal': array([[121],
        [ 83],
        [121],
        ..., 
        [  5],
        [ 50],
        [  5]]), 'ecology': array([[4],
        [5],
        [4],
        ..., 
        [2],
        [3],
        [1]]), 'sub_area': array([[ 32],
        [

In [302]:
def combine_inputs(features, axis=1):
    fts = []
    for f in features:
        if len(f.shape) == 1:
            fts.append(np.expand_dims(f, axis=1))
        else:
            fts.append(f)
    return np.concatenate(tuple(fts), axis=axis)

In [303]:
train_split_df_combined_inputs = combine_inputs([train_split_df_continuous_features] +  train_split_df_categorical_features.values())

In [304]:
test_split_df_combined_inputs = combine_inputs([test_split_df_continuous_features] +  test_split_df_categorical_features.values())

In [305]:
train_split_df_combined_inputs.shape, test_split_df_combined_inputs.shape

((27423, 396), (3048, 396))

In [306]:
num_main_features = train_split_df_combined_inputs.shape[1]
'Total number of main continuous input features: ', num_main_features

('Total number of main continuous input features: ', 396)

In [307]:
def prepare_embedding_model_header(train_embedding_features, test_embedding_features):
    e_dict = {}
    for e_name, e_in in train_embedding_features.items():
        vocab_size = test_embedding_features[e_name].max()
        vocab_size = max(vocab_size, e_in.max())
        e_dict[e_name] = (determine_dimensions(vocab_size), int(vocab_size) + 100) # the 100 is a safe margin for unseen ids...
    return e_dict

## Model

In [310]:
"""
The main model, version: 2.1
""" 
def main_model_v2_1(num_continuous_features, embedding_columns = {}, output_activation='relu'):
    """
    embedding_columns is a dict consisting of embedding_column_name => (embedding_dim, vocab_size)
    """
    with tf.name_scope("real_estate_price_model"):
        
        # Inputs
        with tf.name_scope("inputs"):
            ## Continuous
            main_input = Input(shape=(num_continuous_features,), name="main_input")
            
            ## Categorical (labelled)
            embedding_inputs = {}
            embedding_input_names = []
            embedding_input_objects = []
            for embedding_col_name, (embedding_dim, embedding_size) in embedding_columns.items():
                e_input = Input(shape=(1,), dtype='int32', name=embedding_col_name + '_input')
                embedding_input_objects.append(e_input)
                embedding_inputs[embedding_col_name] = (e_input, embedding_dim, embedding_size)
                embedding_input_names.append(embedding_col_name + '_input')
                
        embedded_vectors = []
        for embedding_col_name, (embedding_input, embedding_dim, embedding_size) in embedding_inputs.items():
            with tf.name_scope(embedding_col_name + "_embedding"):
                e_vector = Embedding(output_dim = embedding_dim, input_dim = embedding_size, input_length = 1, name=embedding_col_name + "_embedding")(embedding_input)
                e_vector = Flatten()(e_vector)
                embedded_vectors.append(e_vector)
                            
        with tf.name_scope("fully_connected"):
            ## Concatenate all suboutput vectors into one big vector
            merged_input_vector = Concatenate()([main_input] + embedded_vectors)
            merged_input_vector = GaussianNoise(0.01)(merged_input_vector)      
        
            ## Some final hidden layers
            #merged_output_vector = GaussianDropout(0.4)(merged_output_vector)
            merged_output_vector = Dense(512, activation='relu')(merged_input_vector)
            merged_output_vector = GaussianDropout(0.5)(merged_output_vector) # Batch normalize this combined pre-output vector
            merged_output_vector = Dense(512, activation='relu')(merged_output_vector)
            merged_output_vector = Dropout(0.5)(merged_output_vector) # Batch normalize this combined pre-output vector
            #merged_output_vector = Dropout(0.4)(merged_output_vector)
            merged_output_vector = Dense(256, activation='relu')(merged_output_vector)
            merged_output_vector = Dropout(0.4)(merged_output_vector) # Batch normalize this combined pre-output vector
            #merged_output_vector = Dropout(0.4)(merged_output_vector)
            merged_output_vector = Dense(128, activation='relu')(merged_output_vector)
            merged_output_vector = Dropout(0.25)(merged_output_vector) # Batch normalize this combined pre-output vector
    
            main_output = Dense(1, activation=output_activation, name="main_output")(merged_output_vector)
        
        return [main_input] + embedding_input_objects, main_output

In [311]:
embedding_header = prepare_embedding_model_header(train_split_df_embedding_features, test_split_df_embedding_features)
inputs, outputs = main_model_v2_1(num_main_features, embedding_header)

real_estate_price_model = Model(inputs=inputs, outputs=outputs)

In [312]:
from IPython.display import Image, display, SVG
from keras.utils.vis_utils import model_to_dot

# Show the model in ipython notebook
SVG(model_to_dot(real_estate_price_model).create(prog='dot', format='svg'))

OSError: [Errno 12] Cannot allocate memory

In [None]:
# Save the model as png file
from keras.utils.vis_utils import plot_model
plot_model(real_estate_price_model, to_file='/mnt/h/Kaggle/Competitions/Russian Bank/models/'+version_str+'/model.png', show_shapes=True)

In [None]:
real_estate_price_model.summary()

In [313]:
def root_mean_squared_logarithmic_error(y_true, y_pred):
    y_pred_log = K.log(y_pred + 1.)
    y_true_log = K.log(y_true + 1.)
    return K.sqrt(K.mean(K.square(y_pred_log - y_true_log), axis = -1))

In [314]:
real_estate_price_model.compile(optimizer = "adam", 
                          loss = root_mean_squared_logarithmic_error
                         )

### Train the model

In [315]:
epochs = 100

In [316]:
checkpointer = ModelCheckpoint(filepath="/mnt/h/Kaggle/Competitions/Russian Bank/models/" + version_str + "/" + "weights.{epoch:02d}-{val_loss:.2f}.hdf5", verbose=1, save_best_only=True)
tensorboard = TensorBoard(log_dir="/mnt/h/Kaggle/Competitions/Russian Bank/models/" + version_str + "/logs/",
                          histogram_freq=0, 
                          write_graph=True, 
                         # write_grads=True, 
                          write_images=False, 
                          embeddings_freq=5, 
                          embeddings_layer_names=[
                              'sub_area_embedding'
                          ], embeddings_metadata={
                              'sub_area_embedding':'/mnt/h/Kaggle/Competitions/Russian Bank/models/metadata/sub_area_metadata.tsv'
                          })
early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='auto')
callbacks = [checkpointer, tensorboard, early_stopping]

In [317]:
train_split_df_embedding_features_final = {}
for k, v in train_split_df_embedding_features.items():
    train_split_df_embedding_features_final[k + "_input"] = v

In [318]:
test_split_df_embedding_features_final = {}
for k, v in test_split_df_embedding_features.items():
    test_split_df_embedding_features_final[k + "_input"] = v

In [319]:
real_estate_price_model.fit(x=dict({
                    'main_input' : train_split_df_combined_inputs
                }.items() + train_split_df_embedding_features_final.items()),
                y = train_split_df['price_doc'].values,
                callbacks=callbacks,
                verbose = 2,
                validation_split=0.25,
                shuffle=True,
                epochs=epochs)

Train on 20567 samples, validate on 6856 samples
Epoch 1/100
Epoch 00000: val_loss improved from inf to 0.36732, saving model to /mnt/h/Kaggle/Competitions/Russian Bank/models/v2.1/weights.00-0.37.hdf5
88s - loss: 1.8101 - val_loss: 0.3673
Epoch 2/100
Epoch 00001: val_loss improved from 0.36732 to 0.32972, saving model to /mnt/h/Kaggle/Competitions/Russian Bank/models/v2.1/weights.01-0.33.hdf5
39s - loss: 0.3673 - val_loss: 0.3297
Epoch 3/100
Epoch 00002: val_loss improved from 0.32972 to 0.29471, saving model to /mnt/h/Kaggle/Competitions/Russian Bank/models/v2.1/weights.02-0.29.hdf5
52s - loss: 0.3376 - val_loss: 0.2947
Epoch 4/100
Epoch 00003: val_loss improved from 0.29471 to 0.26822, saving model to /mnt/h/Kaggle/Competitions/Russian Bank/models/v2.1/weights.03-0.27.hdf5
51s - loss: 0.3090 - val_loss: 0.2682
Epoch 5/100
Epoch 00004: val_loss improved from 0.26822 to 0.26066, saving model to /mnt/h/Kaggle/Competitions/Russian Bank/models/v2.1/weights.04-0.26.hdf5
42s - loss: 0.2949

KeyboardInterrupt: 

In [320]:
real_estate_price_model.load_weights('/mnt/h/Kaggle/Competitions/Russian Bank/models/v'+str(version) + '/weights.04-0.26.hdf5')

In [321]:
real_estate_price_model.save('/mnt/h/Kaggle/Competitions/Russian Bank/models/v'+str(version) + '/trained_model.04-0.26.hdf5')

In [322]:
train_loss = real_estate_price_model.evaluate(x=dict({
                    'main_input' : train_split_df_combined_inputs
                }.items() + train_split_df_embedding_features_final.items()),
                y = train_split_df['price_doc'].values,
                      verbose = 2)
'Train loss: ', train_loss

('Train loss: ', 0.25760209806220247)

In [323]:
test_loss = real_estate_price_model.evaluate(x=dict({
                    'main_input' : test_split_df_combined_inputs
                }.items() + test_split_df_embedding_features_final.items()),
                y = test_split_df['price_doc'].values,
                      verbose = 2)
'Test loss: ', test_loss

('Test loss: ', 0.25061288374302582)

In [86]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

In [87]:
seed = 7
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [88]:
kfold.get_n_splits(final_train_df, final_train_df[[LABEL]].values)

10

In [None]:
eval_loss_scores = []

for idx, (train, test) in enumerate(kfold.split(final_train_df, final_train_df[[LABEL]].values.ravel())):
    
    temp_x, temp_y = final_train_df.ix[train, :], final_train_df.ix[train, :][LABEL].values
    temp_test_x, temp_test_y = final_train_df.ix[test, :], final_train_df.ix[test, :][LABEL].values
    
    temp_train_split_df_continuous_features = extract_normalized_features(temp_x)
    temp_test_split_df_continuous_features = extract_normalized_features(temp_test_x)

    temp_train_split_df_categorical_features = extract_categorical_features(temp_x, include_embedding_cols=False)
    temp_test_split_df_categorical_features = extract_categorical_features(temp_test_x, include_embedding_cols=False)
    
    temp_train_split_df_embedding_features = extract_embedding_features(temp_x)
    temp_test_split_df_embedding_features = extract_embedding_features(temp_test_x)
    
    temp_train_split_df_embedding_features_final = {}
    for k, v in temp_train_split_df_embedding_features.items():
        temp_train_split_df_embedding_features_final[k + "_input"] = v
    
    temp_test_split_df_embedding_features_final = {}
    for k, v in temp_test_split_df_embedding_features.items():
        temp_test_split_df_embedding_features_final[k + "_input"] = v
        
    temp_train_split_df_combined_inputs = combine_inputs([temp_train_split_df_continuous_features] 
                                                         +  temp_train_split_df_categorical_features.values())
        
    temp_test_split_df_combined_inputs = combine_inputs([temp_test_split_df_continuous_features] 
                                                         +  temp_test_split_df_categorical_features.values())
    
    print 'Cross validating.... step: ', idx
    
    temp_train_loss = real_estate_price_model.evaluate(x=dict({
                    'main_input' : temp_train_split_df_combined_inputs
                }.items() + temp_train_split_df_embedding_features_final.items()),
                y = temp_y,
                      verbose = 2)
    print 'Temp train loss: ', temp_train_loss, ' index: ', idx
    eval_loss_scores.append(temp_train_loss)
    
    temp_test_loss = real_estate_price_model.evaluate(x=dict({
                    'main_input' : temp_test_split_df_combined_inputs
                }.items() + temp_test_split_df_embedding_features_final.items()),
                y = temp_test_y,
                      verbose = 2)
    print 'Temp test loss: ', temp_test_loss, ' index: ', idx
    eval_loss_scores.append(temp_test_loss)
    print '\n\n'
    
print eval_loss_scores



Cross validating.... step:  0
Temp train loss:  0.218764170843  index:  0
Temp test loss:  0.210351072047  index:  0



Cross validating.... step: 


KeyboardInterrupt



 1


In [253]:
print("%.2f%% (+/- %.2f%%)" % (np.mean(eval_loss_scores), np.std(eval_loss_scores)))

0.25% (+/- 0.00%)


In [1]:
from keras.models import load_model

Using TensorFlow backend.


In [80]:
real_estate_price_model_duplicate = load_model('/mnt/h/Kaggle/Competitions/Russian Bank/models/v'+str(version) + '/trained_model.27-0.22.hdf5', custom_objects={"root_mean_squared_logarithmic_error":root_mean_squared_logarithmic_error})

ValueError: Optimizer weight shape (1024,) not compatible with provided weight shape (105, 2)

In [324]:
test_preds = real_estate_price_model.predict(x=dict({
                    'main_input' : test_split_df_combined_inputs
                }.items() + test_split_df_embedding_features_final.items()))
test_actuals = test_split_df[:10].price_doc.values

In [325]:
zip(test_actuals, test_preds.ravel())

[(8400000, 10090224.0),
 (3488177, 4078434.5),
 (4700000, 6572524.0),
 (8600000, 7172507.5),
 (4649400, 4842039.5),
 (1000000, 5185049.5),
 (3239735, 3047429.0),
 (10800000, 9274124.0),
 (4940760, 4281653.5),
 (4722360, 3708248.2)]

## Generate Kaggle Submission file

In [326]:
kaggle_test = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/test.csv/test.csv")

In [327]:
kaggle_test.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,30474,2015-07-01,39.0,20.7,2,9,1,1998.0,1,8.9,...,8,0,0,0,1,10,1,0,14,1
1,30475,2015-07-01,79.2,,8,17,1,0.0,3,1.0,...,4,1,1,0,2,11,0,1,12,1
2,30476,2015-07-01,40.5,25.1,3,5,2,1960.0,2,4.8,...,42,11,4,0,10,21,0,10,71,11
3,30477,2015-07-01,62.8,36.0,17,17,1,2016.0,2,62.8,...,1,1,2,0,0,10,0,0,2,0
4,30478,2015-07-01,40.0,40.0,17,17,1,0.0,1,1.0,...,5,1,1,0,2,12,0,1,11,1


In [328]:
macro_df = pd.read_csv("/mnt/h/Kaggle/Competitions/Russian Bank/data/macro.csv/macro.csv")

In [329]:
final_kaggle_df = pd.merge(kaggle_test, macro_df, on='timestamp')

In [330]:
final_kaggle_df = prepare_dataset(final_kaggle_df)

In [331]:
final_kaggle_df.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,bandwidth_sports,population_reg_sports_share,students_reg_sports_share,apartment_build,apartment_fund_sqm,timestamp_converted,timestamp_day,timestamp_month,timestamp_year,state_refractored
0,30474,2015-07-01,39.0,20.7,2,9,1,1998.0,1,8.9,...,463938.0,0.0,0.0,0.0,234576.9,2015-07-01,1,7,2015,3
1,30475,2015-07-01,79.2,0.0,8,17,1,0.0,3,1.0,...,463938.0,0.0,0.0,0.0,234576.9,2015-07-01,1,7,2015,1
2,30476,2015-07-01,40.5,25.1,3,5,2,1960.0,2,4.8,...,463938.0,0.0,0.0,0.0,234576.9,2015-07-01,1,7,2015,2
3,30477,2015-07-01,62.8,36.0,17,17,1,2016.0,2,62.8,...,463938.0,0.0,0.0,0.0,234576.9,2015-07-01,1,7,2015,3
4,30478,2015-07-01,40.0,40.0,17,17,1,0.0,1,1.0,...,463938.0,0.0,0.0,0.0,234576.9,2015-07-01,1,7,2015,1


In [332]:
final_kaggle_df = final_kaggle_df[all_feature_columns ]

In [231]:
[c for c in final_kaggle_df.columns.tolist() if "normalized" in c]

['sport_count_5000_bucketized_normalized_feature',
 'mosque_count_500_bucketized_normalized_feature',
 'cafe_count_5000_price_2500_bucketized_normalized_feature',
 'build_count_brick_bucketized_normalized_feature',
 'office_count_1500_bucketized_normalized_feature',
 'office_count_500_bucketized_normalized_feature',
 'cafe_count_1000_price_1000_bucketized_normalized_feature',
 'build_count_foam_bucketized_normalized_feature',
 'cafe_count_3000_price_1000_bucketized_normalized_feature',
 'leisure_count_1000_bucketized_normalized_feature',
 'cafe_count_1500_price_1500_bucketized_normalized_feature',
 'mosque_count_2000_bucketized_normalized_feature',
 'trc_count_500_bucketized_normalized_feature',
 'leisure_count_2000_bucketized_normalized_feature',
 'trc_count_2000_bucketized_normalized_feature',
 'cafe_count_500_price_1500_bucketized_normalized_feature',
 'cafe_count_3000_price_1500_bucketized_normalized_feature',
 'build_count_after_1995_bucketized_normalized_feature',
 'cafe_count_50

In [333]:
final_kaggle_df['old_education_build_share'] = final_kaggle_df.apply(lambda row: int(str(row['old_education_build_share']).replace(",","")), axis=1)

In [334]:
final_kaggle_df['modern_education_share'] = final_kaggle_df.apply(lambda row: int(str(row['modern_education_share']).replace(",","")), axis=1)

In [335]:
default_product_type = "Investment"

In [336]:
final_kaggle_df['product_type'] = final_kaggle_df.apply(lambda row: row['product_type'] if row['product_type'] != 0 else default_product_type, axis=1)

In [337]:
final_kaggle_df = transform_dataset(final_kaggle_df)

In [198]:
final_kaggle_df[[c for c in final_kaggle_df.columns.tolist() if "normalized" in c]]

Unnamed: 0,sport_count_5000_bucketized_normalized_feature,mosque_count_500_bucketized_normalized_feature,cafe_count_5000_price_2500_bucketized_normalized_feature,build_count_brick_bucketized_normalized_feature,office_count_1500_bucketized_normalized_feature,office_count_500_bucketized_normalized_feature,cafe_count_1000_price_1000_bucketized_normalized_feature,build_count_foam_bucketized_normalized_feature,cafe_count_3000_price_1000_bucketized_normalized_feature,leisure_count_1000_bucketized_normalized_feature,...,nuclear_reactor_raion_normalized_feature,railroad_terminal_raion_normalized_feature,incineration_raion_normalized_feature,big_market_raion_normalized_feature,thermal_power_plant_raion_normalized_feature,big_road1_1line_normalized_feature,water_1line_normalized_feature,railroad_1line_normalized_feature,ecology_normalized_feature,sub_area_normalized_feature
0,1,0,0,2,0,0,0,1,1,0,...,1,1,1,1,1,1,1,1,5,38
1,1,0,1,0,0,0,1,0,1,0,...,1,1,1,1,1,1,1,1,3,103
2,2,0,1,2,1,0,1,0,1,1,...,1,1,1,1,2,1,1,1,4,84
3,1,0,1,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,3,105
4,1,0,1,0,0,0,1,0,1,0,...,1,1,1,1,1,1,1,1,3,103
5,3,0,2,2,1,1,1,0,1,0,...,1,1,1,1,1,1,1,2,5,24
6,1,0,1,1,1,0,1,0,1,1,...,1,1,1,1,1,1,1,1,2,124
7,1,0,1,0,0,0,0,0,1,0,...,1,1,1,2,1,1,1,1,3,102
8,1,0,1,2,0,0,0,0,1,0,...,1,1,1,1,1,1,1,1,3,136
9,1,0,1,0,0,0,0,0,1,0,...,1,1,1,1,1,1,1,1,3,103


In [199]:
[c for c in final_kaggle_df.columns.tolist() if "normalized" in c]

['sport_count_5000_bucketized_normalized_feature',
 'mosque_count_500_bucketized_normalized_feature',
 'cafe_count_5000_price_2500_bucketized_normalized_feature',
 'build_count_brick_bucketized_normalized_feature',
 'office_count_1500_bucketized_normalized_feature',
 'office_count_500_bucketized_normalized_feature',
 'cafe_count_1000_price_1000_bucketized_normalized_feature',
 'build_count_foam_bucketized_normalized_feature',
 'cafe_count_3000_price_1000_bucketized_normalized_feature',
 'leisure_count_1000_bucketized_normalized_feature',
 'cafe_count_1500_price_1500_bucketized_normalized_feature',
 'mosque_count_2000_bucketized_normalized_feature',
 'trc_count_500_bucketized_normalized_feature',
 'leisure_count_2000_bucketized_normalized_feature',
 'trc_count_2000_bucketized_normalized_feature',
 'cafe_count_500_price_1500_bucketized_normalized_feature',
 'cafe_count_3000_price_1500_bucketized_normalized_feature',
 'build_count_after_1995_bucketized_normalized_feature',
 'cafe_count_50

In [338]:
extract_normalized_feature(final_kaggle_df, "sub_area")

array([ 38, 103,  84, ..., 134,  79,  19])

In [339]:
kaggle_split_df_continuous_features = extract_normalized_features(final_kaggle_df)
kaggle_split_df_categorical_features = extract_categorical_features(final_kaggle_df, include_embedding_cols=False)
kaggle_split_df_embedding_features = extract_embedding_features(final_kaggle_df)
    
kaggle_split_df_embedding_features_final = {}
for k, v in kaggle_split_df_embedding_features.items():
    kaggle_split_df_embedding_features_final[k + "_input"] = v
       
kaggle_split_df_combined_inputs = combine_inputs([kaggle_split_df_continuous_features] 
                                                     +  kaggle_split_df_categorical_features.values())

In [340]:
kaggle_split_df_categorical_features

{'big_market_raion': array([[1],
        [1],
        [1],
        ..., 
        [1],
        [2],
        [1]]), 'big_road1_1line': array([[1],
        [1],
        [1],
        ..., 
        [1],
        [1],
        [1]]), 'culture_objects_top_25': array([[ 1.,  0.],
        [ 1.,  0.],
        [ 1.,  0.],
        ..., 
        [ 0.,  1.],
        [ 1.,  0.],
        [ 1.,  0.]]), 'detention_facility_raion': array([[1],
        [1],
        [1],
        ..., 
        [2],
        [1],
        [1]]), 'incineration_raion': array([[1],
        [1],
        [1],
        ..., 
        [1],
        [1],
        [1]]), 'material': array([[ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  1., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  1.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  1.,  0.],
        [ 0.,  1.,  0., ...,  0.,  0.,  0.]]), 'nuclear_reactor_raion': array([[1],
        [1],
        [1],
        ..., 
   

In [341]:
kaggle_split_df_embedding_features

{'ID_big_road1': array([[ 2],
        [13],
        [10],
        ..., 
        [ 4],
        [23],
        [ 2]]), 'ID_big_road2': array([[38],
        [27],
        [ 1],
        ..., 
        [34],
        [52],
        [40]]), 'ID_bus_terminal': array([[ 9],
        [ 8],
        [ 3],
        ..., 
        [13],
        [ 6],
        [ 2]]), 'ID_metro': array([[ 45],
        [ 21],
        [ 44],
        ..., 
        [120],
        [ 53],
        [142]]), 'ID_railroad_station_avto': array([[39],
        [24],
        [59],
        ..., 
        [32],
        [27],
        [21]]), 'ID_railroad_station_walk': array([[ 39.],
        [ 24.],
        [ 68.],
        ..., 
        [  5.],
        [ 27.],
        [ 82.]]), 'ID_railroad_terminal': array([[32],
        [50],
        [ 5],
        ..., 
        [32],
        [32],
        [32]]), 'ecology': array([[5],
        [3],
        [4],
        ..., 
        [1],
        [4],
        [4]]), 'sub_area': array([[ 38],
        [103],


In [342]:
kaggle_split_df_continuous_features

array([[-0.838086  ,  0.03928507, -0.07009939, ...,  1.89720164,
        -0.48593907, -1.83239346],
       [-0.8812901 ,  0.16129706, -0.07009939, ...,  1.89720164,
        -0.49387983, -1.83239346],
       [ 0.39323092, -0.14841033, -0.07009939, ...,  1.89720164,
        -0.10478255, -1.83239346],
       ..., 
       [ 3.24470169, -0.19664223, -0.07009939, ...,  0.24523479,
         4.54850328, -1.83239346],
       [-0.21162651,  0.01626024, -0.07009939, ...,  0.24523479,
        -0.36682766, -1.83239346],
       [-0.23322857,  0.2272611 , -0.07009939, ...,  0.24523479,
        -0.20007168, -1.83239346]])

In [343]:
vs = []
for column in one_hot_columns:
    vals = final_kaggle_df[column].values.tolist()
    vals = [c if c in known_values[column] else unknown_label for c in vals]
    if unknown_label in vals:
        vs.append((column, vals))

In [344]:
[v[0] for v in vs]

[]

In [345]:
kaggle_preds = real_estate_price_model.predict(x=dict({
                    'main_input' : kaggle_split_df_combined_inputs
                }.items() + kaggle_split_df_embedding_features_final.items()))

In [346]:
kaggle_preds

array([[  3.07068355e+10],
       [  3.07081196e+10],
       [  3.07076506e+10],
       ..., 
       [  9.80176000e+06],
       [  1.45478860e+07],
       [  1.71274320e+07]], dtype=float32)

In [191]:
final_kaggle_df['price_doc'] = kaggle_preds

In [192]:
final_kaggle_df['price_doc']

0       4.093042e+10
1       4.093284e+10
2       4.093333e+10
3       4.093235e+10
4       4.092968e+10
5       4.093546e+10
6       4.093052e+10
7       4.092911e+10
8       4.093080e+10
9       4.093024e+10
10      4.093242e+10
11      4.092887e+10
12      4.092848e+10
13      4.092789e+10
14      4.093016e+10
15      4.093108e+10
16      4.093927e+10
17      4.093849e+10
18      4.093026e+10
19      4.093700e+10
20      4.093147e+10
21      4.093658e+10
22      4.093368e+10
23      4.093388e+10
24      4.093059e+10
25      4.093547e+10
26      4.093610e+10
27      4.093445e+10
28      4.092772e+10
29      4.093438e+10
            ...     
7632    1.344982e+07
7633    8.391098e+06
7634    1.654414e+07
7635    1.180624e+07
7636    6.630701e+06
7637    9.822717e+06
7638    7.509546e+06
7639    1.351762e+07
7640    1.002495e+07
7641    7.301856e+06
7642    1.362228e+07
7643    1.310399e+07
7644    1.443113e+07
7645    7.393820e+06
7646    1.267712e+07
7647    1.206970e+07
7648    1.271

In [193]:
4.093042e10

40930420000.0

In [271]:
final_kaggle_df[['id', 'price_doc']]

Unnamed: 0,id,price_doc
0,30474,5880385.00
1,30475,8747043.00
2,30476,6298690.00
3,30477,6858484.00
4,30478,5549552.00
5,30479,8727317.00
6,30480,4832307.00
7,30481,4594226.00
8,30482,5736204.00
9,30483,5148752.00


In [272]:
final_kaggle_df[['id', 'price_doc']].to_csv("/mnt/h/Kaggle/Competitions/Russian Bank/models/v" + str(version) + "/submission.csv", index=False) # output submission csv file