In [1]:
import tensorflow as tf
from utils import *
import json

tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
# Column groups (keys 'numerical' and 'categorical') used to build features.
with open(get_file('valid_app_col')) as vac:
    valid_col = json.loads(vac.read())

# Mapping consumed later as {categorical column name: vocabulary list}.
with open(get_file('app_unique_voc')) as auv:
    app_unique_voc = json.loads(auv.read())


In [3]:
# Total number of feature columns across both groups.
sum(len(valid_col[kind]) for kind in ('numerical', 'categorical'))

68

In [4]:
# Header names of the cleaned training CSV, without the first column.
with open(get_file('cleaned_app_train')) as cat:
    columns = cat.readline().rstrip('\n').split(',')[1:]


In [5]:
class BASE_FLAG:
    """Shared run settings; subclasses override the per-model values."""

    # Input pipeline: batch size and shuffle buffer used by the input_fn.
    batch_size = 40
    buffer_size = 200000  # change to train size

    # Layer widths handed to the DNN-based estimators.
    hidden_units = [100, 100, 70, 50, 60, 100, 50]
    # Only referenced by the commented-out dnn_dropout argument.
    drop_out = 0.3

    # Checkpoint and SavedModel locations, resolved through get_file.
    model_dir = get_file('model_dir2')
    export_dir = get_file('export_model_dir')

    # Passed as max_steps to the TrainSpec.
    steps = None

    # Optimizer used for the DNN part of the combined estimator.
    dnn_optimizer = tf.train.AdamOptimizer(
        learning_rate=0.25, beta1=0.87, beta2=0.89, epsilon=1e-8)

    # 'wide' -> linear, 'deep' -> DNN, anything else -> combined estimator.
    model_type = 'wide'


class WIDE_FLAG(BASE_FLAG):
    """Settings for the linear (wide) model."""

    model_dir = get_file('model_linear_wide')
    model_type = 'wide'


# Active configuration read by the cells below.
FLAG = WIDE_FLAG

In [6]:
def get_column_default(file):
    """
    Read the header of a CSV file and infer a default value per column.

    The defaults are inferred from the third line of the file (the second
    line is read and discarded).

    :file: path of the CSV file to inspect
    :returns: tuple (column names, list of one-element default lists) where
              each default is [0], [0.0] or [''] depending on the sample text
    """
    def _default_for(sample):
        # Plain digit strings are treated as integers.
        if sample.isdigit():
            return [0]
        # Anything else float() accepts (signed, decimal, ...) is a float.
        try:
            float(sample)
        except ValueError:
            return ['']
        return [0.0]

    with open(file) as handle:
        header, _skipped, sample_row = (
            handle.readline().rstrip('\n') for _ in range(3)
        )

    cols = header.split(',')
    defaults = list(map(_default_for, sample_row.split(',')))
    return cols, defaults

# (column names, per-column record defaults) inferred from the cleaned training CSV.
COL_DEFAULTS = get_column_default(get_file('cleaned_app_train'))


In [7]:
def parse_csv(value):
    """
    Decode one CSV record into a (features, label) pair.

    :value: string tensor holding a single CSV line
    :returns: tuple (dict of column name -> tensor, TARGET tensor); the
              SK_ID_CURR and TARGET entries are removed from the dict
    """
    names, defaults = COL_DEFAULTS
    fields = tf.decode_csv(value, defaults)
    features = dict(zip(names, fields))
    # Drop the id column from the features. The previous print() around this
    # pop only echoed a symbolic Tensor while the graph was being built.
    features.pop('SK_ID_CURR')
    label = features.pop('TARGET')
    return features, label

def get_input_fn(mode, data=None):
    """
    Build a zero-argument input_fn for the given estimator mode.

    :mode: a tf.estimator.ModeKeys value; selects the default file key and
           whether the data is shuffled / repeated
    :data: optional get_file key that overrides the per-mode default
    :returns: callable that constructs and returns the batched dataset
    """
    default_keys = {
        tf.estimator.ModeKeys.TRAIN: 're_app_train',
        tf.estimator.ModeKeys.EVAL: 're_app_eval',
    }
    # Any mode other than TRAIN/EVAL falls back to the test file.
    key = data if data else default_keys.get(mode, 're_app_test')

    def input_fn():
        # skip(1) drops the CSV header line.
        dataset = tf.data.TextLineDataset(get_file(key)).skip(1)
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Dataset transformations return a new dataset; the result has to
            # be rebound or the shuffle is silently dropped.
            dataset = dataset.shuffle(buffer_size=FLAG.buffer_size)
        dataset = dataset.map(parse_csv, num_parallel_calls=5)
        if mode != tf.estimator.ModeKeys.PREDICT:
            # Train/eval iterate indefinitely; the step limits end the run.
            dataset = dataset.repeat()
        dataset = dataset.batch(FLAG.batch_size)
        return dataset

    return input_fn


In [8]:
def build_columns(num_col, voc_col, other_cat):
    """
    Assemble the feature columns for the wide and the deep model parts.

    :num_col: names of the numerical columns
    :voc_col: mapping of categorical column name -> vocabulary list
    :other_cat: names of the categorical columns without a vocabulary
    :returns: tuple (wide_columns, deep_columns)
    """
    numeric_column = tf.feature_column.numeric_column
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list
    bucketized_column = tf.feature_column.bucketized_column
    crossed_column = tf.feature_column.crossed_column
    indicator_column = tf.feature_column.indicator_column

    # Vocabulary-less categoricals are fed as plain numeric inputs as well.
    numeric_by_name = {}
    for name in other_cat + num_col:
        numeric_by_name[name] = numeric_column(name)

    vocab_by_name = {}
    for name, vocabulary in voc_col.items():
        vocab_by_name[name] = vocab_column(name, vocabulary)

    # Bucket boundaries per numeric column (wide model only).
    bucket_boundaries = [
        ('AMT_INCOME_TOTAL', [
            0.01e8, 0.05e8, 0.1e8, 0.2e8, 0.5e8, 1.0e8,
        ]),
        ('AMT_CREDIT', [
            0.005e6, 0.025e6, 0.05e6, 0.075e6, 0.1e6, 0.15e6, 0.2e6,
            0.5e6, 0.7e6, 1.0e6, 1.2e6, 1.5e6, 1.7e6, 2.5e6, 3e6,
        ]),
        ('AMT_ANNUITY', [
            0.005e6, 0.015e6, 0.020e6, 0.025e6, 0.035e6, 0.05e6,
            0.065e6, 0.1e6, 0.15e6, 0.20e6,
        ]),
        ('AMT_GOODS_PRICE', [
            0.05e6, 0.10e6, 0.25e6, 0.45e6, 0.65e6, 0.75e6,
            1.0e6, 1.5e6, 2.2e6, 3e6,
        ]),
        ('DAYS_BIRTH', [
            -25000, -22500, -20000, -17500, -15000, -12500, -10000,
        ]),
        ('DAYS_EMPLOYED', [
            -0.001e6, .0e6, .05e6, .3e6,
        ]),
        ('DAYS_REGISTRATION', [
            -.025e6, -.02e6, -.015e6, -.01e6, -.0075e6, -.005e6,
        ]),
        ('DAYS_ID_PUBLISH', [
            -0.007e6, -0.006e6, -0.005e6, -0.004e6, -0.003e6,
            -0.002e6, -0.001e6,
        ]),
        ('CNT_CHILDREN', list(range(19))),
        ('CNT_FAM_MEMBERS', list(range(20))),
    ]
    bucket_columns = [
        bucketized_column(numeric_by_name[name], boundaries=edges)
        for name, edges in bucket_boundaries
    ]

    # Groups of raw feature names that are hashed together into one cross.
    document_flags = [name for name in other_cat if 'FLAG_DOCUMENT' in name]
    cross_groups = [
        document_flags,
        ['LIVE_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_LIVE_REGION'],
        ['REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY'],
        ['NAME_HOUSING_TYPE', 'NAME_FAMILY_STATUS', 'CNT_CHILDREN'],
        ['NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE', 'OCCUPATION_TYPE'],
        ['CNT_FAM_MEMBERS', 'FLAG_OWN_REALTY', 'FLAG_OWN_CAR'],
        ['CNT_FAM_MEMBERS', 'CNT_CHILDREN'],
    ]
    crossed_columns = [
        crossed_column(group, hash_bucket_size=1000) for group in cross_groups
    ]

    vocab_columns = list(vocab_by_name.values())
    wide_columns = vocab_columns + crossed_columns + bucket_columns
    deep_columns = list(numeric_by_name.values()) + [
        indicator_column(column) for column in vocab_columns
    ]

    return wide_columns, deep_columns


In [9]:
def create_estimator_and_specs():
    """
    Build the estimator selected by FLAG.model_type and its train/eval specs.

    :returns: tuple (estimator, eval_spec, train_spec, export_model) where
              export_model is a zero-argument callable that exports the
              estimator as a SavedModel under FLAG.export_dir and returns
              the value of export_savedmodel
    """
    run_config = tf.estimator.RunConfig(
        session_config=tf.ConfigProto(log_device_placement=True),
        save_checkpoints_secs=300,
        save_summary_steps=100
    )

    # Separate out other categorical columns that do not have a vocabulary list
    other_categories = [x for x in valid_col['categorical'] if x not in app_unique_voc]
    wc, dc = build_columns(valid_col['numerical'], app_unique_voc, other_categories)

    def custom_metric(labels, predictions):
        # Currently not attached: the add_metrics call below is commented out.
        return {'iou': tf.metrics.auc(labels, predictions['class_ids'])}

    if FLAG.model_type == 'wide':
        estimator = tf.estimator.LinearClassifier(
            model_dir=FLAG.model_dir,
            feature_columns=wc,
            config=run_config
        )
    elif FLAG.model_type == 'deep':
        estimator = tf.estimator.DNNClassifier(
            model_dir=FLAG.model_dir,
            feature_columns=dc,
            hidden_units=FLAG.hidden_units,
            config=run_config
        )
    else:
        # Any other model_type falls back to the combined wide & deep model.
        estimator = tf.estimator.DNNLinearCombinedClassifier(
            model_dir=FLAG.model_dir,
            config=run_config,

            linear_feature_columns=wc,

            dnn_feature_columns=dc,
            dnn_hidden_units=FLAG.hidden_units,
            dnn_optimizer=FLAG.dnn_optimizer,
    #         dnn_dropout=FLAG.drop_out,

        )
#     estimator = tf.contrib.estimator.add_metrics(estimator, custom_metric)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=get_input_fn(tf.estimator.ModeKeys.EVAL),
        throttle_secs=600
    )

    train_spec = tf.estimator.TrainSpec(
        input_fn=get_input_fn(tf.estimator.ModeKeys.TRAIN), max_steps=FLAG.steps,
    )

    def export_model():
        print('Exporting model')
        # NOTE(review): make_parse_example_spec rejects columns that declare
        # different parse specs for the same key. CNT_CHILDREN and
        # CNT_FAM_MEMBERS are used both as numeric columns and as raw keys of
        # crossed columns -- confirm the export succeeds with these columns.
        feature_spec = tf.feature_column.make_parse_example_spec(wc + dc)
        serving_input_receiver_fn = (
            tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
        )
        # export_savedmodel needs the receiver fn; it was built but never passed.
        return estimator.export_savedmodel(FLAG.export_dir, serving_input_receiver_fn)

    return estimator, eval_spec, train_spec, export_model


In [10]:
# Which branch of this cell runs: 'train' or 'predict'.
mode = 'train'


def main(*args):
    """Entry point handed to tf.app.run: train and evaluate, then export."""
    built = create_estimator_and_specs()
    estimator, eval_spec, train_spec, export_model = built
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    export_model()


if mode == 'predict':
    estimator, eval_spec, train_spec, export_model = create_estimator_and_specs()
    pred_gen = estimator.predict(
        get_input_fn(tf.estimator.ModeKeys.PREDICT),
        yield_single_examples=False,
    )
    # Collect the yielded predictions keyed by their position in the stream.
    all_pred = {}
    for i, pred in enumerate(pred_gen):
        all_pred[i] = pred

elif mode == 'train':
    tf.app.run(main=main)

INFO:tensorflow:Using config: {'_model_dir': '/media/zadiq/ZHD/datasets/home_credit/model_linear_wide', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 300, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fcaa581cb00>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.
Tensor("DecodeCSV:0", shape=(), dtype=int32)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:ten

INFO:tensorflow:global_step/sec: 31.3449
INFO:tensorflow:loss = 9.740937, step = 7100 (3.190 sec)
INFO:tensorflow:global_step/sec: 30.2501
INFO:tensorflow:loss = 9.815319, step = 7200 (3.306 sec)
INFO:tensorflow:global_step/sec: 30.0525
INFO:tensorflow:loss = 13.037528, step = 7300 (3.328 sec)
INFO:tensorflow:global_step/sec: 30.6777
INFO:tensorflow:loss = 10.245994, step = 7400 (3.258 sec)
INFO:tensorflow:global_step/sec: 30.2235
INFO:tensorflow:loss = 15.725181, step = 7500 (3.309 sec)
INFO:tensorflow:global_step/sec: 30.4174
INFO:tensorflow:loss = 9.466417, step = 7600 (3.287 sec)
INFO:tensorflow:global_step/sec: 30.5797
INFO:tensorflow:loss = 5.9682274, step = 7700 (3.270 sec)
INFO:tensorflow:global_step/sec: 31.6612
INFO:tensorflow:loss = 7.7799387, step = 7800 (3.158 sec)
INFO:tensorflow:global_step/sec: 30.7656
INFO:tensorflow:loss = 10.652504, step = 7900 (3.251 sec)
INFO:tensorflow:global_step/sec: 31.3692
INFO:tensorflow:loss = 7.4292808, step = 8000 (3.189 sec)
INFO:tensorfl

INFO:tensorflow:global_step/sec: 29.7292
INFO:tensorflow:loss = 7.1257744, step = 15300 (3.364 sec)
INFO:tensorflow:global_step/sec: 30.1212
INFO:tensorflow:loss = 9.411531, step = 15400 (3.320 sec)
INFO:tensorflow:global_step/sec: 30.0904
INFO:tensorflow:loss = 6.548266, step = 15500 (3.323 sec)
INFO:tensorflow:global_step/sec: 29.8555
INFO:tensorflow:loss = 8.589558, step = 15600 (3.350 sec)
INFO:tensorflow:global_step/sec: 31.7847
INFO:tensorflow:loss = 6.7609797, step = 15700 (3.146 sec)
INFO:tensorflow:global_step/sec: 30.7718
INFO:tensorflow:loss = 10.62852, step = 15800 (3.249 sec)
INFO:tensorflow:global_step/sec: 31.2632
INFO:tensorflow:loss = 11.870668, step = 15900 (3.199 sec)
INFO:tensorflow:global_step/sec: 31.7748
INFO:tensorflow:loss = 10.01857, step = 16000 (3.147 sec)
INFO:tensorflow:global_step/sec: 30.5171
INFO:tensorflow:loss = 12.09602, step = 16100 (3.277 sec)
INFO:tensorflow:global_step/sec: 30.4945
INFO:tensorflow:loss = 5.1801877, step = 16200 (3.279 sec)
INFO:t

INFO:tensorflow:loss = 9.372979, step = 21404 (3.265 sec)
INFO:tensorflow:global_step/sec: 30.1733
INFO:tensorflow:loss = 8.403551, step = 21504 (3.313 sec)
INFO:tensorflow:global_step/sec: 30.7251
INFO:tensorflow:loss = 6.699872, step = 21604 (3.255 sec)
INFO:tensorflow:global_step/sec: 30.2719
INFO:tensorflow:loss = 10.805181, step = 21704 (3.303 sec)
INFO:tensorflow:global_step/sec: 30.2531
INFO:tensorflow:loss = 9.568312, step = 21804 (3.306 sec)
INFO:tensorflow:global_step/sec: 30.803
INFO:tensorflow:loss = 12.707179, step = 21904 (3.247 sec)
INFO:tensorflow:global_step/sec: 30.759
INFO:tensorflow:loss = 6.0258675, step = 22004 (3.251 sec)
INFO:tensorflow:global_step/sec: 29.8893
INFO:tensorflow:loss = 4.1965094, step = 22104 (3.346 sec)
INFO:tensorflow:global_step/sec: 30.916
INFO:tensorflow:loss = 9.398459, step = 22204 (3.234 sec)
INFO:tensorflow:global_step/sec: 30.1306
INFO:tensorflow:loss = 8.259825, step = 22304 (3.319 sec)
INFO:tensorflow:global_step/sec: 30.127
INFO:tenso

INFO:tensorflow:global_step/sec: 29.9879
INFO:tensorflow:loss = 7.339678, step = 29604 (3.335 sec)
INFO:tensorflow:global_step/sec: 30.3901
INFO:tensorflow:loss = 16.534, step = 29704 (3.291 sec)
INFO:tensorflow:global_step/sec: 30.0154
INFO:tensorflow:loss = 13.138011, step = 29804 (3.332 sec)
INFO:tensorflow:global_step/sec: 30.0082
INFO:tensorflow:loss = 7.7453184, step = 29904 (3.333 sec)
INFO:tensorflow:global_step/sec: 29.744
INFO:tensorflow:loss = 7.8653, step = 30004 (3.362 sec)
INFO:tensorflow:global_step/sec: 30.1829
INFO:tensorflow:loss = 7.395422, step = 30104 (3.313 sec)
INFO:tensorflow:global_step/sec: 29.7308
INFO:tensorflow:loss = 7.9924936, step = 30204 (3.363 sec)
INFO:tensorflow:global_step/sec: 31.5414
INFO:tensorflow:loss = 7.6239166, step = 30304 (3.172 sec)
INFO:tensorflow:global_step/sec: 30.6887
INFO:tensorflow:loss = 8.756828, step = 30404 (3.257 sec)
INFO:tensorflow:global_step/sec: 30.3686
INFO:tensorflow:loss = 7.5754423, step = 30504 (3.293 sec)
INFO:tenso

Tensor("DecodeCSV:0", shape=(), dtype=int32)
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /media/zadiq/ZHD/datasets/home_credit/model_linear_wide/model.ckpt-36236
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 36237 into /media/zadiq/ZHD/datasets/home_credit/model_linear_wide/model.ckpt.
INFO:tensorflow:loss = 12.140451, step = 36236
INFO:tensorflow:global_step/sec: 21.6296
INFO:tensorflow:loss = 8.897459, step = 36336 (4.625 sec)
INFO:tensorflow:global_step/sec: 31.3607
INFO:tensorflow:loss = 10.872877, step = 36436 (3.188 sec)
INFO:tensorflow:global_step/sec: 30.5514
INFO:tensorflow:loss = 11.580884, step = 36536 (3.273 sec)
INFO:tensorflow:global_step/sec: 29.9194
INFO:tensorflow:loss = 10.421991, step = 36636 (3.343 sec)
INFO:tensorflow:global_step/sec: 30.718

INFO:tensorflow:global_step/sec: 30.2729
INFO:tensorflow:loss = 7.71503, step = 44036 (3.301 sec)
INFO:tensorflow:global_step/sec: 30.8317
INFO:tensorflow:loss = 10.066183, step = 44136 (3.244 sec)
INFO:tensorflow:global_step/sec: 30.4539
INFO:tensorflow:loss = 7.2736707, step = 44236 (3.283 sec)
INFO:tensorflow:global_step/sec: 31.4937
INFO:tensorflow:loss = 9.047091, step = 44336 (3.175 sec)
INFO:tensorflow:global_step/sec: 30.0252
INFO:tensorflow:loss = 15.253031, step = 44436 (3.331 sec)
INFO:tensorflow:global_step/sec: 31.1772
INFO:tensorflow:loss = 12.432341, step = 44536 (3.208 sec)
INFO:tensorflow:global_step/sec: 30.388
INFO:tensorflow:loss = 8.889577, step = 44636 (3.291 sec)
INFO:tensorflow:global_step/sec: 30.2789
INFO:tensorflow:loss = 11.052326, step = 44736 (3.304 sec)
INFO:tensorflow:global_step/sec: 30.515
INFO:tensorflow:loss = 5.6772785, step = 44836 (3.275 sec)
INFO:tensorflow:global_step/sec: 31.1735
INFO:tensorflow:loss = 4.9015174, step = 44936 (3.209 sec)
INFO:t

INFO:tensorflow:loss = 11.71714, step = 52136 (3.188 sec)


KeyboardInterrupt: 

In [11]:
!pip install keras

Collecting keras
  Downloading https://files.pythonhosted.org/packages/68/12/4cabc5c01451eb3b413d19ea151f36e33026fc0efb932bf51bcaf54acbf5/Keras-2.2.0-py2.py3-none-any.whl (300kB)
[K    100% |████████████████████████████████| 307kB 744kB/s ta 0:00:01
[?25hCollecting keras-preprocessing==1.0.1 (from keras)
  Downloading https://files.pythonhosted.org/packages/f8/33/275506afe1d96b221f66f95adba94d1b73f6b6087cfb6132a5655b6fe338/Keras_Preprocessing-1.0.1-py2.py3-none-any.whl
Collecting keras-applications==1.0.2 (from keras)
  Downloading https://files.pythonhosted.org/packages/e2/60/c557075e586e968d7a9c314aa38c236b37cb3ee6b37e8d57152b1a5e0b47/Keras_Applications-1.0.2-py2.py3-none-any.whl (43kB)
[K    100% |████████████████████████████████| 51kB 1.2MB/s ta 0:00:011
Collecting h5py (from keras)
  Downloading https://files.pythonhosted.org/packages/8e/cb/726134109e7bd71d98d1fcc717ffe051767aac42ede0e7326fd1787e5d64/h5py-2.8.0-cp36-cp36m-manylinux1_x86_64.whl (2.8MB)
[K    100% |█████████████

In [16]:
# PREVIEW DATA
# Build the PREDICT-mode dataset and fetch a single batch for inspection.
dataset = get_input_fn(tf.estimator.ModeKeys.PREDICT)()
data_iter = dataset.make_one_shot_iterator()
next_batch = data_iter.get_next()
# The context manager closes the session once the batch has been fetched,
# so re-running this cell does not accumulate open sessions.
with tf.Session() as sess:
    preview_batch = sess.run(next_batch)
preview_batch