In [1]:
%%javascript
// Call the optional IPython.tab_as_tab_everywhere hook, but only when it is defined.
if (IPython.tab_as_tab_everywhere) {
	IPython.tab_as_tab_everywhere();
}

<IPython.core.display.Javascript object>

In [24]:
import matplotlib.pyplot as plt
import os, sys, json
import pandas as pd
import numpy as np
from tqdm import tqdm
import hashlib

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import InputLayer, Input
from tensorflow.python.keras.layers import Reshape, MaxPooling2D,Dropout
from tensorflow.python.keras.layers import Conv2D, Dense, Flatten
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score

# Raise the pandas display limits: up to 50 rows and 200 columns are rendered.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 200

In [3]:
# Load the application table and shuffle its rows.
# The shuffle is seeded: later cells split the frame by position (head/tail),
# so an unseeded shuffle would give a different train/test split, and therefore
# different saved artefacts, on every run.
SHUFFLE_SEED = 42
train = pd.read_csv('data/application_train.csv', encoding='ISO-8859-1').sample(frac=1, random_state=SHUFFLE_SEED)

# Second output column for the two-unit softmax head: the complement of TARGET.
train['INVERSE_TARGET'] = 1 - train['TARGET']

# Rows whose DAYS_EMPLOYED equals 365243 are reset to 0 before the sign flip.
# NOTE(review): 365243 looks like a placeholder value — confirm against the data dictionary.
train.loc[train['DAYS_EMPLOYED'] == 365243,'DAYS_EMPLOYED'] = 0

# Flip the sign of the day-offset columns.
train['DAYS_EMPLOYED'] = -train['DAYS_EMPLOYED']
train['DAYS_BIRTH'] = -train['DAYS_BIRTH']
train['DAYS_REGISTRATION'] = -train['DAYS_REGISTRATION']

In [5]:
# Start the feature frame from four columns copied out of `train`;
# CNT_CHILDREN is then limited to the range 0..4.
df = train[['CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION']].copy()
df['CNT_CHILDREN'] = df['CNT_CHILDREN'].clip(lower=0, upper=4)

#documents
# 1 plus the row-wise sum of FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21
# (skipna=False so a missing flag propagates exactly as repeated `+=` would).
document_flags = ['FLAG_DOCUMENT_' + str(id_) for id_ in range(2,22)]
df['documents'] = 1 + train[document_flags].sum(axis=1, skipna=False)
	
#objects
# Add the remaining columns of `train` to `df` and record in `preprocess`
# how each source column was encoded:
#   * object-dtype columns are replaced by the mean TARGET of their category
#     (missing values become the category 'none');
#   * bool/int/float columns not already in `df` are copied, with NaN filled
#     by 0 and an extra '<col>_nan' indicator column when values are missing.
# NOTE(review): the category means are computed over every row of `train`,
# including rows that later cells hold out for evaluation.
preprocess = {}
for col in train:
	if train[col].dtype == 'O':
		train[col] = train[col].fillna('none')
		dic = train.groupby(col)['TARGET'].mean().to_dict()
		preprocess[col] = {'type': 'dic', 'dic': dic}
		df[col] = train[col].map(dic)
		continue

	# Everything below handles numeric columns only.
	if train[col].dtype.kind not in 'bif':
		continue
	# Skip the labels, the row id and anything `df` already holds.
	if col in ['TARGET','INVERSE_TARGET','SK_ID_CURR'] or col in df.columns:
		continue

	is_complete = train[col].notna().all()
	if is_complete:
		df[col] = train[col]
	else:
		df[col] = train[col].fillna(0)
		df[col + '_nan'] = train[col].isna()
	preprocess[col] = {'type': 'num', 'noNaN': int(is_complete)}

In [7]:
# Pearson correlation of every feature column in `df` with TARGET.
# The rows are gathered in a plain list and turned into a frame once:
# DataFrame.append copied the whole frame on every call and no longer
# exists in pandas 2.x.
correlation_records = []
for col in df:
	cor = np.corrcoef(train['TARGET'],df[col])[0][1]
	correlation_records.append({
		'col': col,
		'cor': cor,
		'abs': abs(cor)  # magnitude, used for ranking in the next cell
	})
use_cols = pd.DataFrame(correlation_records, columns=['col','cor','abs'])

In [9]:
# Keep the (at most) 150 feature names with the largest absolute correlation.
ranked_cols = use_cols.sort_values(by='abs', ascending=False)
use = ranked_cols['col'].head(150).tolist()
df_use = df[use]

In [11]:
# Min-max scale every selected column: (value - a) / b, where `a` is the
# column minimum and `b` its range. The (a, b) pairs are kept in `lims` so
# the same transform can be re-applied to new data.
lims = {}
scaled_columns = {}

for col in df_use:
	column = df_use[col]
	d_min = float(column.min())
	d_max = float(column.max())
	dif = d_max - d_min
	# A constant column has zero range; dividing by it would fill the column
	# with NaN. Use a divisor of 1 so the column scales to all zeros instead.
	if dif == 0:
		dif = 1.0

	lims[col] = {
		'a': d_min,
		'b': dif,
	}

	scaled_columns[col] = (column - d_min) / dif

# Assemble the frame in one step instead of inserting columns one at a time.
df_norm = pd.DataFrame(scaled_columns, index=df_use.index, columns=list(df_use.columns))

In [12]:
# Positional 90/10 split of the scaled features and their two-column labels.
# `columns` is the network's input width (number of selected features).
columns = len(df_use.columns)
train_percent = 0.9
train_num = int(len(train) * train_percent)
test_num = len(train) - train_num

# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0;
# `.values` is available in both old and current pandas.
train_x = df_norm.head(train_num).values
train_y = train[['TARGET','INVERSE_TARGET']].head(train_num).values

test_x = df_norm.tail(test_num).values
test_y = train[['TARGET','INVERSE_TARGET']].tail(test_num).values

In [32]:
# MD5 digest of the textual form of the selected column index.
# NOTE(review): str() of a long pandas Index may be abbreviated with '...';
# TODO confirm the digest really covers every column name.
hashlib.md5(str(df_use.columns).encode()).hexdigest()

'1a3a9138a352a568d1a66a15d31fdbe2'

In [14]:
# Input layer: one value per selected feature column.
inputs = Input(shape=(columns,))

# Width (units per layer) and depth (number of layers) of the hidden stack.
w = 300
h = 4

# Stack `h` fully-connected ReLU layers of `w` units on top of the input.
net = inputs
for _ in range(h):
	net = Dense(w, activation='relu')(net)

# A single dropout layer (rate 0.5) follows the last hidden layer.
net = Dropout(0.5)(net)

# Two-unit softmax head, matching the two label columns.
net = Dense(2, activation='softmax')(net)

# Wrap the layer graph in a Keras Model.
outputs = net
model = Model(inputs=inputs, outputs=outputs)

In [15]:
# Categorical cross-entropy over the two softmax outputs, optimised with SGD.
model.compile(
	loss='categorical_crossentropy',
	optimizer='sgd',
)

In [16]:
# One training round: fit for 10 epochs (the last 20% of the training rows is
# used for validation), then print the ROC AUC on the held-out test rows.
# Column 0 of both the labels and the predictions is TARGET.
for i in range(1):
	model.fit(
		x=train_x,
		y=train_y,
		validation_split=0.2,
		epochs=10,
		batch_size=128,
	)
	test_scores = model.predict(test_x)[:, 0]
	print('roc' + str(i), roc_auc_score(test_y[:, 0], test_scores))

Train on 221407 samples, validate on 55352 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
roc0 0.7443005592524423


In [19]:
# Write the trained model to disk.
model.save(filepath='credit.keras')

In [20]:
# Persist everything needed to rebuild the model inputs: the per-column
# encoding rules, the min/range pairs used for scaling and the ordered list
# of selected features.
# The file is opened in a `with` block so it is flushed and closed as soon as
# the dump finishes, instead of staying open until garbage collection.
with open('credit.json', 'w') as json_file:
	json.dump({
		'preprocess': preprocess,
		'scale': lims,
		'use': use
	}, json_file)

In [21]:
# ROC AUC on the training rows, for comparison with the held-out score.
roc_auc_score(
	y_true=train_y[:, 0],
	y_score=model.predict(train_x)[:, 0],
)

0.7460604805884006