In [1]:
import sys
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' would also hide any pandas/NumPy
# deprecation or chained-assignment warnings — consider narrowing it.
warnings.filterwarnings('ignore')

# Relative paths of the training and test CSV files that are passed to get_data().
TRAIN_PATH = "../../data/train.csv"
TEST_PATH = "../../data/test.csv"

def get_data(path):
    """Read the CSV file at ``path`` and return its contents as a pandas DataFrame."""
    return pd.read_csv(path)

def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the unnormalized Gini coefficient of ``pred`` against ``actual``.

    Rows are ranked by descending ``pred``; ties keep their original order
    because the row index is used as the secondary sort key.

    Args:
        actual: Sequence of observed values (e.g. 0/1 labels).
        pred: Sequence of predicted scores, same length as ``actual``.
        cmpcol: Unused; kept so existing callers keep working.
        sortcol: Unused; kept so existing callers keep working.

    Returns:
        The Gini value. It is not finite when ``actual`` sums to zero,
        because the cumulative sum is divided by that total.

    Raises:
        AssertionError: If ``actual`` and ``pred`` differ in length.
    """
    assert len(actual) == len(pred), "actual and pred must have the same length"
    n = len(actual)
    # Columns: 0 = actual, 1 = prediction, 2 = original row index.
    # The builtin ``float`` is used instead of the ``np.float`` alias, which
    # has been removed from NumPy.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort treats the LAST key as primary: prediction descending, then
    # original index ascending to break ties deterministically.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Return the Gini of predictions ``p`` divided by the Gini of a perfect ranking.

    ``a`` holds the actual values; the denominator is the Gini obtained when
    the actual values themselves are used as the prediction.
    """
    model_gini = gini(a, p)
    perfect_gini = gini(a, a)
    return model_gini / perfect_gini

In [None]:
# Local evaluation: load the training file and hold out part of it.
data = get_data(TRAIN_PATH)

# One-hot encode the "cat"/"bin" columns; the "calc" columns are dropped.
calc = data.filter(regex="calc").columns
data = data.drop(calc, axis=1)
cat = data.filter(regex="cat").columns
bin_cols = data.filter(regex="bin").columns  # not named `bin`, so the builtin stays usable
cat = cat.append(bin_cols)
non_cat = [col for col in data.columns if col not in cat]

# Cast to str first so every level (including "-1") becomes its own dummy column.
dummies = pd.get_dummies(data[cat].astype(str))
data = pd.concat((data.drop(cat, axis=1), dummies), axis=1)

# Replace the -1 marker in the remaining columns with the mean of the other
# values. The whole column is assigned back: the chained form
# data[tag][mask] = ... may write to a temporary copy and leave `data` unchanged.
for tag in non_cat:
    missing = data[tag] == -1
    if missing.any():
        data[tag] = data[tag].mask(missing, data.loc[~missing, tag].mean())

TEACHER = np.c_[data.target.values]
# Cast to float so the matrix is numeric whatever dtype get_dummies used for
# the indicator columns.
data = np.asarray(data.drop(['id', 'target'], axis=1).values, dtype=float)

# NOTE: the split is not stratified (stratify=TEACHER is left disabled).
X_train, X_test, Y_train, Y_test = train_test_split(data, TEACHER, random_state=0)
del data, TEACHER

In [2]:
# Production training: preprocess the whole training file (no hold-out split).
data = get_data(TRAIN_PATH)

# One-hot encode the "cat"/"bin" columns; the "calc" columns are dropped.
calc = data.filter(regex="calc").columns
data = data.drop(calc, axis=1)
cat = data.filter(regex="cat").columns
bin_cols = data.filter(regex="bin").columns  # not named `bin`, so the builtin stays usable
cat = cat.append(bin_cols)
non_cat = [col for col in data.columns if col not in cat]

# Cast to str first so every level (including "-1") becomes its own dummy column.
dummies = pd.get_dummies(data[cat].astype(str))
data = pd.concat((data.drop(cat, axis=1), dummies), axis=1)

# Replace the -1 marker in the remaining columns with the mean of the other
# values. The whole column is assigned back: the chained form
# data[tag][mask] = ... may write to a temporary copy and leave `data` unchanged.
for tag in non_cat:
    missing = data[tag] == -1
    if missing.any():
        data[tag] = data[tag].mask(missing, data.loc[~missing, tag].mean())

TEACHER = np.c_[data.target.values]
# Cast to float so the matrix is numeric whatever dtype get_dummies used for
# the indicator columns.
data = np.asarray(data.drop(['id', 'target'], axis=1).values, dtype=float)

In [3]:
# Feed-forward binary classifier implemented in Keras.
from keras.models import Sequential
# Dropout is imported from keras.layers: the keras.layers.core path is not
# importable in every Keras release.
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import Adagrad, Adamax, Adam
#from keras.layers.normalization import BatchNormalization

# Width of the input layer; it has to equal the number of feature columns in
# the arrays passed to model.fit / model.predict.
N_FEATURES = 218

model = Sequential()

# Hidden layer 1: 200 units, ReLU, 50% dropout.
model.add(Dense(200, input_dim=N_FEATURES))
#model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Hidden layer 2: 100 units, ReLU, 50% dropout.
model.add(Dense(100))
#model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Output layer: a single sigmoid unit, trained with binary cross-entropy.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer=Adamax(), metrics=['accuracy'])

Using TensorFlow backend.


In [10]:
# Local evaluation: five rounds of three epochs, scoring the hold-out split
# with the normalized Gini after each round.
for round_no in range(5):
    model.fit(X_train, Y_train, epochs=3, batch_size=1000)
    # predict() output and Y_test are column-shaped; take column 0 as a flat vector.
    result = model.predict(X_test, batch_size=1000, verbose=1)[:, 0]
    answer = Y_test[:, 0]
    print('***score***', gini_normalized(answer, result), end='\n\n')

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.268426275003

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.279219059892

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.282717884453

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.277328704171

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.279279153041



In [4]:
# Local evaluation — relu, 200-100 hidden units, dropout=0.5.
# Five rounds of three epochs, scoring the hold-out split after each round.
for round_no in range(5):
    model.fit(X_train, Y_train, epochs=3, batch_size=1000)
    # predict() output and Y_test are column-shaped; take column 0 as a flat vector.
    result = model.predict(X_test, batch_size=1000, verbose=1)[:, 0]
    answer = Y_test[:, 0]
    print('***score***', gini_normalized(answer, result), end='\n\n')

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.25722049444

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.26589511823

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.274989459028

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.270618445914

Epoch 1/3
Epoch 2/3
Epoch 3/3
***score*** 0.279140846332



In [4]:
# Production training: 20 epochs on the `data` / `TEACHER` arrays.
model.fit(data, TEACHER, epochs=20, batch_size=1000)

# The score below is computed on the same rows the model was just fitted on,
# so it is a training-set Gini rather than a hold-out estimate.
result = model.predict(data, batch_size=1000, verbose=1)[:, 0]
answer = TEACHER[:, 0]
gini_normalized(answer, result)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


0.31926231188044057

In [5]:
# Unbind the predictions and the training arrays so their memory can be reclaimed.
del result, answer, data, TEACHER

In [6]:
# Submission side: preprocess the test file and predict with the trained model.
data = get_data(TEST_PATH)

calc = data.filter(regex="calc").columns
data = data.drop(calc, axis=1)
cat = data.filter(regex="cat").columns
bin_cols = data.filter(regex="bin").columns  # not named `bin`, so the builtin stays usable
cat = cat.append(bin_cols)
non_cat = [col for col in data.columns if col not in cat]

# Cast to str first so every level (including "-1") becomes its own dummy column.
# NOTE(review): get_dummies runs on the test frame alone, so its columns only
# line up with the matrix the model was trained on if both files contain the
# same category levels — confirm.
dummies = pd.get_dummies(data[cat].astype(str))
data = pd.concat((data.drop(cat, axis=1), dummies), axis=1)

# Replace the -1 marker in the remaining columns with the mean of the other
# values. The whole column is assigned back: the chained form
# data[tag][mask] = ... may write to a temporary copy and leave `data` unchanged.
for tag in non_cat:
    missing = data[tag] == -1
    if missing.any():
        data[tag] = data[tag].mask(missing, data.loc[~missing, tag].mean())

# Split off the id column and convert the features to a NumPy array.
# `id` shadows the builtin, but the cell that writes result.csv reads this name.
id = data.id.values
# Cast to float so the matrix is numeric whatever dtype get_dummies used for
# the indicator columns.
data = np.asarray(data.drop('id', axis=1).values, dtype=float)

result = model.predict(data, batch_size=500, verbose=1).T[0]



In [7]:
# Write the submission file: a header line followed by one "id,target" row
# per prediction. The with-block closes the file even if a write fails part-way.
with open('result.csv', 'w') as f:
    f.write('id,target\n')
    for row_id, score in zip(id, result):
        f.write(str(row_id) + ',' + str(score) + '\n')