In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [2]:
def scale_data(X, scaler=None):
    """Scale ``X`` with a fitted scaler, creating and fitting one if needed.

    Parameters
    ----------
    X : array-like
        Data handed to ``scaler.fit`` (only when a new scaler is created)
        and to ``scaler.transform``.
    scaler : object, optional
        Already-fitted object exposing ``transform``. When ``None`` a
        ``MinMaxScaler(feature_range=(-1, 1))`` is created and fitted on ``X``.

    Returns
    -------
    tuple
        ``(X_transformed, scaler)`` -- the transformed data and the scaler
        that produced it, so the caller can reuse it on other data.
    """
    # Compare against None explicitly: a caller-supplied scaler that happens
    # to be falsy (e.g. an object defining __len__ that returns 0) must still
    # be used rather than silently replaced by a freshly fitted one.
    if scaler is None:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(X)
    X = scaler.transform(X)
    return X, scaler

In [3]:
# Default locations of the train and test CSV files
DATA_TRAIN_PATH = '../kaggle/Porto Seguro/input/train.csv'
DATA_TEST_PATH = '../kaggle/Porto Seguro/input/test.csv'

def load_data(path_train=DATA_TRAIN_PATH, path_test=DATA_TEST_PATH):
    """Read the train and test CSV files and separate features, labels and ids.

    Parameters
    ----------
    path_train : str
        CSV file containing an ``id`` column, a ``target`` column and the
        feature columns.
    path_test : str
        CSV file containing an ``id`` column and the feature columns.

    Returns
    -------
    tuple
        ``(train, train_labels, test, train_ids, test_ids)`` where ``train``
        and ``test`` are DataFrames without the ``id`` (and ``target``)
        columns and the remaining three items are NumPy arrays.
    """
    # --- training file: 'target' is read as int8, 'id' as int32 ---
    raw_train = pd.read_csv(path_train, dtype={'target': np.int8, 'id': np.int32})
    train = raw_train.drop(labels=['target', 'id'], axis=1)
    train_labels, train_ids = raw_train['target'].values, raw_train['id'].values
    print('\n Shape of raw train data:', train.shape)

    # --- test file: same layout, but without a 'target' column ---
    raw_test = pd.read_csv(path_test, dtype={'id': np.int32})
    test = raw_test.drop(labels=['id'], axis=1)
    test_ids = raw_test['id'].values
    print(' Shape of raw test data:', test.shape)

    return train, train_labels, test, train_ids, test_ids

In [4]:
# Load the raw feature frames, the target vector and the row ids
train, target, test, tr_ids, te_ids = load_data()
n_train = train.shape[0]
# Stack train on top of test so both receive identical encoding and scaling;
# the first n_train rows are split back out at the end of this cell.
train_test = pd.concat((train, test)).reset_index(drop=True)

# Remove every column whose name starts with 'ps_calc_'
calc_col = [c for c in train.columns if c.startswith('ps_calc_')]
train_test = train_test.drop(calc_col, axis=1)

# Columns ending in '_cat' are one-hot encoded instead of scaled
col_to_drop = train.columns[train.columns.str.endswith('_cat')]
col_to_dummify = col_to_drop.astype(str).tolist()

# Collect one indicator frame per categorical column and join them with a
# single concat, rather than re-copying a growing DataFrame on every iteration.
dummy_frames = []
for col in col_to_dummify:
    # prefix=col names each indicator column '<col>_<level>'
    dummy = pd.get_dummies(train_test[col].astype('category'), prefix=col)
    dummy_frames.append(dummy)
if dummy_frames:
    dummy_df = pd.concat(dummy_frames, axis=1)
else:
    # No categorical columns: keep a zero-column frame with the right number
    # of rows so the np.concatenate below still lines up.
    dummy_df = pd.DataFrame(index=train_test.index)

train_test = train_test.drop(col_to_dummify, axis=1)
# NOTE: the scaler is fitted on the stacked train+test rows, so the min/max
# used for scaling also reflect the test data.
train_test_scaled, scaler = scale_data(train_test)
train_test_scaled = np.concatenate((train_test_scaled, dummy_df.values), axis=1)
# Split the stacked matrix back into its train and test parts
train = train_test_scaled[:n_train, :]
test = train_test_scaled[n_train:, :]
print('\n Shape of processed train data:', train.shape)
print(' Shape of processed test data:', test.shape)


 Shape of raw train data: (595212, 57)
 Shape of raw test data: (892816, 57)

 Shape of processed train data: (595212, 207)
 Shape of processed test data: (892816, 207)


In [5]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# legacy module only when running on an old scikit-learn.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [6]:
# Hold out 20% of the processed training rows for evaluation; the fixed
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0
)

In [7]:
# Turn the label vectors into single-column matrices, matching the
# [None, 1] shape of the label placeholder `y`.
y_train = np.reshape(y_train, (-1, 1))
y_test = np.reshape(y_test, (-1, 1))
(y_train.shape, y_test.shape)

((476169, 1), (119043, 1))

In [8]:
import tensorflow as tf

In [9]:
# Number of input features
n = x_train.shape[1]
# Number of latent factors: larger values give more fitting capacity,
# smaller values generalise better
k = 5

# Inputs fed at run time: feature rows and their labels
X = tf.placeholder(tf.float32, shape=(None, n))
y = tf.placeholder(tf.float32, shape=(None, 1))

# Initialise the order-0 term (bias), the order-1 terms (linear weights)
# and the auxiliary factor matrix V
w0 = tf.Variable(tf.zeros(shape=(1,)))
W = tf.Variable(tf.zeros(shape=(n,)))
V = tf.Variable(tf.random_normal(shape=(n, k), stddev=0.01))

In [10]:
# First two parts of the FM formula: the bias plus, for every row, the sum of
# the features weighted by W (kept as a column so it lines up with `y`)
linear_terms = w0 + tf.reduce_sum(W * X, axis=1, keep_dims=True)

In [11]:
# Last part of the FM formula (pairwise interactions):
#   0.5 * sum over factors of [ (X V)^2 - (X^2)(V^2) ]
interactions = 0.5 * tf.reduce_sum(
    tf.matmul(X, V) ** 2 - tf.matmul(X ** 2, V ** 2),
    1,
    keep_dims=True)

In [12]:
# Cross-entropy loss
# The raw model output is the sum of the linear and the interaction parts
logits = linear_terms + interactions
# Sigmoid of the logits, used as the predicted score at evaluation time
y_hat = tf.sigmoid(logits)
loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=logits)

INFO:tensorflow:logits.dtype=<dtype: 'float32'>.
INFO:tensorflow:multi_class_labels.dtype=<dtype: 'float32'>.
INFO:tensorflow:losses.dtype=<dtype: 'float32'>.


In [13]:
# Learning rate for Adagrad
eta = tf.constant(0.1)
# Despite its name, `optimizer` holds the op returned by minimize(); running
# it performs one update step on `loss`.
optimizer = tf.train.AdagradOptimizer(learning_rate=eta).minimize(loss)

In [14]:
N_EPOCHS = 100
batch_size = 2048
n_samples = x_train.shape[0]
# Ceiling division: every row is visited exactly once per epoch, the final
# batch may simply be shorter. (The previous `% (n_samples - batch_size)`
# offset made the last step wrap around and replay early rows while skipping
# the tail, and divided by zero when n_samples == batch_size.)
num_steps = (n_samples + batch_size - 1) // batch_size

# Dedicated, seeded generator so the batch order is repeatable without
# touching NumPy's global random state.
rng = np.random.RandomState(0)
# The flattened evaluation labels do not change between epochs.
y_test_flat = y_test.flatten()

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    for epoch in range(N_EPOCHS):
        # Draw a fresh visiting order each epoch and index batches through it,
        # instead of copying and rebinding the full training arrays.
        indices = rng.permutation(n_samples)
        for step in range(num_steps):
            batch_rows = indices[step * batch_size:(step + 1) * batch_size]
            x_batch = x_train[batch_rows]
            y_batch = y_train[batch_rows]
            sess.run(optimizer, feed_dict={X: x_batch, y: y_batch})
        # Score the held-out split after every epoch: AUC and log-loss
        test_pred = sess.run(y_hat, feed_dict={X: x_test}).flatten()
        auc = metrics.roc_auc_score(y_score=test_pred, y_true=y_test_flat)
        logloss = metrics.log_loss(y_pred=test_pred.tolist(), y_true=y_test_flat.tolist())
        print(auc, logloss)

0.61497253798 0.152767066745
0.619925546863 0.152460569993
0.62241424577 0.152290122218
0.623658351091 0.152175387746
0.624337491699 0.152157207836
0.624974966117 0.152070547131
0.625337152741 0.152042165406
0.626176870483 0.152091716624
0.626216701893 0.15196906483
0.627147227608 0.152018647046
0.627178299586 0.151943730239
0.628044657527 0.151862197424
0.628587726185 0.151992469726
0.628872430496 0.1518703834
0.629698578951 0.151788684788
0.629683070762 0.15175463274
0.630598443113 0.151725005104
0.630900222505 0.151687234047
0.631441780881 0.151671391729
0.630499608609 0.151723677746
0.63211700527 0.151612973733
0.632255106093 0.151618560218
0.631933031909 0.151650318449
0.631990671302 0.151610818767
0.631868721599 0.151637860595
0.632615971473 0.151592232138
0.632765787595 0.151618607445
0.632778620948 0.151690168359
0.633337058291 0.151544171588
0.633596543725 0.151549666885
0.633819724772 0.151565636867
0.633212179005 0.151560510785
0.633856271774 0.15154803358
0.634149020812 0.1

In [15]:
# Presumably converts an AUC score into a Gini coefficient (2 * AUC - 1) -- TODO confirm.
# NOTE(review): the literal 0.63613802668 does not appear in the per-epoch
# output logged above, so it seems to come from a different run; verify its source.
0.63613802668 * 2 - 1

0.27227605335999994