In [1]:
import numpy as np
import os
import random
import lightgbm as lgb

from os.path import join as pjoin
from collections import defaultdict
from functools import reduce

from keras.layers import Dense, Input, BatchNormalization, Activation
from keras.models import Model
from keras.optimizers import Adam, SGD, RMSprop
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping

from sklearn.svm import SVC

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
# --- Data location and size -------------------------------------------------
base = 128                              # also used as the network input width
datadir = 'data'
filename = 'Base{}.txt'.format(base)    # data file for the chosen base
dataset_size = 1e6                      # number of samples the split sizes are computed from

# --- Train / validation / test fractions of dataset_size --------------------
train_split = 0.8
val_split = 0.1
test_split = 0.1

# --- Optimisation defaults --------------------------------------------------
batch_size = 500
lr = 1e-1                               # read by nn_model() when it builds its optimizer

In [3]:
def _suffix_parity_signs(bits):
    """Map an integer digit vector to +1/-1 signs from the XOR of each suffix.

    Element ``i`` of the result is -1 when the XOR of ``bits[i:]`` equals 1
    and +1 otherwise. The result has the same dtype as ``bits``.
    """
    # A cumulative XOR over the reversed vector gives the XOR of every suffix
    # in a single vectorised pass (replaces the per-element Python loop).
    suffix_xor = np.bitwise_xor.accumulate(bits[::-1])[::-1]
    return np.where(suffix_xor == 1, -1, 1).astype(bits.dtype)


def get_data(prepare=True):
    """Load the dataset file, optionally preprocess it, shuffle and split it.

    Reads ``pjoin(datadir, filename)``. Every row that consists of exactly two
    whitespace-separated tokens is taken as ``<digit string> <target>``; all
    other rows are skipped.

    Parameters
    ----------
    prepare : bool, default True
        If True, each character of the digit string is parsed as an int and
        the vector is replaced by its suffix-XOR sign encoding (see
        ``_suffix_parity_signs``). If False, the characters are parsed as
        floats and used unchanged.

    Returns
    -------
    (train, val, test) : tuple of lists
        Consecutive slices of the shuffled samples, each sample being a
        ``[features, target]`` pair. Slice lengths are
        ``int(dataset_size * train_split)``, ``int(dataset_size * val_split)``
        and ``int(dataset_size * test_split)``; if the file holds fewer rows
        than that, the later slices are shorter or empty.
    """
    data = []
    # Context manager so the file handle is closed even if parsing fails.
    with open(pjoin(datadir, filename)) as source:
        for row in source:
            fields = row.split()
            if len(fields) != 2:
                continue
            inp, target = fields
            if prepare:
                features = _suffix_parity_signs(np.array([int(ch) for ch in inp]))
            else:
                features = np.array([float(ch) for ch in inp])
            data.append([features, float(target)])

    random.shuffle(data)

    train_count = int(dataset_size * train_split)
    val_count = int(dataset_size * val_split)
    test_count = int(dataset_size * test_split)
    val_end = train_count + val_count
    return data[:train_count], data[train_count:val_end], \
           data[val_end:val_end + test_count]

In [4]:
# Load the shuffled train / validation / test samples with preprocessing enabled.
train_data, val_data, test_data = get_data(prepare=True)

In [5]:
# Turn each list of [features, target] pairs into a feature array and a target array.
x_train, y_train = list(map(np.array, zip(*train_data)))
# The validation arrays must come from val_data; building them from test_data
# would make "validation" metrics silently identical to test metrics.
x_val, y_val = list(map(np.array, zip(*val_data)))
x_test, y_test = list(map(np.array, zip(*test_data)))

In [6]:
# Sanity check: show the first ten training feature vectors.
print(x_train[:10])

[[-1 -1  1 ...  1 -1 -1]
 [-1  1  1 ... -1 -1 -1]
 [ 1 -1  1 ... -1 -1  1]
 ...
 [-1  1  1 ... -1 -1 -1]
 [ 1  1  1 ...  1  1 -1]
 [ 1 -1  1 ... -1  1  1]]


In [7]:
def nn_model():
    """Build and compile a small fully connected binary classifier.

    The network takes ``base`` inputs, passes them through two ReLU layers of
    ``base//2`` and ``base//4`` units, and ends in a single sigmoid unit. It is
    compiled with binary cross-entropy, the ``binary_accuracy`` metric and an
    RMSprop optimizer whose learning rate is the module-level ``lr`` at the
    moment this function is called.
    """
    optimizer = RMSprop(lr)

    inputs = Input((base,))
    hidden = inputs
    for width in (base//2, base//4):
        hidden = Dense(width, activation='relu')(hidden)
    outputs = Dense(1, activation='sigmoid')(hidden)

    net = Model(inputs, outputs)
    net.compile(loss='binary_crossentropy', metrics=['binary_accuracy'], optimizer=optimizer)
    return net
    

In [8]:
def step_decay_schedule(initial_lr=1e-3, decay_factor=0.9, min_lr=1e-4):
    """Return a LearningRateScheduler with a floored geometric decay.

    For epoch ``e`` the rate is ``initial_lr * decay_factor ** e``, but never
    less than ``min_lr``.
    """
    def schedule(epoch):
        decayed = initial_lr * (decay_factor ** (epoch))
        # Clamp to the floor once the decayed value no longer exceeds it.
        return decayed if decayed > min_lr else min_lr

    return LearningRateScheduler(schedule)

In [9]:
def cosine_anneal_schedule(t, alpha_zero=1e-2):
    """Cyclic cosine-annealed learning rate for epoch ``t``.

    The schedule repeats every ``80 // 10 = 8`` epochs. Within a cycle the
    rate follows ``alpha_zero / 2 * (cos(pi * pos / 8) + 1)`` with
    ``pos = (t + 1) % 8``, and is never allowed below ``alpha_zero / 10``.

    NOTE(review): some Keras versions invoke the schedule as
    ``schedule(epoch, lr)``, which would bind the current learning rate to
    ``alpha_zero`` -- confirm against the installed Keras.
    """
    total_epochs, n_cycles = 80, 10
    cycle_len = total_epochs // n_cycles

    position = (t + 1) % cycle_len
    angle = np.pi * position / cycle_len

    annealed = float(alpha_zero / 2 * (np.cos(angle) + 1))
    floor = alpha_zero / 10
    return max(annealed, floor)

In [10]:
# nn_model() reads the global `lr` when it constructs the optimizer, so the
# learning rate has to be set BEFORE the model is built; assigning it
# afterwards has no effect on the already-compiled model.
lr = 1e-3
model = nn_model()

early_stopping_callback = EarlyStopping(monitor='loss', patience=15, min_delta=1e-4)
lr_callback = LearningRateScheduler(cosine_anneal_schedule)

# `fl` toggles the cosine-annealing learning-rate schedule on top of early stopping.
fl = False
callbacks = [early_stopping_callback, lr_callback] if fl else [early_stopping_callback]

# Monitor the validation split during training so the test split stays held out.
model.fit(x_train, y_train,
        epochs=100,
        batch_size=10000,
        validation_data=(x_val, y_val),
        callbacks=callbacks, verbose=1)

Train on 800000 samples, validate on 100000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100

KeyboardInterrupt: 

In [None]:
def reset_lr(min_lr=1e-3, mult=0.99):
    """Create a LightGBM callback that decays the learning rate with restarts.

    Before every boosting iteration the current ``learning_rate`` is read from
    ``env.params``. While it is above ``min_lr`` it is multiplied by ``mult``;
    otherwise it is reset to the module-level ``lr``.

    Parameters
    ----------
    min_lr : float
        Threshold at or below which the rate is reset to ``lr``.
    mult : float
        Multiplicative decay applied while the rate is above ``min_lr``.

    Returns
    -------
    callable
        A callback taking the LightGBM callback environment, flagged to run
        before each iteration with order 0.
    """
    def callback(env):
        last_lr = env.params['learning_rate']
        new_lr = last_lr*mult if last_lr > min_lr else lr
        # Editing env.params alone only changes a Python dict; the booster has
        # to be told about the new value explicitly for it to take effect.
        env.model.reset_parameter({'learning_rate': new_lr})
        env.params['learning_rate'] = new_lr
    callback.before_iteration = True
    callback.order = 0
    return callback

In [None]:
def get_datasets():
    """Wrap the training and validation arrays as LightGBM datasets.

    Both datasets name their ``base`` columns ``c0 .. c{base-1}`` and declare
    every column categorical.

    Returns
    -------
    (train_dataset, val_dataset) : tuple of lgb.Dataset
    """
    def column_names():
        # A fresh list per use, so no list object is shared between arguments.
        return ['c{}'.format(i) for i in range(base)]

    def build(features, labels):
        return lgb.Dataset(features, label=labels,
                           feature_name=column_names(),
                           categorical_feature=column_names())

    train_dataset = build(x_train, y_train)
    val_dataset = build(x_val, y_val)
    return train_dataset, val_dataset

# Learning rate for the LightGBM run (overrides the value used for the NN above).
lr = 0.1

# The number of boosting rounds is given once, through `num_round` below.
# Also listing it in the params (as 'num_trees') made LightGBM warn and
# silently prefer the params value over the argument.
num_round = 1000

param = {'num_leaves': 2047,
         'objective': 'binary',
         'learning_rate': lr,
         'boosting': 'dart',
         'max_bin': 2047,
         'metric': ['auc', 'binary_logloss']}

train_dataset, val_dataset = get_datasets()

# Disabled experiment, kept for reference: passing
# callbacks=[reset_lr(lr/0.1)] and early_stopping_rounds=100 to lgb.train.
bst = lgb.train(param, train_dataset, num_round, valid_sets=[val_dataset])

In [9]:
# Fit a polynomial-kernel SVM on the first 50000 training samples only.
svm_train_size = 50000
clf = SVC(kernel='poly', verbose=2)
clf.fit(x_train[:svm_train_size], y_train[:svm_train_size])

[LibSVM]

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='poly', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=2)

In [10]:
# Score the SVM on the first 10000 test samples: one minus the mean absolute
# difference between predictions and targets.
svm_eval_size = 10000
y_eval = np.array(y_test[:svm_eval_size])
res = clf.predict(x_test[:svm_eval_size])
total_abs_error = np.abs(np.array(res) - y_eval).sum()
print(1 - total_abs_error / len(y_eval))

0.5981000000000001
