# Repsly trial data

In [1]:
from repsly_data import RepslyData

# Loader object that holds the trial dataset for the rest of the notebook.
repsly_data = RepslyData()

# flush=True pushes the progress message out right away: with end='' there is
# no newline to trigger a flush, so on buffered stdout the text could otherwise
# stay hidden until the slow read below has already finished.
print('Reading data (this might take a minute or so)...', end='', flush=True)
# NOTE(review): mode='FC' presumably selects the flat layout consumed by the
# fully connected network (RepslyFC) — confirm against repsly_data.read_data.
repsly_data.read_data('data/trial_users_analysis.csv', mode='FC')
print('done.')

Reading data (this might take a minute or so)...done.



Let's see what the data looks like:

In [2]:
# Iterator over mini-batches of 20 samples each.
read_batch = repsly_data.read_batch(batch_size=20)

# Pull a single batch and show its shape next to the raw inputs and labels.
X, y = next(read_batch)
batch_shape = list(X.shape)
print('X{}: {}'.format(batch_shape, X))
print('y:', y)

X[20, 241]: [[303   1   0 ...,   0   0   0]
 [192   4   4 ...,   0   0   0]
 [363   0   0 ...,   0   0   0]
 ..., 
 [180   0   0 ...,   0   0   0]
 [336   0   0 ...,   0   0   0]
 [459   2   1 ...,   0   0   0]]
y: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0]


As you can see above, each input vector in `X` has `1+15*16=241` values, most of which are zeros. The first value is the trial start date, expressed as an offset from `2016-01-01`; the remaining values are `15` different usage parameters for each of the following `16` days. Batches returned by the batch reader are randomly shuffled. The output values are stored in `y` and indicate whether or not the user purchased the Repsly service after the trial.

# Training

We will use the `Ensamble` class for training and cross-validation.

In [3]:
from repsly_nn import RepslyFC
from ensamble import Ensamble

# Ensemble object that the cells below add candidate networks to, train, and
# query for per-metric statistics.
ens = Ensamble()

In [None]:
# Architecture search space. A {'lin': (...)} entry appears to describe a range
# from which a value is drawn for every new net (the net names printed by
# add_nets differ per net); plain values are shared by all nets.
# NOTE(review): the optional third tuple element looks like the number of
# decimals kept for the drawn value — confirm in Ensamble.
arch = {
    'no_of_layers': {'lin': (4, 8)},
    'hidden_size': {'lin': (128, 384)},
    # NOTE(review): this is the string 'True', not the boolean True — confirm
    # that the network code expects a string here.
    'use_batch_norm': 'True',
    'keep_prob': {'lin': (0.3, 0.70, 2)},
    'input_keep_prob': {'lin': (0.65, 0.95, 2)},
    'batch_norm_decay': 0.99,  # {'inv-log': (0.9, 0.99, 2)},
}

# Learning-rate settings, identical for every candidate net.
learning_dict = {
    'learning_rate': 0.001,
    'decay_steps': 20,
    'decay_rate': 0.99,  # {'inv-log': (0.99, 0.999, 3)}
}

# Settings handed to the ensemble's training calls.
train_dict = {
    'batch_size': 512,
    'epochs': 100,
    'skip_steps': 20,
}

# Metric used for reporting here and for picking the top nets later on.
key = 'f1_score'

no_of_nets = 5    # candidates added per pass
no_of_loops = 50  # passes, i.e. 5 * 50 = 250 candidates in total

for _ in range(no_of_loops):
    # Add a fresh group of candidates, train whatever has not been trained yet
    # (as the method name suggests), then report the statistics for `key`.
    ens.add_nets(
        RepslyFC,
        arch=arch,
        data=repsly_data,
        learning_dict=learning_dict,
        no_of_nets=no_of_nets,
    )
    ens.train_untrained(train_dict)
    ens.print_stat_by_key(key)


RepslyFC/no_of_layers-6/hidden_size-369/use_batch_norm-True/keep_prob-0.6/input_keep_prob-0.76/batch_norm_decay-0.99/lr-0.001/dr-0.99/ds-20
RepslyFC/no_of_layers-7/hidden_size-223/use_batch_norm-True/keep_prob-0.67/input_keep_prob-0.66/batch_norm_decay-0.99/lr-0.001/dr-0.99/ds-20
RepslyFC/no_of_layers-6/hidden_size-258/use_batch_norm-True/keep_prob-0.38/input_keep_prob-0.78/batch_norm_decay-0.99/lr-0.001/dr-0.99/ds-20
RepslyFC/no_of_layers-5/hidden_size-300/use_batch_norm-True/keep_prob-0.6/input_keep_prob-0.88/batch_norm_decay-0.99/lr-0.001/dr-0.99/ds-20
RepslyFC/no_of_layers-6/hidden_size-177/use_batch_norm-True/keep_prob-0.42/input_keep_prob-0.81/batch_norm_decay-0.99/lr-0.001/dr-0.99/ds-20
################################################################################
Checkpoint directory is: /Users/davor/projects/deep_learning/repsly_challenge/checkpoints/RepslyFC/no_of_layers-6/hidden_size-369/use_batch_norm-True/keep_prob-0.6/input_keep_prob-0.76/batch_norm_decay-0.99/lr-0.001/

We will train the best candidates a little bit more:

In [None]:
# Extra training rounds for the best nets as ranked by `key`.
# Both counters are zero, so the loop body below never runs as written; raise
# them to actually continue training the top candidates.
no_of_top_nets = 0
no_of_loops = 0

for _ in range(no_of_loops):
    ens.train_top_nets_by_key_stat(key, no_of_top_nets, train_dict)
    ens.print_stat_by_key(key)

In [None]:
# Report the ensemble's statistics for the F1 score.
ens.print_stat_by_key('f1_score')

In [None]:
# Same report for the loss. reverse=True flips the ordering — presumably so
# that the lowest loss is listed first (TODO confirm in Ensamble).
ens.print_stat_by_key('loss', reverse=True)

In [None]:
# Fixed (non-searched) configuration: every entry is a single concrete value.
# NOTE: this rebinds arch, learning_dict and train_dict from the search cell.
arch = {
    'no_of_layers': 6,
    'hidden_size': 256,
    # NOTE(review): string 'True', not the boolean — confirm this is intended.
    'use_batch_norm': 'True',
    'keep_prob': 0.68,
    'input_keep_prob': 0.72,
    'batch_norm_decay': 0.99,
}

# Learning-rate settings.
learning_dict = {
    'learning_rate': 0.001,
    'decay_steps': 20,
    'decay_rate': 0.99,
}

# Training-loop settings.
train_dict = {
    'batch_size': 512,
    'epochs': 100,
    'skip_steps': 20,
}
