# Imports

In [None]:
import os
import time
import datetime
import pickle
import random
from datetime import datetime

import numpy as np
import pandas as pd

from scipy.stats import yulesimon

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('white')

from sklearn.model_selection import train_test_split

import tensorflow as tf

import keras
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras import backend as K
from keras import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization, Conv1D, Flatten
from keras.optimizers import Adam, SGD
from keras.regularizers import l1, l2, l1_l2
from keras.metrics import mean_squared_error

# fix random seed for reproducibility
def fix_random(seed):
    '''
    Seed every random number generator used in this notebook.

    params:
        seed: integer seed applied to NumPy, Python's `random` module and TensorFlow
    '''
    # The variable must be spelled PYTHONHASHSEED to be recognised by Python.
    # NOTE: the interpreter reads it at start-up only, so assigning it here
    # affects processes spawned later, not hashing in the running kernel.
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)

# Generate data
### &nbsp;&nbsp; *num_alphas*: number of alphas generated between (min_alpha) and (max_alpha) inclusive
### &nbsp;&nbsp; *samples_per_alpha*: number of samples (rows) for each alpha 
### &nbsp;&nbsp; *N* : number of random variates (number of samples drawn from yulesimon distribution)
### &nbsp;&nbsp; *M* : maximum value of random variates (length of input vectors == number of features)

In [None]:
def generate_data(num_alphas, samples_per_alpha, N, min_alpha=2.01, max_alpha=3.00, random_alpha=False, random_state=0):
    '''
    Build a dataset of log-scaled value histograms of Yule-Simon samples,
    each row labelled with the alpha that generated it.

    params:
        num_alphas: number of alphas generated between (min_alpha) and (max_alpha) inclusive
        samples_per_alpha: number of samples (rows) for each alpha
        N: number of RV samples drawn per row
        min_alpha, max_alpha: bounds of the alpha range
        random_alpha: if True, alphas are drawn uniformly at random from the
            range; otherwise they are evenly spaced over it
        random_state: int seed (or None) used to build ONE generator that
            drives every random step below; an already constructed NumPy
            generator object is used as is

    returns:
        logH: array of shape (num_alphas * samples_per_alpha, M) holding
            log10(count + 1); column k is the count of the value k + 1
        y: the alpha of each row
        M: largest value drawn over the whole dataset (== number of features)
    '''

    # fix loc at zero
    loc = 0

    # A single generator is shared by all draws. Passing the bare integer seed
    # to every rvs() call would reseed each call identically and make all rows
    # of a given alpha exact copies of one another.
    if random_state is None or isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    if random_alpha:
        alphas = rng.uniform(low=min_alpha, high=max_alpha, size=num_alphas)
    else:
        alphas = np.linspace(min_alpha, max_alpha, num=num_alphas)

    # column 0 holds the label (alpha), the remaining N columns the variates
    X = np.empty((num_alphas * samples_per_alpha, N + 1), float)

    # generate all samples (rows) of one alpha with a single rvs() call
    for block, alpha in enumerate(alphas):
        start = block * samples_per_alpha
        stop = start + samples_per_alpha
        X[start:stop, 0] = alpha
        X[start:stop, 1:] = yulesimon.rvs(alpha, loc=loc, size=(samples_per_alpha, N), random_state=rng)

    # shuffle rows (label and variates move together)
    rng.shuffle(X)

    # separate X from y
    y = X[:, 0]
    X = X[:, 1:].astype(int)

    # create a histogram (H) from (X) rows
    # bincount gives one column per integer value, so columns are aligned
    # across rows. np.histogram(a, bins=nbins) without `range` would take the
    # bin edges from each row's own min/max instead.
    nbins = np.max(X)
    H = np.apply_along_axis(np.bincount, 1, X, minlength=int(nbins) + 1)
    # with loc == 0 the variates start at 1, so the count of value 0 is dropped
    H = H[:, 1:]

    # log scale (H) rows
    logH = np.log10(H + 1)

    return logH, y, nbins # (nbins == M)

# Create Model

In [None]:
def create_model(n_features, filters=32):
    '''
    Assemble and compile the 1-D convolutional regression network.

    params:
        n_features: length of each input vector (inputs are shaped (n_features, 1))
        filters: number of filters of the Conv1D layer
    '''
    # Conv1D (kernel size 2) -> Flatten -> Dense(64) -> single linear output
    layers = [
        Conv1D(filters, 2, activation="relu", input_shape=(n_features, 1)),
        Flatten(),
        Dense(64, activation="relu"),
        Dense(1),
    ]
    network = Sequential(layers)

    # mean squared error loss, Adam optimizer with its default settings
    network.compile(loss="mse", optimizer="adam")
    return network


# Training

In [None]:
def train(X_train, y_train, filters=32, batch_size=32, random_state=0):
    '''
    Fit a fresh model on (X_train, y_train) and return its best checkpoint.

    A quarter of the given data is held out for validation. Training runs for
    at most 200 epochs with early stopping, learning-rate reduction on plateau
    and checkpointing, all monitoring `val_loss`. The checkpoint and the
    pickled training history are written under the `models` directory.

    params:
        X_train, y_train: training inputs and targets (split again into train/val here)
        filters: number of Conv1D filters passed to create_model
        batch_size: batch size used by model.fit
        random_state: seed of the train/validation split

    returns:
        model: the model reloaded from the best saved checkpoint
        history: the `history.history` dict produced by model.fit
    '''

    # split train/val
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
        test_size=0.25, random_state=random_state)

    # create model
    model = create_model(X_train.shape[1], filters=filters)

    # early-stopping
    es = EarlyStopping(monitor='val_loss',
                        patience=50,
                        mode='min',
                        restore_best_weights=True,
                        verbose=0)

    # model checkpoint
    # exist_ok avoids the check-then-create race of testing the path first
    os.makedirs('models', exist_ok=True)
    date_str = datetime.now().strftime('%d-%m-%Y_%H-%M-%S')
    base_path       = 'models/yulesimon_{}'.format(date_str)
    model_path      = '{}.h5'.format(base_path)
    history_path    = '{}.history'.format(base_path)

    cp = ModelCheckpoint(filepath=model_path, monitor='val_loss', mode='min', save_best_only=True, verbose=0)

    # reduce learning-rate on plateau
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.95, patience=10)

    # fit model
    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_val, y_val),
                        epochs=200,
                        batch_size=batch_size,
                        shuffle=False,
                        callbacks=[es, reduce_lr, cp],
                        verbose=0)

    # save history with model
    with open(history_path, 'wb') as f:
        pickle.dump(history.history, f)

    # load best weights from last checkpoint
    model = keras.models.load_model(model_path)
    return model, history.history

In [None]:
def plot_learning_curves(history, train_key='loss', val_key='val_loss'):
    '''
    Plot the training and validation curves and print their minima.

    params:
        history: dict mapping a metric name to its per-epoch values
        train_key: key of the training curve in `history`
        val_key: key of the validation curve in `history`
    '''
    plt.figure(figsize=(2,2))
    plt.plot(history[train_key])
    plt.plot(history[val_key])
    plt.title('learning curves')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()
    # report the minima of the curves that were plotted, so the keys passed
    # by the caller are honoured (output is unchanged for the default keys)
    print('{}:'.format(train_key), np.min(history[train_key]))
    print('{}:'.format(val_key), np.min(history[val_key]))

# Trials

In [None]:
def trial(random_alpha=False):
    '''
    Train and evaluate the model for N = 32 .. 2048 (powers of two).

    For every N one dataset is generated and split into train/test; the model
    is then trained once per random state and scored on the test split.

    params:
        random_alpha: forwarded to generate_data (random vs. evenly spaced alphas)

    returns:
        abs_errors: absolute test errors of all runs, concatenated
        a_N: the N values that were tried
        a_sqrt_mse: for each N, sqrt(mse) averaged over the random states
    '''

    print('Processing...')
    print()

    # accumulate in lists and convert once at the end
    # (np.append copies the whole array on every call)
    sample_sizes = []
    avg_errors = []
    abs_error_chunks = []

    # 10 random states
    random_states = [0, 3, 5, 8, 11, 16, 17, 20, 21, 24]

    # change N [32..2048]
    for i in range(5, 12):

        N = 2**i
        sample_sizes.append(N)

        X, y, M = generate_data(num_alphas=100, samples_per_alpha=100, N=N, random_alpha=random_alpha, random_state=0)

        # reshape X for Conv1D
        X = X.reshape(X.shape[0], X.shape[1], 1)

        # split train/test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

        sqrt_mse_sum = 0

        # change random state
        for rs in random_states:

            # fix random
            fix_random(seed=rs)

            # training (the returned history is not needed here)
            model, _ = train(X_train, y_train, filters=32, batch_size=32, random_state=rs)

            # predict
            y_pred = model.predict(X_test).flatten()

            # sqrt_mse, computed with NumPy so the result is a plain float
            # rather than a backend tensor
            residuals = y_test - y_pred
            sqrt_mse = np.sqrt(np.mean(np.square(residuals)))
            sqrt_mse_sum += sqrt_mse

            print('N = {}, M = {}, random_state = {}    =>    sqrt_mse = {:.6f}'.format(N, M, rs, sqrt_mse))

            # absolute errors
            abs_error_chunks.append(np.abs(residuals))

        # avg_sqrt_mse
        avg_sqrt_mse = sqrt_mse_sum / len(random_states)
        avg_errors.append(avg_sqrt_mse)
        print('N = {}, M = {}    =>    avg_sqrt_mse = {:.6f}'.format(N, M, avg_sqrt_mse))
        print()

    abs_errors = np.concatenate(abs_error_chunks).astype(float)
    a_N = np.array(sample_sizes, dtype=float)
    a_sqrt_mse = np.array(avg_errors, dtype=float)

    return abs_errors, a_N, a_sqrt_mse

In [None]:
# Run the full experiment twice: first with evenly spaced alphas, then with
# randomly drawn alphas. The results are kept as module-level names that the
# plotting cells below read.
abs_errors_1, a_N_1, a_sqrt_mse_1 = trial(random_alpha=False)
abs_errors_2, a_N_2, a_sqrt_mse_2 = trial(random_alpha=True)

# plot log(N) vs sqrt_mse

In [None]:
def plot_sqrt_mse():
    '''
    Scatter the averaged sqrt_mse against log10(N), one panel per trial.

    Reads the module-level results of both trial runs
    (a_N_1 / a_sqrt_mse_1 and a_N_2 / a_sqrt_mse_2).
    '''
    # (title, N values, averaged errors) for the left and the right panel
    panels = (
        ('random_alpha = False', a_N_1, a_sqrt_mse_1),
        ('random_alpha = True', a_N_2, a_sqrt_mse_2),
    )

    plt.figure(figsize=(15, 4))

    for position, (title, n_values, errors) in enumerate(panels, start=1):
        axis = plt.subplot(1, 2, position)
        axis.set(title=title, xlabel='log(N)', ylabel='sqrt_mse')
        axis.scatter(np.log10(n_values), errors)

plot_sqrt_mse()

# plot error distribution

In [None]:
# Distribution of the absolute test errors from the trial run with
# random_alpha=False; the returned axes object is discarded via `_`.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider histplot/displot once the installed version is confirmed.
_ = sns.distplot(abs_errors_1)

In [None]:
# Distribution of the absolute test errors from the trial run with
# random_alpha=True; the returned axes object is discarded via `_`.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider histplot/displot once the installed version is confirmed.
_ = sns.distplot(abs_errors_2)