# Imports

In [1]:
import numpy as np
from scipy import stats
import random

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# import library
%run lib.ipynb

# Generate data (yulesimon)

In [2]:
from scipy.stats import yulesimon

def sample_yulesimon(alpha, size):
    """Draw ``size`` random variates from a Yule-Simon distribution.

    Thin wrapper around ``scipy.stats.yulesimon.rvs`` with shape parameter
    ``alpha`` and the location fixed at 0.
    """
    draws = yulesimon.rvs(alpha, size=size, loc=0)
    return draws

def next_alpha(min_alpha=2.0, max_alpha=3.0):
    """Pick one alpha uniformly at random between ``min_alpha`` and ``max_alpha``.

    Uses NumPy's global random state, so seed it with ``np.random.seed`` for
    reproducible draws.
    """
    # size=1 yields a length-one array; unpack its single element.
    (picked,) = np.random.uniform(low=min_alpha, high=max_alpha, size=1)
    return picked

# generate data (yulesimon)
def generate_data_yulesimon(N, M, density=False):
    """Generate a Yule-Simon dataset and split it into train and test parts.

    ``N`` and ``M`` are forwarded unchanged to ``generate_data`` together with
    ``next_alpha`` (as ``nextConfig``) and ``sample_yulesimon`` (as ``sample``).
    ``density`` is accepted but not used by this function.

    NOTE(review): ``generate_data`` and ``train_test_split`` are not defined in
    this cell; presumably they are provided by ``lib.ipynb`` -- confirm.

    Returns
    -------
    tuple
        ``(H_train, y_train, H_test, y_test)`` with 25% of the data held out
        for testing.
    """
    histograms, alpha_values = generate_data(
        N=N,
        M=M,
        nextConfig=next_alpha,
        sample=sample_yulesimon,
    )

    # train_test_split is used so the shape of the train/test data will be the same
    split = train_test_split(histograms, alpha_values, test_size=0.25)
    H_train, H_test, y_train, y_test = split

    return H_train, y_train, H_test, y_test

# Trials: train on H vs. on logH

In [None]:
def normalize(H):
    """Scale every row of ``H`` so that its entries sum to 1."""
    # keepdims keeps the totals as a column so they broadcast across each row
    row_totals = H.sum(axis=1, keepdims=True)
    return H / row_totals

def log_sacle(H):
    """Return ``log10(H + 1)`` computed element-wise.

    The values are shifted by one before taking the logarithm so that zero
    entries map to 0 instead of ``-inf``.

    ``np.log10`` is already element-wise, so it is applied to the whole array
    at once instead of row by row; this also lets 1-D input through.

    NOTE: the function name is a misspelling of "log_scale"; it is kept as-is
    because ``trial1`` calls it by this name.

    Parameters
    ----------
    H : array-like
        Non-negative values (any shape).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``H``.
    """
    # shift H values by one so as not to take log of zero
    shifted = np.asanyarray(H + 1)
    return np.log10(shifted)

def trial1(M_values, N=10000):
    """Train the DNN on log-scaled vs. unscaled inputs for each M and tabulate the results.

    For every ``M`` in ``M_values`` a fresh dataset is generated with
    ``generate_data_yulesimon`` and ``dnn_trial`` is run twice: once on the
    log-scaled inputs and once on the unscaled inputs. Each run starts from
    the same untouched train/test split, so the transformation applied for
    one run never leaks into the next.

    Parameters
    ----------
    M_values : iterable of int
        Values passed as ``M`` to ``generate_data_yulesimon``.
    N : int, optional
        Value passed as ``N`` to ``generate_data_yulesimon`` (default 10000).

    Returns
    -------
    trials_out : list of dict
        One entry per run with keys 'model', 'history', 'y_test', 'y_pred'
        and 'sqrt-mse'.
    df : pandas.DataFrame
        One row per run with columns 'M', 'm', 'log-scale', 'sqrt-mse' and
        'train-time (sec.)'.
    """
    # Imported here so the function does not depend on `time` leaking in from
    # another cell or from lib.ipynb.
    import time

    trials_out = []

    d = {
        'M': [],
        'm': [],
        'log-scale': [],
        #'density': [], #lilo:density
        'sqrt-mse': [],
        'train-time (sec.)': []
    }

    for M in M_values:

        print()
        print(f'generating dataset (M: {M}) ... ', end='')
        X_train_raw, y_train, X_test_raw, y_test = generate_data_yulesimon(N=N, M=M)
        print(f'training input.shape: {X_train_raw.shape}')
        m = X_train_raw.shape[1]

        #lilo:density
        #for density in [True, False]:
        for density in [False]:

            # Always derive from the raw split; never overwrite it, otherwise
            # later iterations would see already-transformed data.
            if density:
                X_train_base = normalize(X_train_raw)
                X_test_base = normalize(X_test_raw)
            else:
                X_train_base, X_test_base = X_train_raw, X_test_raw

            for log_scale in [True, False]:

                if log_scale:
                    X_train_in = log_sacle(X_train_base)
                    X_test_in = log_sacle(X_test_base)
                else:
                    X_train_in, X_test_in = X_train_base, X_test_base

                print(f'training - log-scale: {log_scale}', end=' ... ')

                start_time = time.time()
                dnn_model, history, y_pred, sqrt_mse = dnn_trial(X_train_in, y_train, X_test_in, y_test)
                train_time = round(time.time() - start_time)

                # debug print
                loss = np.min(history['loss'])
                val_loss = np.min(history['val_loss'])
                print(f'sqrt-mse: {sqrt_mse:.3f}', end=', ')
                print(f'loss: {loss:.3f}', end=', ')
                print(f'val_loss: {val_loss:.3f}', end=', ')
                print(f'train-time: {train_time} sec.')

                # plot learning curves
                #plot_learning_curves(history)

                trials_out.append({
                    'model': dnn_model,
                    'history': history,
                    'y_test': y_test,
                    'y_pred': y_pred,
                    'sqrt-mse': sqrt_mse
                })

                d['M'].append(M)
                d['m'].append(m)
                d['log-scale'].append(log_scale)
                #d['density'].append(density) #lilo:density
                d['sqrt-mse'].append(round(sqrt_mse, 3))
                d['train-time (sec.)'].append(f'{train_time}')

    df = pd.DataFrame(data=d)

    return trials_out, df

# Run the trial for each M value; trials_out holds the per-run artifacts
# (model, history, predictions) and df the summary table.
trials_out, df = trial1(M_values = [256, 512, 1024, 2048, 4096, 8192])

# Display the summary table as the cell output.
df


generating dataset (M: 256)... training input.shape: (7500, 2462)
training - log-scale: True... 

Running for M in [256, 512, 1024, 2048, 4096]  
For each M run with density=True/False  
Measure execution time  

We can see that:  
1. MSE decreases for larger M
2. Training time on logH is slightly better than training on H

| M    | train | sqrt_mse | time | density | m    |
| ---  | ---   | ---      | ---  | ---     | ---  |
|  256 | H     | 0.209    | 1m   | True    | 2243 |
|  256 | logH  | 0.198    | 1m   | True    | 2243 |
|  256 | H     | 0.209    | 1m   | False   | 1577 |
|  256 | logH  | 0.205    | 30s  | False   | 1577 |
|  512 | H     | 0.168    | 60s  | True    | 3530 |
|  512 | logH  | 0.162    | 2m   | True    | 3530 |
|  512 | H     | 0.163    | 50s  | False   |  880 |
|  512 | logH  | 0.174    | 20s  | False   |  880 |
| 1024 | H     | 0.114    | 2-4m | True    | 2700 |
| 1024 | logH  | 0.114    | 1-3m | True    | 2700 |
| 1024 | H     | 0.124    | 38s  | False   | 1500 |
| 1024 | logH  | 0.141    | 29s  | False   | 1500 |
| 2048 | H     | 0.101    | 53s  | True    | 2306 |
| 2048 | logH  | 0.089    | 1m   | True    | 2306 |
| 2048 | H     | 0.091    | 2m   | False   | 2674 |
| 2048 | logH  | 0.102    | 45s  | False   | 2674 |
| 4096 | H     | 0.071    | 2.5m | True    | 7467 |
| 4096 | logH  | 0.066    | 3m   | True    | 7467 |
| 4096 | H     | 0.068    | 3.5m | False   | 5367 |
| 4096 | logH  | 0.083    | 2m   | False   | 5367 |
