# Imports

In [1]:
import numpy as np
from scipy import stats
import random

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# import library
%run lib.ipynb

# Generate data (yulesimon)

In [2]:
from scipy.stats import yulesimon

def sample_yulesimon(alpha, size):
    return yulesimon.rvs(alpha, loc=0, size=size)

def next_alpha(min_alpha=2.0, max_alpha=3.0):
    alpha = np.random.uniform(low=min_alpha, high=max_alpha, size=1)[0]
    return alpha

# generate data (yulesimon)
def generate_data_yulesimon(N, M, density=False):
    H, alphas = generate_data(N=N, M=M, nextConfig=next_alpha, sample=sample_yulesimon)

    # split train/test
    # (use train_test_split so the shape of the train/test data will be the same)
    H_train, H_test, y_train, y_test = train_test_split(H, alphas, test_size=0.25)
    
    return H_train, y_train, H_test, y_test

# Experiments

## Log Experiment
Run an experiment to show that using the log function improves something, either accuracy or learning rate, or both.
Use fixed setting, e.g, YS, samples = 256 or 512.
In this experiment use the LOG(1+X) function.

In [3]:
def log_sacle(H, C=0):
    # log scale H rows
    # (shift H values by one so as not to take log of zero)
    return np.apply_along_axis(lambda a: np.log10(a + 1 + C), 1, H)

def normalize(H):
    # normalize values (sum to 1)
    return H / H.sum(axis=1, keepdims=1)

def Experiment(M_values, N=10000, C_values=[0], train_log_scale=[True, False]):

    trials_out = []

    d = {
        'M': [], 
        'm': [],
        'C': [], 
        'sqrt-mse': [],
        'train-time (sec.)': []
    }
    
    for M in M_values:
        
        print()
        print(f'generating dataset (M={M}) ... ', end='')
        X_train, y_train, X_test, y_test = generate_data_yulesimon(N=N, M=M)
        print(f'train input.shape: {X_train.shape}')
        m = X_train.shape[1]

        for C in C_values:

            for log_scale in train_log_scale:

                if log_scale:
                    X_train = log_sacle(X_train, C=C)
                    X_test = log_sacle(X_test, C=C)
                    print(f'training - log-scale=True, C={C}', end=' ... ')
                else:
                    print(f'training - log-scale=False', end=' ... ')

                

                start_time = time.time()
                dnn_model, history, y_pred, sqrt_mse = dnn_trial(X_train, y_train, X_test, y_test)
                train_time = round(time.time() - start_time)

                # debug print
                loss = np.min(history['loss'])
                val_loss = np.min(history['val_loss'])
                print(f'sqrt-mse: {sqrt_mse:.4f}', end=', ')
                print(f'loss: {loss:.4f}', end=', ')
                print(f'val_loss: {val_loss:.4f}', end=', ')
                print(f'train-time: {round(train_time)} sec.')

                trials_out.append({
                    'model': dnn_model,
                    'history': history,
                    'M': M,
                    'm': m,
                    'y-test': y_test,
                    'y-pred': y_pred,
                    'sqrt-mse': sqrt_mse
                })

                d['M'].append(M)
                d['m'].append(m)
                d['C'].append(C)
                d['sqrt-mse'].append(round(sqrt_mse, 4))
                d['train-time (sec.)'].append(f'{train_time}')

    df = pd.DataFrame(data=d)
    return trials_out, df

In [4]:
trials_out_1, df_1 = Experiment(
    N=1000, 
    M_values=[256, 512, 1024, 2048, 4096, 8192], 
    C_values=[0], 
    train_log_scale=[True, False])
df_1


generating dataset (M=256) ... train input.shape: (750, 511)
training - log-scale=True, C=0 ... sqrt-mse: 0.2227, loss: 0.0068, val_loss: 0.0444, train-time: 7 sec.
training - log-scale=False ... sqrt-mse: 0.2236, loss: 0.0057, val_loss: 0.0584, train-time: 6 sec.

generating dataset (M=512) ... train input.shape: (750, 276)
training - log-scale=True, C=0 ... sqrt-mse: 0.1898, loss: 0.0061, val_loss: 0.0323, train-time: 9 sec.
training - log-scale=False ... sqrt-mse: 0.1986, loss: 0.0054, val_loss: 0.0349, train-time: 7 sec.

generating dataset (M=1024) ... train input.shape: (750, 1300)
training - log-scale=True, C=0 ... sqrt-mse: 0.1456, loss: 0.0044, val_loss: 0.0196, train-time: 13 sec.
training - log-scale=False ... sqrt-mse: 0.1520, loss: 0.0061, val_loss: 0.0215, train-time: 11 sec.

generating dataset (M=2048) ... train input.shape: (750, 656)
training - log-scale=True, C=0 ... sqrt-mse: 0.1134, loss: 0.0052, val_loss: 0.0158, train-time: 7 sec.
training - log-scale=False ... 

Unnamed: 0,M,m,C,sqrt-mse,train-time (sec.)
0,256,511,0,0.2227,7
1,256,511,0,0.2236,6
2,512,276,0,0.1898,9
3,512,276,0,0.1986,7
4,1024,1300,0,0.1456,13
5,1024,1300,0,0.152,11
6,2048,656,0,0.1134,7
7,2048,656,0,0.1076,7
8,4096,1387,0,0.098,13
9,4096,1387,0,0.0864,11


## Log shift experiment
Run an experiment to see which C is better if using the function LOG(1+C)
Try, e.g., C = 0.2, 0.5, 1, 2,

In [5]:
trials_out_2, df_2 = Experiment(
    N=1000, 
    M_values=[256, 512], 
    C_values=[0, 0.2, 0.5, 1, 2], 
    train_log_scale=[True])
df_2


generating dataset (M=256) ... train input.shape: (750, 321)
training - log-scale=True, C=0 ... sqrt-mse: 0.2358, loss: 0.0079, val_loss: 0.0507, train-time: 5 sec.
training - log-scale=True, C=0.2 ... sqrt-mse: 0.2322, loss: 0.0084, val_loss: 0.0659, train-time: 5 sec.
training - log-scale=True, C=0.5 ... sqrt-mse: 0.2488, loss: 0.0076, val_loss: 0.0645, train-time: 5 sec.
training - log-scale=True, C=1 ... sqrt-mse: 0.2612, loss: 0.0085, val_loss: 0.0673, train-time: 6 sec.
training - log-scale=True, C=2 ... sqrt-mse: 0.4926, loss: 0.0173, val_loss: 0.2851, train-time: 4 sec.

generating dataset (M=512) ... train input.shape: (750, 431)
training - log-scale=True, C=0 ... sqrt-mse: 0.1807, loss: 0.0067, val_loss: 0.0317, train-time: 7 sec.
training - log-scale=True, C=0.2 ... sqrt-mse: 0.1841, loss: 0.0056, val_loss: 0.0313, train-time: 7 sec.
training - log-scale=True, C=0.5 ... sqrt-mse: 0.1780, loss: 0.0069, val_loss: 0.0376, train-time: 7 sec.
training - log-scale=True, C=1 ... s

Unnamed: 0,M,m,C,sqrt-mse,train-time (sec.)
0,256,321,0.0,0.2358,5
1,256,321,0.2,0.2322,5
2,256,321,0.5,0.2488,5
3,256,321,1.0,0.2612,6
4,256,321,2.0,0.4926,4
5,512,431,0.0,0.1807,7
6,512,431,0.2,0.1841,7
7,512,431,0.5,0.178,7
8,512,431,1.0,0.2321,6
9,512,431,2.0,0.3728,4


## Possion experiment
Learn Poisson in three different settings:

Use exact values.  
Use Log.  
Use P values, defined from the poisson thingy.  
P(k) = Log(MEASURED(k) * k!)

In [6]:
# TODO

## Bias experiment
Same experiment as usual, but make sure the prediction is unbiased. Average difference should be close to zero

In [8]:
def test_bias(trials, df):
    d = {
        'M': [], 
        'm': [], 
        'sqrt-mse': [],
        'Bias': [],
    }

    for i in range(len(trials)):
        d['M'].append(trials[i]['M'])
        d['m'].append(trials[i]['m'])
        y_test = np.round(trials[i]['y-test'], decimals=2)
        y_pred = np.round(trials[i]['y-pred'], decimals=2)
        avg_diff = np.average(y_test - y_pred)
        d['Bias'].append(np.round(avg_diff, decimals=4))
        d['sqrt-mse'].append(np.round(trials[i]['sqrt-mse'], decimals=4))

    df = pd.DataFrame(data=d)
    return df

df_bias_1 = test_bias(trials=trials_out_1, df=df_1)
df_bias_1

Unnamed: 0,M,m,sqrt-mse,Bias
0,256,511,0.2227,-0.0014
1,256,511,0.2236,-0.0106
2,512,276,0.1898,0.0137
3,512,276,0.1986,0.032
4,1024,1300,0.1456,-0.0279
5,1024,1300,0.152,-0.0242
6,2048,656,0.1134,0.004
7,2048,656,0.1076,-0.0024
8,4096,1387,0.098,0.0152
9,4096,1387,0.0864,0.0
