# Imports

In [1]:
import numpy as np
from scipy import stats
import random

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# import library
%run lib.ipynb

# Generate data (yulesimon)

In [2]:
from scipy.stats import yulesimon

def sample_yulesimon(alpha, size):
    return yulesimon.rvs(alpha, loc=0, size=size)

def next_alpha(min_alpha=2.0, max_alpha=3.0):
    alpha = np.random.uniform(low=min_alpha, high=max_alpha, size=1)[0]
    return alpha

# generate data (yulesimon)
def generate_data_yulesimon(N, M, density=False):
    H, alphas = generate_data(N=N, M=M, nextConfig=next_alpha, sample=sample_yulesimon)

    # split train/test
    # (use train_test_split so the shape of the train/test data will be the same)
    H_train, H_test, y_train, y_test = train_test_split(H, alphas, test_size=0.25)
    
    return H_train, y_train, H_test, y_test

# Experiments

## Log Experiment
Run an experiment to show that using the log function improves something, either accuracy or learning rate, or both.
Use fixed setting, e.g, YS, samples = 256 or 512.
In this experiment use the LOG(1+X) function.

In [3]:
def log_scale(H, C=0):
    # log scale H rows
    # (shift H values by one so as not to take log of zero)
    return np.apply_along_axis(lambda a: np.log10(a + 1 + C), 1, H)

def normalize(H):
    # normalize values (sum to 1)
    return H / H.sum(axis=1, keepdims=1)

def Experiment(M_values, N=10000, C_values=[0], apply_log_scale=[True, False]):

    trials_out = []

    d = {
        'M': [], 
        'm': [],
        'log-scale': [],
        'C': [], 
        'sqrt-mse': [],
        'train-time (sec.)': []
    }
    
    for M in M_values:
        
        print()
        print(f'generating dataset (M={M}) ... ', end='')
        X_train, y_train, X_test, y_test = generate_data_yulesimon(N=N, M=M)
        print(f'train input.shape: {X_train.shape}')
        m = X_train.shape[1]

        for C in C_values:

            for ls in apply_log_scale:

                if ls:
                    X_train = log_scale(X_train, C=C)
                    X_test = log_scale(X_test, C=C)
                    print(f'training - log-scale=True, C={C}', end=' ... ')
                else:
                    print(f'training - log-scale=False', end=' ... ')

                

                start_time = time.time()
                dnn_model, history, y_pred, sqrt_mse = dnn_trial(X_train, y_train, X_test, y_test)
                train_time = round(time.time() - start_time)

                # debug print
                loss = np.min(history['loss'])
                val_loss = np.min(history['val_loss'])
                print(f'sqrt-mse: {sqrt_mse:.4f}', end=', ')
                print(f'loss: {loss:.4f}', end=', ')
                print(f'val_loss: {val_loss:.4f}', end=', ')
                print(f'train-time: {round(train_time)} sec.')

                trials_out.append({
                    'model': dnn_model,
                    'history': history,
                    'M': M,
                    'm': m,
                    'y-test': y_test,
                    'y-pred': y_pred,
                    'sqrt-mse': sqrt_mse
                })

                d['M'].append(M)
                d['m'].append(m)
                d['log-scale'].append(ls)
                d['C'].append(C)
                d['sqrt-mse'].append(round(sqrt_mse, 4))
                d['train-time (sec.)'].append(f'{train_time}')

    df = pd.DataFrame(data=d)
    return trials_out, df

In [4]:
N = 10000
trials_out_1, df_1 = Experiment(
    N=N, 
    M_values=[256, 512, 1024, 2048, 4096, 8192], 
    C_values=[0], 
    apply_log_scale=[True, False])
df_1


generating dataset (M=256) ... train input.shape: (7500, 1387)
training - log-scale=True, C=0 ... sqrt-mse: 0.1986, loss: 0.0262, val_loss: 0.0519, train-time: 27 sec.
training - log-scale=False ... sqrt-mse: 0.2105, loss: 0.0271, val_loss: 0.0502, train-time: 28 sec.

generating dataset (M=512) ... train input.shape: (7500, 1869)
training - log-scale=True, C=0 ... sqrt-mse: 0.1700, loss: 0.0204, val_loss: 0.0349, train-time: 34 sec.
training - log-scale=False ... sqrt-mse: 0.1816, loss: 0.0184, val_loss: 0.0402, train-time: 40 sec.

generating dataset (M=1024) ... train input.shape: (7500, 2396)
training - log-scale=True, C=0 ... sqrt-mse: 0.1337, loss: 0.0139, val_loss: 0.0222, train-time: 44 sec.
training - log-scale=False ... sqrt-mse: 0.1443, loss: 0.0128, val_loss: 0.0245, train-time: 64 sec.

generating dataset (M=2048) ... train input.shape: (7500, 2233)
training - log-scale=True, C=0 ... sqrt-mse: 0.1152, loss: 0.0095, val_loss: 0.0160, train-time: 68 sec.
training - log-scal

Unnamed: 0,M,m,log-scale,C,sqrt-mse,train-time (sec.)
0,256,1387,True,0,0.1986,27
1,256,1387,False,0,0.2105,28
2,512,1869,True,0,0.17,34
3,512,1869,False,0,0.1816,40
4,1024,2396,True,0,0.1337,44
5,1024,2396,False,0,0.1443,64
6,2048,2233,True,0,0.1152,68
7,2048,2233,False,0,0.1005,39
8,4096,2645,True,0,0.0754,71
9,4096,2645,False,0,0.094,65


## Log shift experiment
Run an experiment to see which C is better if using the function LOG(1+C)
Try, e.g., C = 0.2, 0.5, 1, 2,

In [5]:
trials_out_2, df_2 = Experiment(
    N=N, 
    M_values=[256, 512], 
    C_values=[0.0, 0.2, 0.5, 1.0, 2.0], 
    apply_log_scale=[True])
df_2


generating dataset (M=256) ... train input.shape: (7500, 976)
training - log-scale=True, C=0.0 ... sqrt-mse: 0.2102, loss: 0.0270, val_loss: 0.0470, train-time: 21 sec.
training - log-scale=True, C=0.2 ... sqrt-mse: 0.2069, loss: 0.0268, val_loss: 0.0446, train-time: 22 sec.
training - log-scale=True, C=0.5 ... sqrt-mse: 0.2396, loss: 0.0278, val_loss: 0.0602, train-time: 22 sec.
training - log-scale=True, C=1.0 ... sqrt-mse: 0.3645, loss: 0.0272, val_loss: 0.1438, train-time: 36 sec.
training - log-scale=True, C=2.0 ... sqrt-mse: 0.5607, loss: 0.0370, val_loss: 0.3075, train-time: 21 sec.

generating dataset (M=512) ... train input.shape: (7500, 809)
training - log-scale=True, C=0.0 ... sqrt-mse: 0.1663, loss: 0.0195, val_loss: 0.0354, train-time: 20 sec.
training - log-scale=True, C=0.2 ... sqrt-mse: 0.1926, loss: 0.0157, val_loss: 0.0417, train-time: 45 sec.
training - log-scale=True, C=0.5 ... sqrt-mse: 0.1916, loss: 0.0202, val_loss: 0.0423, train-time: 25 sec.
training - log-sca

Unnamed: 0,M,m,log-scale,C,sqrt-mse,train-time (sec.)
0,256,976,True,0.0,0.2102,21
1,256,976,True,0.2,0.2069,22
2,256,976,True,0.5,0.2396,22
3,256,976,True,1.0,0.3645,36
4,256,976,True,2.0,0.5607,21
5,512,809,True,0.0,0.1663,20
6,512,809,True,0.2,0.1926,45
7,512,809,True,0.5,0.1916,25
8,512,809,True,1.0,0.2328,37
9,512,809,True,2.0,0.2982,37


## Possion experiment
Learn Poisson in three different settings:

Use exact values.  
Use Log.  
Use P values, defined from the poisson thingy.  
P(k) = Log(MEASURED(k) * k!)

In [6]:
# TODO

## Bias experiment
Same experiment as usual, but make sure the prediction is unbiased. Average difference should be close to zero

In [7]:
def test_bias(trials, df):
    d = {
        'M': [], 
        'm': [], 
        'sqrt-mse': [],
        'Bias': [],
    }

    for i in range(len(trials)):
        d['M'].append(trials[i]['M'])
        d['m'].append(trials[i]['m'])
        y_test = np.round(trials[i]['y-test'], decimals=2)
        y_pred = np.round(trials[i]['y-pred'], decimals=2)
        avg_diff = np.average(y_test - y_pred)
        d['Bias'].append(np.round(avg_diff, decimals=4))
        d['sqrt-mse'].append(np.round(trials[i]['sqrt-mse'], decimals=4))

    df = pd.DataFrame(data=d)
    return df

df_bias_1 = test_bias(trials=trials_out_1, df=df_1)
df_bias_1

Unnamed: 0,M,m,sqrt-mse,Bias
0,256,1387,0.1986,0.003
1,256,1387,0.2105,-0.0264
2,512,1869,0.17,-0.0155
3,512,1869,0.1816,0.003
4,1024,2396,0.1337,-0.0115
5,1024,2396,0.1443,-0.0001
6,2048,2233,0.1152,0.0189
7,2048,2233,0.1005,-0.0123
8,4096,2645,0.0754,0.0064
9,4096,2645,0.094,0.0182
