# Imports

In [1]:
import numpy as np
from scipy import stats
import random

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# import library
%run lib.ipynb

# Generate data (yulesimon)

In [2]:
from scipy.stats import yulesimon

def sample_yulesimon(alpha, size):
    return yulesimon.rvs(alpha, loc=0, size=size)

def next_alpha(min_alpha=2.0, max_alpha=3.0):
    alpha = np.random.uniform(low=min_alpha, high=max_alpha, size=1)[0]
    return alpha

# generate data (yulesimon)
def generate_data_yulesimon(N, M, density=False):
    H, alphas = generate_data(N=N, M=M, nextConfig=next_alpha, sample=sample_yulesimon)

    # split train/test
    # (use train_test_split so the shape of the train/test data will be the same)
    H_train, H_test, y_train, y_test = train_test_split(H, alphas, test_size=0.25)
    
    return H_train, y_train, H_test, y_test

# Experiments

## Log Experiment
Run an experiment to show that using the log function improves something, either accuracy or learning rate, or both.
Use fixed setting, e.g, YS, samples = 256 or 512.
In this experiment use the LOG(1+X) function.

In [3]:
def log_sacle(H, C=0):
    # log scale H rows
    # (shift H values by one so as not to take log of zero)
    return np.apply_along_axis(lambda a: np.log10(a + 1 + C), 1, H)

In [4]:
def normalize(H):
    # normalize values (sum to 1)
    return H / H.sum(axis=1, keepdims=1)

def Experiment_1(M_values, N=10000):

    trials_out = []

    d = {
        'M': [], 
        'm': [],
        'log-scale': [],
        'sqrt-mse': [],
        'train-time (sec.)': []
    }
    
    for M in M_values:
        
        print()
        print(f'generating dataset (M: {M}) ... ', end='')
        X_train, y_train, X_test, y_test = generate_data_yulesimon(N=N, M=M)
        print(f'train input.shape: {X_train.shape}')
        m = X_train.shape[1]

        
        #for density in [True, False]:
        for density in [False]:

            if density:
                X_train = normalize(X_train)
                X_test = normalize(X_test)

            for log_scale in [True, False]:

                if log_scale:
                    X_train = log_sacle(X_train)
                    X_test = log_sacle(X_test)

                print(f'training - log-scale: {log_scale}', end=' ... ')

                start_time = time.time()
                dnn_model, history, y_pred, sqrt_mse = dnn_trial(X_train, y_train, X_test, y_test)
                train_time = round(time.time() - start_time)
                
                # debug print
                loss = np.min(history['loss'])
                val_loss = np.min(history['val_loss'])
                print(f'sqrt-mse: {sqrt_mse:.3f}', end=', ')
                print(f'loss: {loss:.3f}', end=', ')
                print(f'val_loss: {val_loss:.3f}', end=', ')
                print(f'train-time: {round(train_time)} sec.')

                trials_out.append({
                    'model': dnn_model,
                    'history': history,
                    'y_test': y_test,
                    'y_pred': y_pred,
                    'sqrt-mse': sqrt_mse
                })
                
                d['M'].append(M)
                d['m'].append(m)
                d['log-scale'].append(log_scale)
                d['sqrt-mse'].append(round(sqrt_mse, 3))
                d['train-time (sec.)'].append(f'{train_time}')

    df = pd.DataFrame(data=d)
    return trials_out, df

In [5]:
trials_out, df = Experiment_1(M_values=[256, 512, 1024, 2048, 4096, 8192])
df


generating dataset (M: 256) ... train input.shape: (7500, 2393)
training - log-scale: True ... sqrt-mse: 0.213, loss: 0.027, val_loss: 0.049, train-time: 40 sec.
training - log-scale: False ... sqrt-mse: 0.209, loss: 0.026, val_loss: 0.049, train-time: 42 sec.

generating dataset (M: 512) ... train input.shape: (7500, 2549)
training - log-scale: True ... sqrt-mse: 0.173, loss: 0.019, val_loss: 0.039, train-time: 45 sec.
training - log-scale: False ... sqrt-mse: 0.172, loss: 0.020, val_loss: 0.036, train-time: 41 sec.

generating dataset (M: 1024) ... train input.shape: (7500, 1872)
training - log-scale: True ... sqrt-mse: 0.137, loss: 0.015, val_loss: 0.024, train-time: 43 sec.
training - log-scale: False ... sqrt-mse: 0.128, loss: 0.014, val_loss: 0.022, train-time: 32 sec.

generating dataset (M: 2048) ... train input.shape: (7500, 2062)
training - log-scale: True ... sqrt-mse: 0.107, loss: 0.010, val_loss: 0.015, train-time: 55 sec.
training - log-scale: False ... sqrt-mse: 0.106, 

Unnamed: 0,M,m,log-scale,sqrt-mse,train-time (sec.)
0,256,2393,True,0.213,40
1,256,2393,False,0.209,42
2,512,2549,True,0.173,45
3,512,2549,False,0.172,41
4,1024,1872,True,0.137,43
5,1024,1872,False,0.128,32
6,2048,2062,True,0.107,55
7,2048,2062,False,0.106,57
8,4096,5106,True,0.086,136
9,4096,5106,False,0.083,117


## Log shift experiment
Run an experiment to see which C is better if using the function LOG(1+C)
Try, e.g., C = 0.2, 0.5, 1, 2,

In [6]:
def Experiment_2(M_values, C_values):

    trials_out = []

    d = {
        'M': [], 
        'm': [],
        'C': [], 
        'sqrt-mse': [],
        'train-time (sec.)': []
    }
    
    N = 10000
    
    for M in M_values:
        
        print()
        print(f'generating dataset (M: {M}) ... ', end='')
        X_train, y_train, X_test, y_test = generate_data_yulesimon(N=N, M=M)
        print(f'train input.shape: {X_train.shape}')
        m = X_train.shape[1]

        for C in C_values:

            # log_scale
            X_train = log_sacle(X_train, C=C)
            X_test = log_sacle(X_test, C=C)

            print(f'training (C={C}) ... ', end='')

            start_time = time.time()
            dnn_model, history, y_pred, sqrt_mse = dnn_trial(X_train, y_train, X_test, y_test)
            train_time = round(time.time() - start_time)

            # debug print
            loss = np.min(history['loss'])
            val_loss = np.min(history['val_loss'])
            print(f'sqrt-mse: {sqrt_mse:.3f}', end=', ')
            print(f'loss: {loss:.3f}', end=', ')
            print(f'val_loss: {val_loss:.3f}', end=', ')
            print(f'train-time: {round(train_time)} sec.')

            trials_out.append({
                'model': dnn_model,
                'history': history,
                'y_test': y_test,
                'y_pred': y_pred,
                'sqrt-mse': sqrt_mse
            })

            d['M'].append(M)
            d['m'].append(m)
            d['C'].append(C)
            d['sqrt-mse'].append(round(sqrt_mse, 3))
            d['train-time (sec.)'].append(f'{train_time}')

    df = pd.DataFrame(data=d)
    return trials_out, df

trials_out, df = Experiment_2(M_values=[256, 512], C_values=[0, 0.2, 0.5, 1, 2])
df


generating dataset (M: 256) ... train input.shape: (7500, 3629)
training (C=0.2) ... sqrt-mse: 0.217, loss: 0.025, val_loss: 0.055, train-time: 71 sec.
training (C=0.5) ... sqrt-mse: 0.269, loss: 0.023, val_loss: 0.080, train-time: 95 sec.
training (C=1) ... sqrt-mse: 0.271, loss: 0.030, val_loss: 0.071, train-time: 54 sec.
training (C=2) ... sqrt-mse: 0.365, loss: 0.038, val_loss: 0.137, train-time: 52 sec.

generating dataset (M: 512) ... train input.shape: (7500, 1255)
training (C=0.2) ... sqrt-mse: 0.181, loss: 0.020, val_loss: 0.040, train-time: 27 sec.
training (C=0.5) ... sqrt-mse: 0.211, loss: 0.020, val_loss: 0.049, train-time: 26 sec.
training (C=1) ... sqrt-mse: 0.227, loss: 0.021, val_loss: 0.075, train-time: 27 sec.
training (C=2) ... sqrt-mse: 0.475, loss: 0.025, val_loss: 0.237, train-time: 43 sec.


Unnamed: 0,M,m,C,sqrt-mse,train-time (sec.)
0,256,3629,0.2,0.217,71
1,256,3629,0.5,0.269,95
2,256,3629,1.0,0.271,54
3,256,3629,2.0,0.365,52
4,512,1255,0.2,0.181,27
5,512,1255,0.5,0.211,26
6,512,1255,1.0,0.227,27
7,512,1255,2.0,0.475,43


## Possion experiment
Learn Poisson in three different settings:

Use exact values.  
Use Log.  
Use P values, defined from the poisson thingy.  
P(k) = Log(MEASURED(k) * k!)

In [7]:
# TODO

## Bias experiment
Same experiment as usual, but make sure the prediction is unbiased. Average difference should be close to zero

In [8]:
# TODO