# Imports

In [1]:
import numpy as np
from scipy import stats
import random

import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

# import library
%run lib.ipynb

# Generate data (yulesimon)

In [2]:
from scipy.stats import yulesimon

def sample_yulesimon(alpha, size):
    """Draw random variates from a Yule-Simon distribution.

    Args:
        alpha: shape parameter passed to `scipy.stats.yulesimon`.
        size: number (or shape) of variates to draw.

    Returns:
        The array of samples produced by `yulesimon.rvs`.
    """
    draws = yulesimon.rvs(alpha, size=size, loc=0)
    return draws

def next_alpha(min_alpha=2.0, max_alpha=3.0):
    """Pick a random alpha uniformly from [min_alpha, max_alpha).

    Args:
        min_alpha: lower bound of the sampling interval (inclusive).
        max_alpha: upper bound of the sampling interval (exclusive).

    Returns:
        A single value drawn with `np.random.uniform`.
    """
    # Draw a one-element array and unwrap it to a scalar.
    (value,) = np.random.uniform(low=min_alpha, high=max_alpha, size=1)
    return value

# generate data (yulesimon)
def generate_data_yulesimon(N, M, density=False):
    """Generate a Yule-Simon dataset and split it into train and test parts.

    Args:
        N: forwarded to `generate_data`.
        M: forwarded to `generate_data`.
        density: accepted but not used by this function.

    Returns:
        (H_train, y_train, H_test, y_test) -- note that the order differs
        from the order in which `train_test_split` returns its parts.
    """
    # `generate_data` is defined outside this cell (presumably brought in by
    # `%run lib.ipynb` -- confirm); it returns the inputs and their alphas.
    inputs, targets = generate_data(
        N=N,
        M=M,
        nextConfig=next_alpha,
        sample=sample_yulesimon,
    )

    # Split train/test with train_test_split so that the shape of the
    # train/test data will be the same; a quarter is held out for testing.
    parts = train_test_split(inputs, targets, test_size=0.25)
    inputs_train, inputs_test, targets_train, targets_test = parts

    return inputs_train, targets_train, inputs_test, targets_test

# Experiments

## Log Experiment
Run an experiment to show that applying the log function improves something: accuracy, learning rate, or both.
Use a fixed setting, e.g., Yule-Simon (YS) data with samples = 256 or 512.
In this experiment, use the LOG(1+X) function.

In [5]:
def normalize(H):
    """Scale every row of `H` so that its values sum to 1.

    Args:
        H: array whose rows (axis 1) are normalized.

    Returns:
        `H` divided row-wise by its row sums.
    """
    # keepdims keeps the sums as a column so they broadcast across each row.
    row_totals = H.sum(axis=1, keepdims=True)
    return H / row_totals

def log_sacle(H, shift=1):
    """Log-scale the values of `H` as log10(H + shift).

    The values are shifted before taking the logarithm so as not to take the
    log of zero.

    Args:
        H: array-like of values to scale.
        shift: constant added to every value before the log (default 1,
            i.e. log10(1 + H)).

    Returns:
        A numpy array with the same shape as `H`.
    """
    # log10 is element-wise, so one vectorized call replaces the per-row
    # Python loop that np.apply_along_axis would run.
    return np.log10(np.asarray(H) + shift)

def Experiment_1(M_values, N=10000):
    """Compare DNN training on log-scaled inputs against raw inputs.

    For every M in `M_values` one dataset is generated with
    `generate_data_yulesimon(N=N, M=M)` and `dnn_trial` is run twice on it:
    once on log-scaled inputs and once on the unscaled inputs.

    Args:
        M_values: iterable of M values, each forwarded to
            `generate_data_yulesimon`.
        N: forwarded to `generate_data_yulesimon`.

    Returns:
        (trials_out, df): `trials_out` is a list with one dict per trial
        (keys 'model', 'history', 'y_test', 'y_pred', 'sqrt-mse'); `df` is a
        pandas DataFrame with one row per trial and the columns 'M', 'm',
        'log-scale', 'sqrt-mse' and 'train-time (sec.)'.
    """
    # Imported locally so the function does not rely on `time` having been
    # brought into the notebook namespace by some other cell.
    import time

    trials_out = []

    d = {
        'M': [],
        'm': [],
        'log-scale': [],
        'sqrt-mse': [],
        'train-time (sec.)': []
    }

    for M in M_values:

        print()
        print(f'generating dataset (M: {M}) ... ', end='')
        X_train, y_train, X_test, y_test = generate_data_yulesimon(N=N, M=M)
        print(f'train input.shape: {X_train.shape}')
        m = X_train.shape[1]

        # Only the non-normalized setting is run; use [True, False] to also
        # run the experiment on density-normalized inputs.
        for density in [False]:

            # Derive the inputs of each setting from the generated data
            # instead of overwriting X_train / X_test, so that one setting
            # never leaks its transformation into the next one.
            if density:
                X_train_base = normalize(X_train)
                X_test_base = normalize(X_test)
            else:
                X_train_base = X_train
                X_test_base = X_test

            for log_scale in [True, False]:

                if log_scale:
                    X_train_trial = log_sacle(X_train_base)
                    X_test_trial = log_sacle(X_test_base)
                else:
                    # Must be the unscaled data: this is the baseline the
                    # log-scaled trial is compared against.
                    X_train_trial = X_train_base
                    X_test_trial = X_test_base

                print(f'training - log-scale: {log_scale}', end=' ... ')

                start_time = time.time()
                dnn_model, history, y_pred, sqrt_mse = dnn_trial(
                    X_train_trial, y_train, X_test_trial, y_test)
                train_time = round(time.time() - start_time)

                # debug print
                loss = np.min(history['loss'])
                val_loss = np.min(history['val_loss'])
                print(f'sqrt-mse: {sqrt_mse:.3f}', end=', ')
                print(f'loss: {loss:.3f}', end=', ')
                print(f'val_loss: {val_loss:.3f}', end=', ')
                print(f'train-time: {train_time} sec.')

                trials_out.append({
                    'model': dnn_model,
                    'history': history,
                    'y_test': y_test,
                    'y_pred': y_pred,
                    'sqrt-mse': sqrt_mse
                })

                d['M'].append(M)
                d['m'].append(m)
                d['log-scale'].append(log_scale)
                d['sqrt-mse'].append(round(sqrt_mse, 3))
                # Kept as a string, as the table has always stored it.
                d['train-time (sec.)'].append(f'{train_time}')

    df = pd.DataFrame(data=d)

    return trials_out, df

In [None]:
# Run the log-scale experiment for M = 256, 512, ..., 16384 (2**8 .. 2**14)
# and display the summary table.
trials_out, df = Experiment_1(M_values=[2 ** exponent for exponent in range(8, 15)])
df

## Log shift experiment
Run an experiment to see which shift C works best when using the function LOG(C+X).
Try, e.g., C = 0.2, 0.5, 1, 2.

In [None]:
# TODO

## Poisson experiment
Learn Poisson in three different settings:

Use exact values.  
Use Log.  
Use P values, derived from the Poisson probability mass function:  
P(k) = Log(MEASURED(k) * k!)

In [None]:
# TODO

## Bias experiment
Same experiment as usual, but make sure the prediction is unbiased: the average difference between the predicted and the true values should be close to zero.

In [None]:
# TODO