In [7]:
import os
import time
import numpy as np
from scipy.stats import yulesimon
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
%%time

# --------------------------------------------------------------------------------------
# Generate the training dataset for predicting the alpha parameter of the yulesimon distribution.
# The training dataset contains 'num_alphas' files, each with 'num_rows' rows (samples) per file.
# Each row contains 'random_variate_size' samples drawn from the yulesimon distribution.
# Each file corresponds to a different value of alpha
# (all rows of the same file correspond to the same alpha).
# --------------------------------------------------------------------------------------

# --- configuration --------------------------------------------------------------------

# output directory and file-name prefix of the generated csv files
DATA_DIR = 'data'
DATA_FILE_PREFIX = 'yulesimon'

MIN_ALPHA = 2.01
MAX_ALPHA = 3.00

# number of files generated (each with a different alpha)
num_alphas = 10 # (between 2.01 and 3.00 inclusive)

# number of rows (samples) in each file
num_rows = 100

# number of RV samples per row
random_variate_size = 10000

# fix loc at zero
loc = 0

# Seed of the random stream shared by all draws below, so that re-running the
# cell regenerates the same files. Set to None to draw fresh samples each run.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

# --- generation -----------------------------------------------------------------------

# exist_ok makes the cell safe to re-run and avoids a separate existence check
os.makedirs(DATA_DIR, exist_ok=True)

for alpha in np.linspace(MIN_ALPHA, MAX_ALPHA, num=num_alphas):
    # two decimals, matching the precision used in the file name
    alpha = round(alpha, 2)

    # Draw all rows for the current alpha in a single call: the result has shape
    # (num_rows, random_variate_size). Growing the array row by row with
    # np.append would copy the whole array on every iteration.
    samples = yulesimon.rvs(
        alpha, loc=loc, size=(num_rows, random_variate_size), random_state=rng)

    # write samples to a csv file (no header row, no index column)
    data_path = os.path.join(
        DATA_DIR,
        '{}_alpha={:.2f}_loc={}_size={}.csv'.format(
            DATA_FILE_PREFIX, alpha, loc, random_variate_size))
    df = pd.DataFrame(samples)
    df.insert(0, 'alpha', alpha) # first column is the corresponding alpha
    df.to_csv(data_path, header=False, index=False)

CPU times: user 3.47 s, sys: 44 ms, total: 3.52 s
Wall time: 3.52 s
