# Rough Calculation for Number of Open Parentheses in Training Data

In [33]:
import scipy.stats as stats
import numpy as np

In [20]:
def generate_zipfian_integer(n, a):
    """
    Generate integer number between 1 and n (inclusive) from a Zipf's power law distribution.

    Parameters:
    n (int): The upper limit (inclusive) for the range of integers.
    a (float): The parameter of the Zipfian distribution (a >= 0).

    Returns:
    int: An integer between 0 and n.
    """
    # Generate samples from a Zipfian distribution
    sample = stats.zipfian.rvs(a,n, size=1)

    return sample[0]

## Average number of systems in a trace

In [19]:
# mean of a zipfian distribution
a = 1.5 # parameter of zipfian distribution
n = 25 # maximum value of zipfian distribution
avg_sys_in_trace = stats.zipfian.mean(a, n) # average number of systems in a trace
print(f"average number of systems in a trace: {avg_sys_in_trace}")

average number of systems in a trace: 3.8980166787247157


## Average length of a trace segment

In [32]:
sys_in_trace = generate_zipfian_integer(25, 1.5)  # generate a random number from zipfian distribution

sys_in_trace = 1
print(f"number of systems in a trace: {sys_in_trace}")

# create a random generator
rng = np.random.default_rng()

# generate a sample from a poisson dist and name it num_cut
lam = 2*sys_in_trace

num_cut = rng.poisson(lam) # number of cuts in the trace

n_positions = 251
positions = rng.integers(0, n_positions, size=num_cut) #positions are the index of the closed paren (and start token)
if not 0 in positions:
    positions = np.append(positions, [0,251])
positions.sort() # sort the positions in ascending order

# calculate the differenc between successive positions
diffs = np.diff(positions)

# calculate the real segment lengths
real_seg_lens = diffs - 2

print(f"number of cuts: {num_cut}")
print(f"positions of cuts: {positions}")
print(f"differences between successive positions: {diffs}")
print(f"real segment lengths: {real_seg_lens}")

number of systems in a trace: 1
number of cuts: 0
positions of cuts: [  0 251]
differences between successive positions: [251]
real segment lengths: [249]


In [29]:
n_bin = 249 
min_scale = 1
max_scale = 10

randscale = np.random.rand(min_scale,max_scale) # random scale factor
print(f"random scale: {randscale}")

p_bin = 1/(randscale*avg_sys_in_trace) # parameter of binomial distribution

avg_seg_len = n_bin*p_bin # average length of a trace segment
print(f"rough average length of a trace segment: {avg_seg_len}")

random scale: [[0.23275732 0.53472905 0.2189204  0.98314944 0.39680156 0.69281146
  0.3025254  0.81832016 0.80505872 0.12645733]]
rough average length of a trace segment: [[274.44309049 119.45982561 291.78934415  64.97347848 160.98384123
   92.20205243 211.15132435  78.06069285  79.34655851 505.13988373]]


## Average # of cuts in a trace

In [7]:
avg_num_cuts = 251/avg_seg_len - 1 # average number of cuts in the trace
print(f"average number of cuts in a trace: {avg_num_cuts}")

num_traces = 2*84000 #number of traces seen during training

print(f"3 x avg number of cuts x number of traces: {3*avg_num_cuts*num_traces}")

average number of cuts in a trace: 38.29326049638167
3 x avg number of cuts x number of traces: 19299803.29017636
