# visualize-value-counts

Visualize the distribution of (parameter, value) occurrence counts (frequencies) on a log scale, in order to pick a cutoff below which values are considered too rarely used.

A count of 140 is a good inflection point.

In [None]:
import matplotlib.pyplot as plt
import json
import numpy as np
from collections import defaultdict
import copy

In [None]:
# The MOST important parameter: how many of the most prevalent "ALG" values
# to keep. Records whose "ALG" is not among them are skipped when the counts
# are recomputed further down.
N_ALGORITHMS = 4

In [None]:
# First pass: count how often each (key, value) pair occurs over ALL records.
# "uniqmain.json" is read line by line, one JSON object per line.
key_value_count = defaultdict(int)

# A context manager guarantees the file handle is closed once reading is done
# (the bare open() used previously left it open).
with open("uniqmain.json") as records_file:
    for line in records_file:
        record = json.loads(line)
        for (k, v) in record.items():
            key_value_count[(k, v)] += 1

# Regroup the counts per key: key -> [(count, value), ...].
key_to_count_value = {}
for (k, v), cnt in key_value_count.items():
    key_to_count_value.setdefault(k, []).append((cnt, v))

# Order each key's values from most to least frequent; equal counts are
# ordered by value, descending, because whole (count, value) tuples are compared.
for k in key_to_count_value:
    key_to_count_value[k].sort(reverse=True)

In [None]:
# Every observed "ALG" value, ordered from most to least frequent.
print([value for _count, value in key_to_count_value["ALG"]])

In [None]:
# How dominant is each algorithm? Compute each "ALG" value's share of all
# "ALG" occurrences, most prevalent first.
alg_counts = key_to_count_value["ALG"]

# Denominator: total number of "ALG" occurrences.
tot = sum(count for count, _value in alg_counts)

# NOTE: despite the name, these are fractions of the total, not percentiles.
alg_percentile = [count / tot for count, _value in alg_counts]

# Show (algorithm, share) pairs with the share rounded to three decimals.
print([(value, float(f"{count/tot:.3f}")) for count, value in alg_counts])

In [None]:
# Share of "ALG" occurrences per algorithm, by prevalence rank (0 = most common).
plt.plot(range(len(alg_percentile)), alg_percentile)
# Label the axes so the figure is readable on its own, like the later plots.
plt.xlabel("algorithm rank (most to least prevalent)")
plt.ylabel("fraction of patches")

In [None]:
# Keep only the N_ALGORITHMS most frequent algorithms; the list is already
# sorted by descending count, so a prefix slice is enough.
top_alg_entries = key_to_count_value["ALG"][:N_ALGORITHMS]
kept_algorithms = [value for _count, value in top_alg_entries]
print("kept algorithms:", kept_algorithms)

In [None]:
# Redo everything, but keep only patches with these algorithms

In [None]:
# Second pass: recount (key, value) occurrences, this time only over records
# whose "ALG" is one of the kept algorithms.
key_value_count = defaultdict(int)

# A context manager guarantees the file handle is closed once reading is done
# (the bare open() used previously left it open).
with open("uniqmain.json") as records_file:
    for line in records_file:
        record = json.loads(line)
        # .get() so that a record without an "ALG" key is skipped rather than
        # aborting the whole pass with a KeyError.
        if record.get("ALG") not in kept_algorithms:
            continue
        for (k, v) in record.items():
            key_value_count[(k, v)] += 1

# Regroup the counts per key: key -> [(count, value), ...].
key_to_count_value = {}
for (k, v), cnt in key_value_count.items():
    key_to_count_value.setdefault(k, []).append((cnt, v))

# Order each key's values from most to least frequent; equal counts are
# ordered by value, descending, because whole (count, value) tuples are compared.
for k in key_to_count_value:
    key_to_count_value[k].sort(reverse=True)

In [None]:
# All (key, value) occurrence counts in ascending order.
vals = sorted(key_value_count.values())

In [None]:
# For each percentile q of the global (key, value) count distribution, drop
# every value whose count falls below that percentile and record how many
# distinct combinations (one value per key) remain.
qs = []
kv_tot_possibles = []
for q in range(100):
    threshold = np.percentile(vals, q)
    # Built from scratch on every iteration: each key is assigned below, so
    # deep-copying the full mapping first was pure overhead.
    this_key_to_count_value = {}
    tot_possible = 1
    for k, entries in key_to_count_value.items():
        # Always keep the most frequent value so no key ends up with zero
        # options; the rest must reach the count threshold.
        this_key_to_count_value[k] = entries[:1] + [(cnt, v) for (cnt, v) in entries[1:] if cnt >= threshold]
        # Number of combinations = product of the per-key option counts.
        tot_possible *= len(this_key_to_count_value[k])
    qs.append(q)
    kv_tot_possibles.append(tot_possible)

In [None]:
# Remaining combinations versus the global (param, value) count percentile.
plt.xlabel("(param, value) percentile")
plt.ylabel("Possible patches")
plt.plot(qs, kv_tot_possibles)
# The product spans many orders of magnitude, hence the log axis.
plt.yscale("log")

In [None]:
# Same sweep as the (param, value) one, but each value is judged by its
# percentage share within its own key rather than by its raw count.
qs = []
v_tot_possibles = []
for q in range(100):
    # NOTE(review): `threshold` is a percentile of the raw (key, value) counts,
    # while the quantity compared against it below is a percentage share
    # within one key. Confirm that mixing the two is intended (comparing the
    # share against `q` directly may have been meant).
    threshold = np.percentile(vals, q)
    # Built from scratch on every iteration: each key is assigned below, so
    # deep-copying the full mapping first was pure overhead.
    this_key_to_count_value = {}
    tot_possible = 1
    for k, entries in key_to_count_value.items():
        # Total occurrences of this key, the denominator for the share.
        cnttot = sum(cnt for (cnt, v) in entries)
        # Always keep the most frequent value so no key ends up with zero options.
        this_key_to_count_value[k] = entries[:1] + [(cnt, v) for (cnt, v) in entries[1:] if (cnt/cnttot*100) >= threshold]
        # Number of combinations = product of the per-key option counts.
        tot_possible *= len(this_key_to_count_value[k])
    qs.append(q)
    v_tot_possibles.append(tot_possible)

In [None]:
# Remaining combinations versus the per-param value-share threshold.
plt.xlabel("value percentile by param")
plt.ylabel("Possible patches")
plt.plot(qs, v_tot_possibles)
# The product spans many orders of magnitude, hence the log axis.
plt.yscale("log")

In [None]:
# Overlay both pruning curves on one set of axes for direct comparison.
plt.xlabel("percentile")
plt.ylabel("Possible patches")

# Plot order matters for line colours and legend order: value threshold first.
for series, series_label in (
    (v_tot_possibles, "value threshold"),
    (kv_tot_possibles, "(param, value) threshold"),
):
    plt.plot(qs, series, label=series_label)

plt.yscale("log")
plt.legend()

# Render the figure, then reset it so later cells start from a clean canvas.
plt.show()
plt.clf()

In [None]:
# Sorted (key, value) counts against their rank, on a log y-axis.
plt.plot(range(len(vals)), vals)
plt.yscale("log")

In [None]:
# Fit a degree-2 polynomial to log(count) as a function of rank.
ranks = range(len(vals))
log_counts = np.log(np.array(vals))
coeffs = np.polyfit(ranks, log_counts, 2)

# Evaluate the fit at the original ranks and map it back to count space.
fitted_curve = np.exp(np.polyval(coeffs, ranks))

plt.yscale("log")
plt.plot(ranks, fitted_curve)

In [None]:
# First derivative of the fitted polynomial (in log-count space).
# NOTE(review): `coeffs` comes from a degree-2 fit, so this derivative is
# linear in the rank and its minimum always sits at one end of the range.
# Confirm that this is the intended way to locate an "inflection point".
derivative_coeffs = np.polyder(coeffs, m=1)

# Slope of the fitted curve at every rank.
slopes = np.polyval(derivative_coeffs, np.arange(len(vals)))

In [None]:
# Slope of the fitted log-count curve at each rank (x-axis is the rank index).
plt.plot(slopes)

In [None]:
# Inflection point %: position of the smallest fitted slope, expressed as a
# fraction (0..1) of the number of ranked (key, value) counts.
# NOTE(review): with a degree-2 fit the slope is linear, so argmin lands on
# the first or last index. Confirm this matches the intended cutoff.
np.argmin(slopes) / len(slopes)

In [None]:
# Inflection point: the occurrence count found at the rank where the fitted
# slope is smallest (vals is sorted ascending, so the rank indexes it directly).
vals[np.argmin(slopes)]