In [None]:
import json
import numpy as np
from matplotlib import pyplot as plt
from scipy.optimize import curve_fit
import os
from tqdm import tqdm
from scipy.optimize import fsolve

In [None]:
# Sweep configuration for the dense Qwen3 runs.
# Each size is substituted into the benchmark file names as "Qwen3-<size>".
model_sizes = ["0.6B", "1.7B", "4B", "8B", "14B", "32B"]
# alpha / draft values that appear in the benchmark file names
# (presumably speculative-decoding acceptance rate and draft length — TODO confirm).
alphas = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
drafts = [1, 2, 3, 4, 5, 6, 8, 10]
# Per-model tables with integer keys 0..5, presumably indexing model_sizes.
# NOTE(review): looks like parameter count (billions), hidden size and layer
# count — confirm. None of the three is read anywhere in the visible cells.
parameters = {0:0.76, 1:2, 2:4.4, 3:8, 4:14, 5:32}
h = {0:1024, 1:2048, 2:2560, 3:4096, 4:5120, 5:5120}
nlayers = {0:28, 1:28, 2:36, 3:36, 4:40, 5:64}
# Placeholder value; a later cell reassigns DATA_DIR before any file is opened.
DATA_DIR = "."

In [None]:
def getTTFT(filename):
    """Read mean time-to-first-token per run, ordered by request rate.

    ``filename`` is a JSON file whose top level is a list of run records.
    Each record must provide ``time_to_first_token_ms`` and
    ``requests_per_second``, each with a ``successful.mean`` value.

    Returns ``(rps, ttft)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        runs = json.load(handle)
    ttft = np.array([run['time_to_first_token_ms']['successful']['mean'] for run in runs])
    rate = np.array([run['requests_per_second']['successful']['mean'] for run in runs])
    # Reorder both series so the request rate is increasing.
    order = np.argsort(rate)
    return rate[order], ttft[order]

def getITL(filename):
    """Read mean inter-token latency per run, ordered by request rate.

    ``filename`` is a JSON file whose top level is a list of run records.
    Each record must provide ``inter_token_latency_ms`` and
    ``requests_per_second``, each with a ``successful.mean`` value.

    Returns ``(rps, itl)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        runs = json.load(handle)
    latency = np.array([run['inter_token_latency_ms']['successful']['mean'] for run in runs])
    rate = np.array([run['requests_per_second']['successful']['mean'] for run in runs])
    # Reorder both series so the request rate is increasing.
    order = np.argsort(rate)
    return rate[order], latency[order]

def getRequestLatency(filename):
    """Read mean end-to-end request latency per run, ordered by request rate.

    ``filename`` is a JSON file whose top level is a list of run records.
    Each record must provide ``request_latency`` and ``requests_per_second``,
    each with a ``successful.mean`` value.  The latency is returned exactly as
    stored (no unit conversion is applied here).

    Returns ``(rps, latency)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        runs = json.load(handle)
    latency = np.array([run['request_latency']['successful']['mean'] for run in runs])
    rate = np.array([run['requests_per_second']['successful']['mean'] for run in runs])
    # Reorder both series so the request rate is increasing.
    order = np.argsort(rate)
    return rate[order], latency[order]

def getTPOT(filename):
    """Read mean time-per-output-token per run, ordered by request rate.

    ``filename`` is a JSON file whose top level is a list of run records.
    Each record must provide ``time_per_output_token_ms`` and
    ``requests_per_second``, each with a ``successful.mean`` value.

    Returns ``(rps, tpot)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        runs = json.load(handle)
    tpot = np.array([run['time_per_output_token_ms']['successful']['mean'] for run in runs])
    rate = np.array([run['requests_per_second']['successful']['mean'] for run in runs])
    # Reorder both series so the request rate is increasing.
    order = np.argsort(rate)
    return rate[order], tpot[order]



In [None]:
def fit_latency_from_RPS(path, label=None, debug=False, n_drop=3):
    """Fit ``latency = c1 / (1 - c2 * RPS)`` to one request-latency sweep.

    The sweep is loaded with ``getRequestLatency(path)``.  The ``n_drop``
    samples with the largest latency are discarded before fitting; the
    remaining samples keep their ascending-RPS order.

    Parameters
    ----------
    path : str
        Benchmark JSON file to load.
    label : str, optional
        Suffix appended to the debug plot title.
    debug : bool, optional
        When true, print the retained sample indices and draw the measured
        points together with the fitted curve.
    n_drop : int, optional
        Number of highest-latency samples to discard.  Defaults to 3, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        ``[c1, c2]`` as returned by ``scipy.optimize.curve_fit``.

    Raises
    ------
    ValueError
        If ``n_drop`` is negative, or fewer than two samples remain (the
        two-parameter model cannot be fitted to fewer points).
    """
    if n_drop < 0:
        raise ValueError(f"n_drop must be non-negative, got {n_drop}")

    rps_all, latency_all = getRequestLatency(path)
    n_keep = len(latency_all) - n_drop
    if n_keep < 2:
        raise ValueError(
            f"{path}: {len(latency_all)} samples, dropping {n_drop} leaves "
            f"{max(n_keep, 0)}; at least 2 are needed to fit"
        )

    # Indices of the n_keep smallest latencies, put back into RPS order.
    # (Slicing with an explicit length also handles n_drop == 0, where
    # ``[:-0]`` would wrongly select nothing.)
    keep = np.sort(np.argsort(latency_all)[:n_keep])
    if debug:
        print(keep)
    baseline_rps = rps_all[keep]
    baseline_latency = latency_all[keep]

    def model(x, c1, c2):
        return c1 / (1 - c2 * x)

    # Initial guess: 1/latency = 1/c1 - (c2/c1) * rps is linear in rps, so an
    # ordinary least-squares line through the reciprocals gives c1 and c2.
    # Zero latencies are skipped for this step only, since they cannot be inverted.
    mask = baseline_latency != 0
    A = np.vstack([np.ones_like(baseline_rps[mask]), baseline_rps[mask]]).T
    z = 1.0 / baseline_latency[mask]
    a, b = np.linalg.lstsq(A, z, rcond=None)[0]   # z ≈ a + b x
    c1_0 = 1.0 / a
    c2_0 = -b / a

    popt, _ = curve_fit(model, baseline_rps, baseline_latency, p0=[c1_0, c2_0], maxfev=10000)

    if debug:
        # The plotted quantity is request latency, so the labels say so.
        plt.scatter(baseline_rps, baseline_latency, label='Measured latency', color='blue')
        plt.xlabel("RPS")
        plt.ylabel("Request latency")
        if label is not None:
            plt.title(f"Modeled vs Measured request latency, {label}")
        else:
            plt.title("Modeled vs Measured request latency")
        plt.plot(baseline_rps, model(baseline_rps, *popt), label='Modeled latency', color='red')
        plt.legend()

        # Annotate the figure with the fitted formula.
        plt.text(0.05, 0.95, f'latency = {popt[0]:.2f} / (1 - {popt[1]:.2f} * RPS)',
                transform=plt.gca().transAxes, verticalalignment='top',
                color='black')

        plt.show()
    return popt
    

In [None]:
def fit_ITL_from_RPS(path, label=None, debug=False, n_drop=2):
    """Fit ``ITL = c1 / (1 - c2 * RPS)`` to one inter-token-latency sweep.

    The sweep is loaded with ``getITL(path)``.  The ``n_drop`` samples with
    the largest ITL are discarded before fitting; the remaining samples keep
    their ascending-RPS order.

    Parameters
    ----------
    path : str
        Benchmark JSON file to load.
    label : str, optional
        Suffix appended to the debug plot title.
    debug : bool, optional
        When true, print the retained sample indices and draw the measured
        points together with the fitted curve.
    n_drop : int, optional
        Number of highest-ITL samples to discard.  Defaults to 2, the value
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        ``[c1, c2]`` as returned by ``scipy.optimize.curve_fit``.

    Raises
    ------
    ValueError
        If ``n_drop`` is negative, or fewer than two samples remain (the
        two-parameter model cannot be fitted to fewer points).
    """
    if n_drop < 0:
        raise ValueError(f"n_drop must be non-negative, got {n_drop}")

    rps_all, itl_all = getITL(path)
    n_keep = len(itl_all) - n_drop
    if n_keep < 2:
        raise ValueError(
            f"{path}: {len(itl_all)} samples, dropping {n_drop} leaves "
            f"{max(n_keep, 0)}; at least 2 are needed to fit"
        )

    # Indices of the n_keep smallest ITL values, put back into RPS order.
    # (Slicing with an explicit length also handles n_drop == 0, where
    # ``[:-0]`` would wrongly select nothing.)
    keep = np.sort(np.argsort(itl_all)[:n_keep])
    if debug:
        print(keep)
    baseline_rps = rps_all[keep]
    baseline_itl = itl_all[keep]

    def model(x, c1, c2):
        return c1 / (1 - c2 * x)

    # Initial guess: 1/ITL = 1/c1 - (c2/c1) * rps is linear in rps, so an
    # ordinary least-squares line through the reciprocals gives c1 and c2.
    # Zero ITL values are skipped for this step only, since they cannot be inverted.
    mask = baseline_itl != 0
    A = np.vstack([np.ones_like(baseline_rps[mask]), baseline_rps[mask]]).T
    z = 1.0 / baseline_itl[mask]
    a, b = np.linalg.lstsq(A, z, rcond=None)[0]   # z ≈ a + b x
    c1_0 = 1.0 / a
    c2_0 = -b / a

    popt, _ = curve_fit(model, baseline_rps, baseline_itl, p0=[c1_0, c2_0], maxfev=10000)

    if debug:
        plt.scatter(baseline_rps, baseline_itl, label='Measured ITL', color='blue')
        plt.xlabel("RPS")
        plt.ylabel("ITL")
        if label is not None:
            plt.title(f"Modeled vs Measured ITL, {label}")
        else:
            plt.title("Modeled vs Measured ITL")
        plt.plot(baseline_rps, model(baseline_rps, *popt), label='Modeled ITL', color='red')
        plt.legend()

        # Annotate the figure with the fitted formula.
        plt.text(0.05, 0.95, f'ITL = {popt[0]:.2f} / (1 - {popt[1]:.2f} * RPS)',
                transform=plt.gca().transAxes, verticalalignment='top',
                color='black')

        plt.show()
    return popt
    

In [None]:
# Directory searched by the following sweep cells for "..._reduced.json" benchmark files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_DIR = "/proving-grounds/machine/linghao/speculators/throughput-reduced/"


In [None]:
# Scatter colour used for each dense model size (keys match the entries of model_sizes).
color_mapping = {
    "0.6B": "red",
    "1.7B": "blue",
    "4B": "green",
    "8B": "yellow",
    "14B": "purple",
    "32B": "orange"
}

In [None]:
# For every dense model: fit each (alpha, draft) latency sweep, then overlay the
# normalised points (x = rps * c2, y = latency / c1) on the curve y = 1 / (1 - x).
# One figure is shown per model size.
Qwen_c1, Qwen_c2 = [], []

for model_size in tqdm(model_sizes):

    # Fitted coefficients per (alpha, draft) cell; NaN means "not filled in".
    c1 = np.full((len(alphas), len(drafts)), np.nan)
    c2 = np.full((len(alphas), len(drafts)), np.nan)

    for i, alpha in enumerate(alphas):
        for j, draft in enumerate(drafts):
            path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_prompt-512_output-128_concurrency-128_reduced.json"
            latency_scale, rate_scale = fit_latency_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}")
            c1[i, j] = latency_scale
            c2[i, j] = rate_scale

            # Reload the sweep and discard its two largest latencies while
            # keeping the remaining points in ascending-RPS order.
            # NOTE(review): fit_latency_from_RPS trims internally with its own,
            # different count — confirm the mismatch is intended.
            rps, latency = getRequestLatency(path)
            keep = np.sort(np.argsort(latency)[:-2])
            plt.scatter(
                rps[keep] * rate_scale,
                latency[keep] / latency_scale,
                color=color_mapping[model_size],
                alpha=0.2,
                s=3,
            )

    # Reference curve that perfectly fitted points would lie on.
    ref_x = np.linspace(0, 0.9, 100)
    plt.plot(ref_x, 1 / (1 - ref_x), color='black')
    plt.title(f"Modeled vs Measured ITL, {model_size}")
    plt.show()

    Qwen_c1.append(c1)
    Qwen_c2.append(c2)



In [None]:
# Same normalised scatter as the per-model cell, but all dense models are drawn
# into the current figure; nothing is shown here, so later plotting statements
# in this cell add to the same axes.
Qwen_c1, Qwen_c2 = [], []

for model_size in tqdm(model_sizes):

    # Fitted coefficients per (alpha, draft) cell; NaN means "not filled in".
    c1 = np.full((len(alphas), len(drafts)), np.nan)
    c2 = np.full((len(alphas), len(drafts)), np.nan)

    for i, alpha in enumerate(alphas):
        for j, draft in enumerate(drafts):
            path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_prompt-512_output-128_concurrency-128_reduced.json"
            latency_scale, rate_scale = fit_latency_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}")
            c1[i, j] = latency_scale
            c2[i, j] = rate_scale

            # Discard the two largest latencies; the rest stay in RPS order.
            rps, latency = getRequestLatency(path)
            keep = np.sort(np.argsort(latency)[:-2])
            plt.scatter(
                rps[keep] * rate_scale,
                latency[keep] / latency_scale,
                color=color_mapping[model_size],
                alpha=0.2,
                s=3,
            )

    Qwen_c1.append(c1)
    Qwen_c2.append(c2)















def getTTFT(filename):
    """Read mean time-to-first-token per benchmark, ordered by request rate.

    ``filename`` is a JSON file with a top-level ``benchmarks`` list; every
    entry carries a ``metrics`` dict containing ``time_to_first_token_ms`` and
    ``requests_per_second``, each with a ``successful.mean`` value.

    Returns ``(rps, ttft)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    metrics = [bench['metrics'] for bench in report['benchmarks']]
    ttft = np.array([m['time_to_first_token_ms']['successful']['mean'] for m in metrics])
    rate = np.array([m['requests_per_second']['successful']['mean'] for m in metrics])
    order = np.argsort(rate)
    return rate[order], ttft[order]

def getITL(filename):
    """Read mean inter-token latency per benchmark, ordered by request rate.

    ``filename`` is a JSON file with a top-level ``benchmarks`` list; every
    entry carries a ``metrics`` dict containing ``inter_token_latency_ms`` and
    ``requests_per_second``, each with a ``successful.mean`` value.

    Returns ``(rps, itl)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    metrics = [bench['metrics'] for bench in report['benchmarks']]
    latency = np.array([m['inter_token_latency_ms']['successful']['mean'] for m in metrics])
    rate = np.array([m['requests_per_second']['successful']['mean'] for m in metrics])
    order = np.argsort(rate)
    return rate[order], latency[order]

def getRequestLatency(filename):
    """Read mean request latency per benchmark, ordered by request rate.

    ``filename`` is a JSON file with a top-level ``benchmarks`` list; every
    entry carries a ``metrics`` dict containing ``request_latency`` and
    ``requests_per_second``, each with a ``successful.mean`` value.  The stored
    latency is multiplied by 1000 (presumably seconds to milliseconds —
    TODO confirm the source unit).

    Returns ``(rps, latency)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    metrics = [bench['metrics'] for bench in report['benchmarks']]
    latency = np.array([m['request_latency']['successful']['mean'] * 1000 for m in metrics])
    rate = np.array([m['requests_per_second']['successful']['mean'] for m in metrics])
    order = np.argsort(rate)
    return rate[order], latency[order]

def getTPOT(filename):
    """Read mean time-per-output-token per benchmark, ordered by request rate.

    ``filename`` is a JSON file with a top-level ``benchmarks`` list; every
    entry carries a ``metrics`` dict containing ``time_per_output_token_ms``
    and ``requests_per_second``, each with a ``successful.mean`` value.

    Returns ``(rps, tpot)`` as NumPy arrays sorted by ascending ``rps``.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    metrics = [bench['metrics'] for bench in report['benchmarks']]
    tpot = np.array([m['time_per_output_token_ms']['successful']['mean'] for m in metrics])
    rate = np.array([m['requests_per_second']['successful']['mean'] for m in metrics])
    order = np.argsort(rate)
    return rate[order], tpot[order]

# --- Configuration for the mixture-of-experts model ------------------------
model_sizes = ["30B-A3B"]
alphas = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
drafts = [1, 2, 3, 4, 5, 6, 8, 10]
# Tables carried over from the dense-model configuration; not read in this cell.
parameters = {0:0.76, 1:2, 2:4.4, 3:8, 4:14, 5:32}
h = {0:1024, 1:2048, 2:2560, 3:4096, 4:5120, 5:5120}
nlayers = {0:28, 1:28, 2:36, 3:36, 4:40, 5:64}
DATA_DIR = "."
# The placeholder above is overridden straight away by the real data location.
DATA_DIR = "/proving-grounds/machine/linghao/speculators/throughput"

# --- Fit and overlay the experts-8 sweeps ----------------------------------
# Points are drawn as black crosses into the current figure; sweeps whose file
# is missing are skipped and leave NaN in the coefficient grids.
Qwen_c1, Qwen_c2 = [], []

for model_size in tqdm(model_sizes):

    c1 = np.full((len(alphas), len(drafts)), np.nan)
    c2 = np.full((len(alphas), len(drafts)), np.nan)

    for i, alpha in enumerate(alphas):
        for j, draft in enumerate(drafts):
            path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_experts-8.json"
            if not os.path.exists(path):
                continue

            latency_scale, rate_scale = fit_latency_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}")
            c1[i, j] = latency_scale
            c2[i, j] = rate_scale

            # Discard the two largest latencies; the rest stay in RPS order.
            rps, latency = getRequestLatency(path)
            keep = np.sort(np.argsort(latency)[:-2])
            plt.scatter(
                rps[keep] * rate_scale,
                latency[keep] / latency_scale,
                color='black',
                alpha=1,
                s=3,
                marker='x',
            )

    Qwen_c1.append(c1)
    Qwen_c2.append(c2)

# --- Reference curve y = 1 / (1 - x) and final display ----------------------
x = np.linspace(0, 0.9, 100)
y = 1 / (1 - x)
plt.plot(x, y, color = 'black')
plt.show()


In [None]:
def fit_latency_from_RPS(path, label=None, debug=False, n_drop=1):
    """Fit ``latency = c1 / (1 - c2 * RPS)`` to one request-latency sweep.

    The sweep is loaded with ``getRequestLatency(path)``.  The ``n_drop``
    samples with the largest latency are discarded before fitting; the
    remaining samples keep their ascending-RPS order.

    Parameters
    ----------
    path : str
        Benchmark JSON file to load.
    label : str, optional
        Suffix appended to the debug plot title.
    debug : bool, optional
        When true, print the retained sample indices and draw the measured
        points together with the fitted curve.
    n_drop : int, optional
        Number of highest-latency samples to discard.  Defaults to 1, the
        value that was previously hard-coded in this definition.

    Returns
    -------
    numpy.ndarray
        ``[c1, c2]`` as returned by ``scipy.optimize.curve_fit``.

    Raises
    ------
    ValueError
        If ``n_drop`` is negative, or fewer than two samples remain (the
        two-parameter model cannot be fitted to fewer points).
    """
    if n_drop < 0:
        raise ValueError(f"n_drop must be non-negative, got {n_drop}")

    rps_all, latency_all = getRequestLatency(path)
    n_keep = len(latency_all) - n_drop
    if n_keep < 2:
        raise ValueError(
            f"{path}: {len(latency_all)} samples, dropping {n_drop} leaves "
            f"{max(n_keep, 0)}; at least 2 are needed to fit"
        )

    # Indices of the n_keep smallest latencies, put back into RPS order.
    # (Slicing with an explicit length also handles n_drop == 0, where
    # ``[:-0]`` would wrongly select nothing.)
    keep = np.sort(np.argsort(latency_all)[:n_keep])
    if debug:
        print(keep)
    baseline_rps = rps_all[keep]
    baseline_latency = latency_all[keep]

    def model(x, c1, c2):
        return c1 / (1 - c2 * x)

    # Initial guess: 1/latency = 1/c1 - (c2/c1) * rps is linear in rps, so an
    # ordinary least-squares line through the reciprocals gives c1 and c2.
    # Zero latencies are skipped for this step only, since they cannot be inverted.
    mask = baseline_latency != 0
    A = np.vstack([np.ones_like(baseline_rps[mask]), baseline_rps[mask]]).T
    z = 1.0 / baseline_latency[mask]
    a, b = np.linalg.lstsq(A, z, rcond=None)[0]   # z ≈ a + b x
    c1_0 = 1.0 / a
    c2_0 = -b / a

    popt, _ = curve_fit(model, baseline_rps, baseline_latency, p0=[c1_0, c2_0], maxfev=10000)

    if debug:
        # The plotted quantity is request latency, so the labels say so.
        plt.scatter(baseline_rps, baseline_latency, label='Measured latency', color='blue')
        plt.xlabel("RPS")
        plt.ylabel("Request latency")
        if label is not None:
            plt.title(f"Modeled vs Measured request latency, {label}")
        else:
            plt.title("Modeled vs Measured request latency")
        plt.plot(baseline_rps, model(baseline_rps, *popt), label='Modeled latency', color='red')
        plt.legend()

        # Annotate the figure with the fitted formula.
        plt.text(0.05, 0.95, f'latency = {popt[0]:.2f} / (1 - {popt[1]:.2f} * RPS)',
                transform=plt.gca().transAxes, verticalalignment='top',
                color='black')

        plt.show()
    return popt
    

In [None]:
# Visual check of the experts-32 fits: with debug=True the fit routine itself
# draws one measured-vs-model figure for every sweep file that exists.
for model_size in tqdm(model_sizes):

    # Fitted coefficients per (alpha, draft) cell; NaN where no file was found.
    c1 = np.full((len(alphas), len(drafts)), np.nan)
    c2 = np.full((len(alphas), len(drafts)), np.nan)

    for i, alpha in enumerate(alphas):
        for j, draft in enumerate(drafts):
            path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_experts-32.json"
            if not os.path.exists(path):
                continue
            c1[i, j], c2[i, j] = fit_latency_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}", debug = True)


In [None]:
def getTTFT(filename):
    """Return ``(rps, ttft)`` arrays for one benchmark report, sorted by ``rps``.

    The report is a JSON object with a ``benchmarks`` list.  For each entry the
    ``successful.mean`` of ``metrics.time_to_first_token_ms`` and of
    ``metrics.requests_per_second`` is collected.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    ttft_values = []
    rate_values = []
    for bench in report['benchmarks']:
        stats = bench['metrics']
        ttft_values.append(stats['time_to_first_token_ms']['successful']['mean'])
        rate_values.append(stats['requests_per_second']['successful']['mean'])
    ttft_values = np.array(ttft_values)
    rate_values = np.array(rate_values)
    by_rate = np.argsort(rate_values)
    return rate_values[by_rate], ttft_values[by_rate]

def getITL(filename):
    """Return ``(rps, itl)`` arrays for one benchmark report, sorted by ``rps``.

    The report is a JSON object with a ``benchmarks`` list.  For each entry the
    ``successful.mean`` of ``metrics.inter_token_latency_ms`` and of
    ``metrics.requests_per_second`` is collected.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    itl_values = []
    rate_values = []
    for bench in report['benchmarks']:
        stats = bench['metrics']
        itl_values.append(stats['inter_token_latency_ms']['successful']['mean'])
        rate_values.append(stats['requests_per_second']['successful']['mean'])
    itl_values = np.array(itl_values)
    rate_values = np.array(rate_values)
    by_rate = np.argsort(rate_values)
    return rate_values[by_rate], itl_values[by_rate]

def getRequestLatency(filename):
    """Return ``(rps, latency)`` arrays for one benchmark report, sorted by ``rps``.

    The report is a JSON object with a ``benchmarks`` list.  For each entry the
    ``successful.mean`` of ``metrics.request_latency`` (scaled by 1000;
    presumably seconds to milliseconds — TODO confirm) and of
    ``metrics.requests_per_second`` is collected.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    latency_values = []
    rate_values = []
    for bench in report['benchmarks']:
        stats = bench['metrics']
        latency_values.append(stats['request_latency']['successful']['mean'] * 1000)
        rate_values.append(stats['requests_per_second']['successful']['mean'])
    latency_values = np.array(latency_values)
    rate_values = np.array(rate_values)
    by_rate = np.argsort(rate_values)
    return rate_values[by_rate], latency_values[by_rate]

def getTPOT(filename):
    """Return ``(rps, tpot)`` arrays for one benchmark report, sorted by ``rps``.

    The report is a JSON object with a ``benchmarks`` list.  For each entry the
    ``successful.mean`` of ``metrics.time_per_output_token_ms`` and of
    ``metrics.requests_per_second`` is collected.
    """
    with open(filename, 'r') as handle:
        report = json.load(handle)
    tpot_values = []
    rate_values = []
    for bench in report['benchmarks']:
        stats = bench['metrics']
        tpot_values.append(stats['time_per_output_token_ms']['successful']['mean'])
        rate_values.append(stats['requests_per_second']['successful']['mean'])
    tpot_values = np.array(tpot_values)
    rate_values = np.array(rate_values)
    by_rate = np.argsort(rate_values)
    return rate_values[by_rate], tpot_values[by_rate]



In [None]:
# Sweep configuration for the mixture-of-experts model; the single size string
# is substituted into the benchmark file names as "Qwen3-<size>".
model_sizes = ["30B-A3B"]
# alpha / draft values that appear in the benchmark file names
# (presumably speculative-decoding acceptance rate and draft length — TODO confirm).
alphas = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
drafts = [1, 2, 3, 4, 5, 6, 8, 10]
# NOTE(review): these three tables still hold six entries although model_sizes
# now has one; they are not read in the visible cells — confirm they are needed.
parameters = {0:0.76, 1:2, 2:4.4, 3:8, 4:14, 5:32}
h = {0:1024, 1:2048, 2:2560, 3:4096, 4:5120, 5:5120}
nlayers = {0:28, 1:28, 2:36, 3:36, 4:40, 5:64}
# Placeholder value; the next cell reassigns DATA_DIR.
DATA_DIR = "."

In [None]:
# Directory searched by the following cells for "..._experts-<N>.json" benchmark files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
DATA_DIR = "/proving-grounds/machine/linghao/speculators/throughput"

In [None]:
# Overlay the normalised sweeps for 8, 16 and 32 experts in one figure, one
# colour per expert count.  Qwen_c1 / Qwen_c2 are reset for every expert count,
# so after this cell they hold the experts-32 coefficients only.
# NOTE(review): the coefficients come from the request-latency fit but are
# applied to ITL data, and the axes are (itl / c1, rps * c2), i.e. transposed
# relative to the scatter cells — confirm this is intended.
for n_experts, curve_color in ((8, 'red'), (16, 'blue'), (32, 'green')):
    Qwen_c1, Qwen_c2 = [], []

    for model_size in tqdm(model_sizes):

        # Fitted coefficients per (alpha, draft) cell; NaN where no file was found.
        c1 = np.full((len(alphas), len(drafts)), np.nan)
        c2 = np.full((len(alphas), len(drafts)), np.nan)

        for i, alpha in enumerate(alphas):
            for j, draft in enumerate(drafts):
                path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_experts-{n_experts}.json"
                if not os.path.exists(path):
                    continue

                latency_scale, rate_scale = fit_latency_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}")
                c1[i, j] = latency_scale
                c2[i, j] = rate_scale

                # Discard the two largest ITL samples; the rest stay in RPS order.
                rps, itl = getITL(path)
                keep = np.sort(np.argsort(itl)[:-2])
                plt.plot(itl[keep] / latency_scale, rps[keep] * rate_scale, color=curve_color, alpha=0.2)

        Qwen_c1.append(c1)
        Qwen_c2.append(c2)

In [None]:
# Experts-32 sweeps only, drawn with matplotlib's default colour cycle.
# NOTE(review): coefficients come from the request-latency fit but are applied
# to ITL data — confirm this is intended.
Qwen_c1, Qwen_c2 = [], []

for model_size in tqdm(model_sizes):

    # Fitted coefficients per (alpha, draft) cell; NaN where no file was found.
    c1 = np.full((len(alphas), len(drafts)), np.nan)
    c2 = np.full((len(alphas), len(drafts)), np.nan)

    for i, alpha in enumerate(alphas):
        for j, draft in enumerate(drafts):
            path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_experts-32.json"
            if not os.path.exists(path):
                continue

            latency_scale, rate_scale = fit_latency_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}")
            c1[i, j] = latency_scale
            c2[i, j] = rate_scale

            # Discard the two largest ITL samples; the rest stay in RPS order.
            rps, itl = getITL(path)
            keep = np.sort(np.argsort(itl)[:-2])
            plt.plot(itl[keep] / latency_scale, rps[keep] * rate_scale)

    Qwen_c1.append(c1)
    Qwen_c2.append(c2)

In [None]:
# Same overlay for the "reduced" sweep files.  There is no existence check in
# this cell, so every (alpha, draft) file must be present under DATA_DIR.
# NOTE(review): coefficients come from the request-latency fit but are applied
# to ITL data — confirm this is intended.
Qwen_c1, Qwen_c2 = [], []

for model_size in tqdm(model_sizes):

    # Fitted coefficients per (alpha, draft) cell.
    c1 = np.full((len(alphas), len(drafts)), np.nan)
    c2 = np.full((len(alphas), len(drafts)), np.nan)

    for i, alpha in enumerate(alphas):
        for j, draft in enumerate(drafts):
            path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_prompt-512_output-128_concurrency-128_reduced.json"
            latency_scale, rate_scale = fit_latency_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}")
            c1[i, j] = latency_scale
            c2[i, j] = rate_scale

            # Discard the two largest ITL samples; the rest stay in RPS order.
            rps, itl = getITL(path)
            keep = np.sort(np.argsort(itl)[:-2])
            plt.plot(itl[keep] / latency_scale, rps[keep] * rate_scale)

    Qwen_c1.append(c1)
    Qwen_c2.append(c2)

In [None]:
# Collect the ITL-fit coefficients for every (alpha, draft) "reduced" sweep.
# Nothing is plotted; one c1 grid and one c2 grid are stored per model size.
Qwen_c1, Qwen_c2 = [], []
for model_size in tqdm(model_sizes):

    c1 = np.full((len(alphas), len(drafts)), np.nan)
    c2 = np.full((len(alphas), len(drafts)), np.nan)

    for i, alpha in enumerate(alphas):
        for j, draft in enumerate(drafts):
            path = f"{DATA_DIR}/output_Qwen3-{model_size}_FS_alpha-{alpha:.2f}_draft-{draft}_prompt-512_output-128_concurrency-128_reduced.json"
            fit = fit_ITL_from_RPS(path, f"Alpha: {alpha}, Draft: {draft}")
            c1[i, j], c2[i, j] = fit[0], fit[1]

    Qwen_c1.append(c1)
    Qwen_c2.append(c2)

In [None]:
# Display the c1 grid left by the most recent sweep loop
# (rows follow alphas, columns follow drafts).
c1