In [None]:
import pandas

from datetime import datetime, timedelta
from matplotlib import pyplot

In [None]:
# Load the measurement CSV into `df` and show it.
df = pandas.read_csv(filepath_or_buffer="scaling_network_bound_3.csv")
df

In [None]:
# For every value of `partitions`, report how many distinct `uuid` groups it contains.
for npartitions, partition_rows in df.groupby("partitions"):
    nlambdas = partition_rows.groupby("uuid").ngroups
    print(f"npartitions: {npartitions}, nlambdas: {nlambdas}")

In [None]:
# Runtime (in seconds) that each configuration is compared against for the speedup.
# NOTE(review): presumably the runtime of a reference run; confirm where 777.818 comes from.
BASELINE_RUNTIME_SECONDS = 777.818

# Split the measurements by partition count and report runtime and speedup for each.
dflambdas = {}
for label, curdf in df.groupby("partitions"):
    dflambdas[label] = curdf
    # Span from the earliest startTime to the latest endTime, divided by 1000 to get seconds.
    runtime = (curdf.endTime.max() - curdf.startTime.min()) / 1000
    speedup = 1 / (runtime / BASELINE_RUNTIME_SECONDS)
    print(f"npartitions: {label}, Runtime: {runtime} seconds, speedup: {speedup}")

# Later cells read these two names directly; define them here so the notebook
# runs top to bottom on a fresh kernel instead of relying on leftover kernel state.
df_8lambdas = dflambdas[8]
df_128lambdas = dflambdas[128]

In [None]:
# Overall time window of the 8-lambda data: raw timestamps are divided by 1000
# before being converted to datetime objects.
start_timestamp, end_timestamp = (
    df_8lambdas["startTime"].min() / 1000,
    df_8lambdas["endTime"].max() / 1000,
)
start_datetime, end_datetime = map(datetime.fromtimestamp, (start_timestamp, end_timestamp))

print(start_datetime, end_datetime, sep="\n")
total_runtime = round(end_timestamp - start_timestamp, 2)
print(f"Total runtime: {total_runtime} seconds")

In [None]:
pyplot.figure(figsize=(10, 10))
# One line per uuid; adding the enumeration index shifts every line up by one
# so the individual uuids do not overlap.
for offset, (lambda_uuid, lambda_rows) in enumerate(df_8lambdas.groupby("uuid")):
    rows_per_start = lambda_rows.groupby("startTime")["measurementID"].count()
    pyplot.plot(rows_per_start + offset)
pyplot.xlabel("Timestamp")
pyplot.ylabel("Lambda invokation")

In [None]:
# Earliest startTime per uuid, divided by 1000 to get seconds.
minstartdf = df_8lambdas.groupby("uuid")[["startTime"]].min().reset_index()
minstartdf["startTime"] = minstartdf["startTime"].div(1000)

# Plot every start as an offset from the earliest start.
start_offsets = minstartdf["startTime"] - minstartdf["startTime"].min()
pyplot.figure(figsize=(10, 10))
pyplot.plot(minstartdf.index, start_offsets)

In [None]:
# Latest endTime per uuid, divided by 1000 to get seconds.
maxenddf = df_8lambdas.groupby("uuid")[["endTime"]].max().reset_index()
maxenddf["endTime"] = maxenddf["endTime"].div(1000)

# Plot every end as an offset from the earliest end.
end_offsets = maxenddf["endTime"] - maxenddf["endTime"].min()
pyplot.figure(figsize=(12, 12))
pyplot.title("Ending time of each lambda relative to the first one")
pyplot.xlabel("Lambda index")
pyplot.ylabel("Time (seconds)")
pyplot.plot(maxenddf.index, end_offsets)

In [None]:
# Per-uuid runtime: latest endTime minus earliest startTime, divided by 1000 to get seconds.
runtimedf = (
    df_8lambdas
    .groupby("uuid")[["startTime", "endTime"]]
    .agg(minstart=("startTime", "min"), maxend=("endTime", "max"))
    .reset_index()
)
runtimedf["runtime"] = (runtimedf["maxend"] - runtimedf["minstart"]).div(1000)

pyplot.figure(figsize=(10, 10))
pyplot.plot(runtimedf.index, runtimedf["runtime"])

In [None]:
# Keep only the columns needed for the CPU analysis.
cpu_columns = ["uuid", "startTime", "endTime", "cpuUsr"]
cpudf = df_8lambdas[cpu_columns]
cpudf

In [None]:
cpudf = df_8lambdas[["uuid", "startTime", "endTime", "cpuUsr"]]

# For every uuid, store on each row the change in cpuUsr up to the following row
# (diff() followed by shift(-1)); the last row of each uuid therefore holds NaN.
# assign() builds a new frame per group, so no column is written into a slice of
# cpudf and pandas does not emit SettingWithCopyWarning.
dfs = [
    curdf.assign(cpuPercent=curdf["cpuUsr"].diff().shift(-1))
    for label, curdf in cpudf.groupby("uuid")
]

# Stack the per-uuid frames (in group order) and add a datetime version of startTime.
cpupercentdf = pandas.concat(dfs).reset_index(drop=True)
cpupercentdf["startDateTime"] = (cpupercentdf["startTime"] / 1000).apply(datetime.fromtimestamp)
cpupercentdf

In [None]:
networkdf = df_8lambdas[["uuid", "startTime", "endTime", "network_rx_bytes.vinternal_1"]]

# For every uuid, store on each row the change in the received-bytes column up to
# the following row (diff() followed by shift(-1)); the last row of each uuid is NaN.
# assign() builds a new frame per group, so no column is written into a slice of
# networkdf and pandas does not emit SettingWithCopyWarning.
dfs = [
    curdf.assign(networkpersecond=curdf["network_rx_bytes.vinternal_1"].diff().shift(-1))
    for label, curdf in networkdf.groupby("uuid")
]

# Stack the per-uuid frames (in group order) and add a datetime version of startTime.
networkperseconddf = pandas.concat(dfs).reset_index(drop=True)
networkperseconddf["startDateTime"] = (networkperseconddf["startTime"] / 1000).apply(datetime.fromtimestamp)
networkperseconddf

# Creating the Bins

In [None]:
# Independent copy of the columns used for binning, taken from the 64-partition data.
metric_columns = ["uuid", "startTime", "cpuUsr", "network_rx_bytes.vinternal_1"]
metrics = dflambdas[64][metric_columns].copy()
metrics

In [None]:
# Convert startTime (divided by 1000) into datetime objects.
start_seconds = metrics["startTime"] / 1000
metrics["startDateTime"] = start_seconds.apply(datetime.fromtimestamp)
metrics

In [None]:
def round_seconds(obj: datetime, how="previous") -> datetime:
    """Shift ``obj`` by one second and drop its microseconds.

    With ``how="previous"`` the shift is one second back; any other value of
    ``how`` shifts one second forward. The microsecond field of the shifted
    value is then set to zero.
    """
    shift = timedelta(seconds=-1 if how == "previous" else 1)
    return (obj + shift).replace(microsecond=0)

# Bin edges every 2 seconds, starting at the padded start_datetime and closed
# off by one extra edge just after end_datetime.
bin_width = timedelta(seconds=2)
timebins = []
current_datetime = round_seconds(start_datetime)
while current_datetime <= end_datetime:
    timebins.append(current_datetime)
    current_datetime = current_datetime + bin_width
timebins += [round_seconds(end_datetime, how="next")]
timebins

In [None]:
# Assign every row to the time bin that contains its startDateTime.
metrics["bins"] = pandas.cut(x=metrics["startDateTime"], bins=timebins)
metrics

In [None]:
pyplot.figure(figsize=(10, 10))
# One line per uuid; adding the enumeration index shifts every line up by one
# so the individual uuids do not overlap.
for offset, (lambda_uuid, lambda_rows) in enumerate(metrics.groupby("uuid")):
    rows_per_start = lambda_rows.groupby("startTime")["cpuUsr"].count()
    pyplot.plot(rows_per_start + offset)
pyplot.xlabel("Timestamp")
pyplot.ylabel("Lambda invokation")

In [None]:
# For every time bin, print the number of rows and the number of distinct uuids.
for grouplabel, groupdf in metrics.groupby(["bins"]):
    entries = len(groupdf)
    distinct_uuids = groupdf["uuid"].unique()
    print(f"Entries: {entries}")
    print(len(distinct_uuids))

In [None]:
# Assign every CPU row to the time bin that contains its startDateTime.
cpupercentdf["bins"] = pandas.cut(x=cpupercentdf["startDateTime"], bins=timebins)
cpupercentdf

In [None]:
# Assign every network row to the time bin that contains its startDateTime.
networkperseconddf["bins"] = pandas.cut(x=networkperseconddf["startDateTime"], bins=timebins)
networkperseconddf

In [None]:
# One line per uuid: row-to-row change in received bytes against the row's start time.
for lambda_uuid, lambda_rows in networkperseconddf.groupby("uuid"):
    timestamps = lambda_rows["startDateTime"]
    received_delta = lambda_rows["networkpersecond"]
    pyplot.plot(timestamps, received_delta)

In [None]:
# Distribution of the non-missing row-to-row network deltas, scaled down by 10**6.
network_scaled = networkperseconddf["networkpersecond"].dropna() / 10**6
pyplot.boxplot(network_scaled)

In [None]:
# For every time bin, print the number of rows and the number of distinct uuids.
for grouplabel, groupdf in cpupercentdf.groupby(["bins"]):
    entries = len(groupdf)
    distinct_uuids = groupdf["uuid"].unique()
    print(f"Entries: {entries}")
    print(len(distinct_uuids))

In [None]:
def f(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = series.unique()
    return len(distinct_values)
# Per time bin: sum of cpuUsr and number of distinct uuids.
cpusumdf = (
    metrics
    .groupby("bins")
    .agg(cpu_sum=("cpuUsr", "sum"), numlambdas=("uuid", f))
    .reset_index()
)
cpusumdf

In [None]:
# Per time bin: sum of the per-row CPU deltas and number of distinct uuids.
cpusumdfpercent = cpupercentdf.groupby("bins").agg(cpu_sum=("cpuPercent","sum"), numlambdas=("uuid",f)).reset_index()

# Label every bin with the whole seconds elapsed between the first bin's left
# edge and its own left edge.
cpusumdfpercent["timelabels"] = cpusumdfpercent["bins"].apply(lambda x: x.left)
init_time = cpusumdfpercent["timelabels"].iloc[0]
# total_seconds() covers the full span; the `.seconds` attribute holds only the
# seconds component of the difference and starts again at 0 after a full day.
cpusumdfpercent["timelabels"] = cpusumdfpercent["timelabels"].apply(
    lambda x: int((x - init_time).total_seconds())
)
cpusumdfpercent

In [None]:
# Show the left edge of every bin interval in cpusumdfpercent.
cpusumdfpercent["bins"].apply(lambda x: x.left)

In [None]:
# Per time bin: sum of the received-bytes column and number of distinct uuids.
networksumdf = (
    metrics
    .groupby("bins")
    .agg(network_sum=("network_rx_bytes.vinternal_1", "sum"), numlambdas=("uuid", f))
    .reset_index()
)
networksumdf

In [None]:
# Running total over the time bins of the per-bin received-bytes sums.
network_per_bin = metrics[["bins", "network_rx_bytes.vinternal_1"]].groupby("bins").sum()
networkcumsumdf = network_per_bin.cumsum().reset_index()

bin_starts = networkcumsumdf["bins"].apply(lambda x: x.left)
# Scaled by 10**9 for the GB axis and halved, as in the other per-bin plots.
cumulative_gb = networkcumsumdf["network_rx_bytes.vinternal_1"] / 10**9 / 2

fig, ax = pyplot.subplots(figsize=(10, 10))
ax.plot(bin_starts, cumulative_gb, label="Data Transferred")
ax.set_xlabel("Absolute time")
ax.set_ylabel("Cumulative data transferred (GB)")
fig.savefig("network_sum_pps.png", format="png", dpi=300, bbox_inches="tight")

In [None]:
# Running total over the time bins of the per-bin cpu_sum values.
cpu_per_bin = cpusumdf[["bins", "cpu_sum"]].groupby("bins").sum()
cpucumsumdf = cpu_per_bin.cumsum().reset_index()

bin_starts = cpucumsumdf["bins"].apply(lambda x: x.left)

fig, ax = pyplot.subplots(figsize=(10, 10))
ax.plot(bin_starts, cpucumsumdf["cpu_sum"] / 2, label="CPU Usage")
ax.set_xlabel("Absolute time")
ax.set_ylabel("CPU usage (sum)")
fig.savefig("cpuusage_sum.png", format="png", dpi=300, bbox_inches="tight")

In [None]:
# CPU on the left axis and transferred data on a twin right axis, one bar per time bin.
fig, ax = pyplot.subplots(figsize=(10, 10))

cpu_per_bin = cpusumdfpercent["cpu_sum"]
ax.bar(cpusumdfpercent.index, cpu_per_bin, color="green")
ax.set_ylabel("CPU Usage (Percentage)", color="green")

ax2 = ax.twinx()
network_gb_per_bin = networksumdf["network_sum"] / 10**9
ax2.bar(networksumdf.index, network_gb_per_bin, color="blue")
ax2.set_ylabel("Transfered data (GB)", color="blue")

ax.set_xlabel("Time increment (2 seconds)")
fig.savefig("cpuandnetwork.png", format="png", dpi=300, bbox_inches="tight")

In [None]:
# Number of distinct uuids seen in each time bin.
fig, ax = pyplot.subplots(figsize=(10, 10))
ax.plot(cpusumdf["numlambdas"])
ax.set_xlabel("Time increment (2 seconds)")
ax.set_ylabel("Number of Lambdas")
fig.savefig("numlambdas.png", format="png", dpi=300, bbox_inches="tight")

In [None]:
# Per-bin CPU sum, halved, as a bar chart.
fig, ax = pyplot.subplots(figsize=(10, 10))
halved_cpu_per_bin = cpusumdfpercent["cpu_sum"] / 2
ax.bar(cpusumdfpercent.index, halved_cpu_per_bin)
ax.set_ylabel("CPU Usage (Percentage)", color="green")


# 128 Lambda
Chosen because 128 is the highest number of lambdas that still shows nice behaviour.

In [None]:
# Earliest startTime per uuid, divided by 1000 to get seconds.
minstartdf = df_128lambdas.groupby("uuid")[["startTime"]].min().reset_index()
minstartdf["startTime"] = minstartdf["startTime"].div(1000)

# Histogram of every start expressed as an offset from the earliest start.
start_offsets = minstartdf["startTime"] - minstartdf["startTime"].min()

fig, ax = pyplot.subplots(figsize=(10, 10))
ax.set_xlabel("Time [s]", size=20)
ax.set_ylabel("Occurences", size=20)
ax.tick_params(labelsize=20)
ax.hist(start_offsets, bins=25)

fig.savefig("relativestart_pps.png", format="png", dpi=400, bbox_inches="tight")

In [None]:
# Latest endTime per uuid, divided by 1000 to get seconds.
maxenddf = df_128lambdas.groupby("uuid")[["endTime"]].max().reset_index()
maxenddf["endTime"] = maxenddf["endTime"].div(1000)

# Histogram of every end expressed as an offset from the earliest end.
end_offsets = maxenddf["endTime"] - maxenddf["endTime"].min()

fig, ax = pyplot.subplots(figsize=(10, 10))
ax.set_xlabel("Time [s]", size=20)
ax.set_ylabel("Occurences", size=20)
ax.tick_params(labelsize=20)
ax.hist(end_offsets, bins=25)

fig.savefig("relativeend_pps.png", format="png", dpi=400, bbox_inches="tight")

# Network and CPU intervals

In [None]:
cpudf = dflambdas[64][["uuid", "startTime", "endTime", "cpuUsr"]]

# For every uuid, store on each row the change in cpuUsr up to the following row
# (diff() followed by shift(-1)); the last row of each uuid therefore holds NaN.
# assign() builds a new frame per group, so no column is written into a slice of
# cpudf and pandas does not emit SettingWithCopyWarning.
dfs = [
    curdf.assign(cpuPercent=curdf["cpuUsr"].diff().shift(-1))
    for label, curdf in cpudf.groupby("uuid")
]

# Stack the per-uuid frames (in group order) and add a datetime version of startTime.
cpupercentdf = pandas.concat(dfs).reset_index(drop=True)
cpupercentdf["startDateTime"] = (cpupercentdf["startTime"] / 1000).apply(datetime.fromtimestamp)
cpupercentdf

In [None]:
networkdf = dflambdas[64][["uuid", "startTime", "endTime", "network_rx_bytes.vinternal_1"]]

# For every uuid, store on each row the change in the received-bytes column up to
# the following row (diff() followed by shift(-1)); the last row of each uuid is NaN.
# assign() builds a new frame per group, so no column is written into a slice of
# networkdf and pandas does not emit SettingWithCopyWarning.
dfs = [
    curdf.assign(networkpersecond=curdf["network_rx_bytes.vinternal_1"].diff().shift(-1))
    for label, curdf in networkdf.groupby("uuid")
]

# Stack the per-uuid frames (in group order) and add a datetime version of startTime.
networkperseconddf = pandas.concat(dfs).reset_index(drop=True)
networkperseconddf["startDateTime"] = (networkperseconddf["startTime"] / 1000).apply(datetime.fromtimestamp)
networkperseconddf

In [None]:
# Rows of the uuid that appears in row 0 of each frame, with time measured
# from that row's startTime (divided by 1000 to get seconds).
first_cpu_uuid = cpupercentdf["uuid"][0]
singlelambdacpu = cpupercentdf[cpupercentdf["uuid"] == first_cpu_uuid].copy()
cpu_start = singlelambdacpu["startTime"][0]
singlelambdacpu["timefromstart"] = (singlelambdacpu["startTime"] - cpu_start) / 1000

first_network_uuid = networkperseconddf["uuid"][0]
singlelambdanetwork = networkperseconddf[networkperseconddf["uuid"] == first_network_uuid].copy()
network_start = singlelambdanetwork["startTime"][0]
singlelambdanetwork["timefromstart"] = (singlelambdanetwork["startTime"] - network_start) / 1000

fig, ax = pyplot.subplots(figsize=(16, 9))

# Left axis: per-row CPU delta as a line with a shaded area underneath.
cpu_time = singlelambdacpu["timefromstart"]
cpu_values = singlelambdacpu["cpuPercent"]
ax.plot(cpu_time, cpu_values, color="orange")
ax.fill_between(cpu_time, cpu_values, color="orange", alpha=0.2)
ax.set_xlabel("Time from start [s]", size=25)
ax.set_ylabel("CPU usage [%/s]", color="orange", size=25)
ax.tick_params(labelsize=25)

# Right axis: per-row network delta scaled down by 10**6.
ax2 = ax.twinx()
network_time = singlelambdanetwork["timefromstart"]
network_values = singlelambdanetwork["networkpersecond"] / 10**6
ax2.plot(network_time, network_values, color="blue")
ax2.fill_between(network_time, network_values, color="blue", alpha=0.2)
ax2.set_ylabel("Network traffic [MB/s]", color="blue", size=25)
ax2.tick_params(labelsize=25)

fig.savefig("network_cpu_intervals.png", format="png", dpi=400, bbox_inches="tight")

# Aggregated CPU usage

In [None]:
cpudf = dflambdas[8][["uuid", "startTime", "endTime", "cpuUsr"]]

# For every uuid, store on each row the change in cpuUsr up to the following row
# (diff() followed by shift(-1)); the last row of each uuid therefore holds NaN.
# assign() builds a new frame per group, so no column is written into a slice of
# cpudf and pandas does not emit SettingWithCopyWarning.
dfs = [
    curdf.assign(cpuPercent=curdf["cpuUsr"].diff().shift(-1))
    for label, curdf in cpudf.groupby("uuid")
]

# Stack the per-uuid frames (in group order) and add datetime versions of both timestamps.
cpupercentdf = pandas.concat(dfs).reset_index(drop=True)
cpupercentdf["startDateTime"] = (cpupercentdf["startTime"] / 1000).apply(datetime.fromtimestamp)
cpupercentdf["endDateTime"] = (cpupercentdf["endTime"] / 1000).apply(datetime.fromtimestamp)

In [None]:
def round_seconds(obj: datetime, how="previous") -> datetime:
    """Move ``obj`` one second away and zero its microsecond field.

    ``how="previous"`` moves one second into the past; every other value of
    ``how`` moves one second into the future.
    """
    direction = -1 if how == "previous" else 1
    shifted = obj + timedelta(seconds=direction)
    return shifted.replace(microsecond=0)

# Bin edges every 2 seconds across the CPU rows, plus one closing edge.
bin_width = timedelta(seconds=2)
current_datetime = round_seconds(cpupercentdf["startDateTime"].min())
# NOTE(review): the end is rounded with the default how="previous", so the closing
# edge appended below is the latest endDateTime cut back to a whole second;
# confirm that this (rather than an edge after the latest end) is intended.
end_datetime = round_seconds(cpupercentdf["endDateTime"].max())

timebins = []
while current_datetime <= end_datetime:
    timebins.append(current_datetime)
    current_datetime = current_datetime + bin_width
timebins += [round_seconds(end_datetime, how="next")]

# Assign every CPU row to the time bin that contains its startDateTime.
cpupercentdf["bins"] = pandas.cut(x=cpupercentdf["startDateTime"], bins=timebins)

In [None]:
def f(series):
    """Count the distinct values present in ``series``."""
    distinct_values = series.unique()
    return len(distinct_values)

# Per time bin: sum of the per-row CPU deltas and number of distinct uuids.
cpusumdfpercent = cpupercentdf.groupby("bins").agg(cpu_sum=("cpuPercent","sum"), numlambdas=("uuid",f)).reset_index()

# Label every bin with the whole seconds elapsed between the first bin's left
# edge and its own left edge.
cpusumdfpercent["timelabels"] = cpusumdfpercent["bins"].apply(lambda x: x.left)
init_time = cpusumdfpercent["timelabels"].iloc[0]
# total_seconds() covers the full span; the `.seconds` attribute holds only the
# seconds component of the difference and starts again at 0 after a full day.
cpusumdfpercent["timelabels"] = cpusumdfpercent["timelabels"].apply(
    lambda x: int((x - init_time).total_seconds())
)

In [None]:
# Per-bin CPU sum, halved, against the seconds elapsed since the first bin.
elapsed_seconds = cpusumdfpercent["timelabels"]
halved_cpu_per_bin = cpusumdfpercent["cpu_sum"] / 2

fig, ax = pyplot.subplots(figsize=(16, 9))
ax.plot(elapsed_seconds, halved_cpu_per_bin, label="CPU Usage")
ax.set_xlabel("Time from start [s]", size=25)
ax.set_ylabel("CPU usage [%]", size=25)
ax.tick_params(labelsize=25)

fig.savefig("cpuusage_pps.png", format="png", dpi=400, bbox_inches="tight")