In [5]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import pandas as pd
import numpy as np
from matplotlib.dates import DateFormatter
from pathlib import Path

import seaborn as sns
sns.set_context("talk")

### Load in data

In [11]:
# Read the '|'-separated per-process samples.
files = Path('out.csv')
total_data = pd.read_csv(files, sep='|')

# Scale the raw memory counters by 1024**3 (presumably bytes -> GiB; confirm
# against whatever produced out.csv).
total_data['rss_GB'] = total_data['rss'] / (1024**3)
total_data['vms_GB'] = total_data['vms'] / (1024**3)
total_data['shared_GB'] = total_data['shared'] / (1024**3)

# Convert the sample timestamps to US/Pacific and install them as the index.
# Previously the converted values were only displayed and then thrown away, so
# the frame kept its default integer index; the plotting cells below use
# time-based rolling windows (e.g. rolling("1s")) and compare the index with
# date strings, both of which require a DatetimeIndex.
timestamps = pd.to_datetime(total_data['@datetime']).dt.tz_convert('US/Pacific')
# The index gets its own name so it cannot be confused with the '@datetime'
# column, which is kept.
total_data.index = pd.DatetimeIndex(timestamps, name='timestamp')
# Stable sort keeps the file order of samples that share a timestamp.
total_data.sort_index(inplace=True, kind='stable')

# Cell output: the converted timestamps, in file order.
timestamps

0     2024-05-25 18:54:56.848859029-07:00
1     2024-05-25 18:54:56.849183326-07:00
2     2024-05-25 18:54:56.856405495-07:00
3     2024-05-25 18:54:56.857913207-07:00
4     2024-05-25 18:54:56.858229829-07:00
                      ...                
365   2024-05-25 18:55:06.472265332-07:00
366   2024-05-25 18:55:06.472560213-07:00
367   2024-05-25 18:55:06.472845195-07:00
368   2024-05-25 18:55:06.473118912-07:00
369   2024-05-25 18:55:06.473416551-07:00
Name: @datetime, Length: 370, dtype: datetime64[ns, US/Pacific]

### Sort unique process names by their number of time points (most frequent first)

In [3]:
# Count how many rows each distinct command contributes.
name, counts = np.unique(total_data.cmd, return_counts=True)

# Permutation that puts the largest count first (ascending argsort, reversed).
_sort = np.flip(np.argsort(counts))
name, counts = name[_sort], counts[_sort]

# One "<command> <count>" line per process.
for n, c in zip(name, counts):
    print(*(n, c))

/home/ranger/.vscode-server/cli/servers/Stable-dc96b837cf6bb4af9cd736aa3af08cf8279f7685/server/node 56
-zsh 40
/bin/zsh 16
python3 16
sshd: ranger@pts/4 8
/lib/systemd/systemd 8
./target/debug/pagurus 8
/bin/sh 8
/home/ranger/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/libexec/rust-analyzer-proc-macro-srv 8
/home/ranger/.vscode-server/code-dc96b837cf6bb4af9cd736aa3af08cf8279f7685 8
/home/ranger/.vscode-server/data/User/globalStorage/ms-dotnettools.vscode-dotnet-runtime/.dotnet/6.0.30~x64/dotnet 8
/home/ranger/.vscode-server/extensions/rust-lang.rust-analyzer-0.3.1958-linux-x64/server/rust-analyzer 8
/usr/lib/plexmediaserver/Plex Media Server 8
sshd: ranger@pts/2 8
/usr/lib/plexmediaserver/Plex Tuner Service 8
Plex Plug-in [com.plexapp.system] 8
bash 8
sh 8
sleep 8
ssh 8
sshd: ranger@notty 8
sshd: ranger@pts/0 8
sshd: ranger@pts/1 8
(sd-pam) 8


### Aggregate memory usage by process name (max and mean of RSS and VMS, in GB)

In [4]:
# Per-command maximum and mean of the resident and virtual memory columns.
total_data.groupby('cmd').agg(
    {column: ['max', 'mean'] for column in ('rss_GB', 'vms_GB')}
)

Unnamed: 0_level_0,rss_GB,rss_GB,vms_GB,vms_GB
Unnamed: 0_level_1,max,mean,max,mean
cmd,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
(sd-pam),0.001488,0.001488,0.161804,0.161804
-zsh,0.007999,0.006548,0.012337,0.010358
./target/debug/pagurus,0.00164,0.001468,0.069229,0.069117
/bin/sh,0.000927,0.000927,0.002758,0.002758
/bin/zsh,0.006771,0.006712,0.008354,0.008242
/home/ranger/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/libexec/rust-analyzer-proc-macro-srv,0.005554,0.00523,0.077858,0.069068
/home/ranger/.vscode-server/cli/servers/Stable-dc96b837cf6bb4af9cd736aa3af08cf8279f7685/server/node,0.725327,0.190889,11.507294,3.844815
/home/ranger/.vscode-server/code-dc96b837cf6bb4af9cd736aa3af08cf8279f7685,0.01474,0.01474,0.034199,0.034199
/home/ranger/.vscode-server/data/User/globalStorage/ms-dotnettools.vscode-dotnet-runtime/.dotnet/6.0.30~x64/dotnet,0.013672,0.013672,3.227318,3.227318
/home/ranger/.vscode-server/extensions/rust-lang.rust-analyzer-0.3.1958-linux-x64/server/rust-analyzer,0.787487,0.777163,1.368313,1.368273


### Plot the memory usage of the workflow over time

In [None]:
# remove duplicate legends
# https://stackoverflow.com/a/56253636
def legend_without_duplicate_labels(ax):
    """Draw a legend on ``ax`` that lists every distinct label only once.

    For a label shared by several artists, the first artist's handle is used.
    The legend is placed to the right of the axes in three columns, and its
    handles are made fully opaque so translucent lines stay readable.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes whose labelled artists should appear in the legend.

    Returns
    -------
    None
        Nothing is drawn when ``ax`` has no labelled artists.
    """
    handles, labels = ax.get_legend_handles_labels()

    # A dict keeps the first handle seen per label and preserves insertion
    # order, replacing the quadratic "label not in labels[:i]" scan.
    first_handle = {}
    for handle, label in zip(handles, labels):
        first_handle.setdefault(label, handle)

    # Nothing labelled: skip the legend instead of drawing an empty one.
    if not first_handle:
        return

    leg = ax.legend(list(first_handle.values()), list(first_handle.keys()),
                    loc='center left', bbox_to_anchor=(1.05, 0.75), ncol=3, fancybox=True)

    # Set legend to alpha=1.0
    # https://stackoverflow.com/a/42403471
    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9
    # removed the old name; fall back to it only on older releases.
    legend_handles = getattr(leg, 'legend_handles', None)
    if legend_handles is None:
        legend_handles = leg.legendHandles
    for lh in legend_handles:
        lh.set_alpha(1)

fig, ax = plt.subplots(figsize=(12,8))
# Get a color map. plt.get_cmap is used because cm.get_cmap was removed in
# Matplotlib 3.9.
cmap = plt.get_cmap('tab20')

colors = {}

# Programs that are left out of the figure.
bad_progs = ["alloc"]

# Give each process a unique color. The color map has a fixed number of
# entries, so names beyond that stay without a color and are drawn in black.
for n, color in zip(name, cmap.colors):
    if n in bad_progs or n in colors:
        continue
    print(n)
    colors[n] = color

# Time-based rolling windows such as rolling("1s") only work on a
# DatetimeIndex. Use the frame as is when it already has one; otherwise build
# a time-indexed view from the '@datetime' column.
if isinstance(total_data.index, pd.DatetimeIndex):
    plot_data = total_data
else:
    plot_data = total_data.set_index(
        pd.DatetimeIndex(
            pd.to_datetime(total_data['@datetime']).dt.tz_convert('US/Pacific'),
            name='timestamp',
        )
    ).sort_index(kind='stable')

# Plot the memory usage over time including all processes
for n in name:
    if n in bad_progs:
        continue
    data = plot_data[plot_data['cmd'] == n]

    # One line per process id, so separate instances of a command stay distinct.
    for pid in data.pid.unique():
        process_data = data[data.pid == pid]
        # Skip processes with fewer than 100 samples.
        if len(process_data) < 100:
            continue
        # The virtual-memory column created at load time is 'vms_GB'
        # (there is no 'mem_vms_GB' column).
        mem = process_data['vms_GB'].rolling("1s").mean()
        if n in colors:
            mem.plot(ax=ax, c=colors[n], alpha=0.5, label=n)
        else:
            # No color assigned: draw in black and keep it out of the legend.
            mem.plot(ax=ax, c='black', alpha=0.5, label="")

legend_without_duplicate_labels(ax)
date_form = DateFormatter("%H:%M")
ax.xaxis.set_major_formatter(date_form)
ax.set_ylim(0, 6)
ax.set_title('Workflow memory usage over time')
ax.set_ylabel("mem usage GB")
plt.show()

In [None]:
# Keep the samples ordered by index before plotting.
total_data = total_data.sort_index()

# remove duplicate legends
# https://stackoverflow.com/a/56253636
def legend_without_duplicate_labels(ax):
    """Draw a legend on ``ax`` that lists every distinct label only once.

    For a label shared by several artists, the first artist's handle is used.
    The legend uses Matplotlib's default placement, in three columns, and its
    handles are made fully opaque so translucent lines stay readable.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes whose labelled artists should appear in the legend.

    Returns
    -------
    None
        Nothing is drawn when ``ax`` has no labelled artists.
    """
    handles, labels = ax.get_legend_handles_labels()

    # A dict keeps the first handle seen per label and preserves insertion
    # order, replacing the quadratic "label not in labels[:i]" scan.
    first_handle = {}
    for handle, label in zip(handles, labels):
        first_handle.setdefault(label, handle)

    # Nothing labelled: skip the legend instead of drawing an empty one.
    if not first_handle:
        return

    leg = ax.legend(list(first_handle.values()), list(first_handle.keys()),
                    #loc='center left',
                    #bbox_to_anchor=(0.05, 0.75),
                    ncol=3, fancybox=True)

    # Set legend to alpha=1.0
    # https://stackoverflow.com/a/42403471
    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9
    # removed the old name; fall back to it only on older releases.
    legend_handles = getattr(leg, 'legend_handles', None)
    if legend_handles is None:
        legend_handles = leg.legendHandles
    for lh in legend_handles:
        lh.set_alpha(1)

# Two stacked panels sharing the time axis: memory on top (ax[0]), CPU below (ax[1]).
fig, ax = plt.subplots(figsize=(12,8), nrows=2, sharex=True)

# Programs left out of this figure. Defined before the color loop so that the
# color loop and the plotting loop both see the same list.
bad_progs = ["pigz","mv", "sh", "cp", "data_plugin", "curl_plugin", "alloc"]

# Give each process a unique color. This extends the existing `colors` mapping
# and reuses `cmap`; both must already be defined when this cell runs.
for n, color in zip(name, cmap.colors):
    if n in bad_progs or n in colors:
        continue
    print(n)
    colors[n] = color

# Time-based rolling windows and the date-string comparison below only work on
# a DatetimeIndex. Use the frame as is when it already has one; otherwise
# build a time-indexed view from the '@datetime' column.
if isinstance(total_data.index, pd.DatetimeIndex):
    plot_data = total_data
else:
    plot_data = total_data.set_index(
        pd.DatetimeIndex(
            pd.to_datetime(total_data['@datetime']).dt.tz_convert('US/Pacific'),
            name='timestamp',
        )
    ).sort_index(kind='stable')

# Plot the memory usage over time including all processes
for n in name:
    if n in bad_progs:
        continue
    # `name` holds values of the 'cmd' column, so filter on that column.
    data = plot_data[plot_data['cmd'] == n]
    data = data[data.index > "2022-09-02 9:00:00.000000"]

    for pid in data.pid.unique():
        process_data = data[data.pid == pid]
        # Only referenced by the commented-out title below.
        # NOTE(review): assumes 'node' and 'jobid' columns exist — confirm.
        nid = process_data.node.iloc[0]
        jid = process_data.jobid.iloc[0]
        # Skip processes with more than 10000 samples.
        if len(process_data) > 10000:
            continue
        # This only works because we get the derivative over 10 seconds
        # You need to change a bit for different time intervals
        # NOTE(review): assumes cumulative 'cpu_system', 'cpu_user' and
        # 'cpu_iowait' columns exist — confirm.
        total_cpu_time = (process_data.cpu_system + process_data.cpu_user + process_data.cpu_iowait)
        # raw=True hands each window to the lambda as a NumPy array, so x[-1]
        # and x[0] are plain positional lookups. On the default Series window
        # an integer key on a datetime index relies on a deprecated positional
        # fallback.
        cpu = (100/10)*(total_cpu_time).rolling('10s').apply(lambda x: x[-1] - x[0], raw=True)
        # The virtual-memory column created at load time is 'vms_GB'.
        mem = process_data['vms_GB'].rolling("10s").mean()
        if n in colors:
            line_style = dict(c=colors[n], alpha=0.8, label=n)
        else:
            # No color assigned: report the name, draw in black, and keep the
            # line out of the legend. Both panels are still drawn.
            print(repr(n))
            line_style = dict(c='black', alpha=0.8, label="")
        cpu.plot(ax=ax[1], **line_style)
        mem.plot(ax=ax[0], **line_style)

# plt.ylim(0,120)
legend_without_duplicate_labels(ax[1])

date_form = DateFormatter("%H:%M")
ax[1].xaxis.set_major_formatter(date_form)
# Anchor the CPU panel at zero; the upper limit stays automatic.
ax[1].set_ylim(0, None)
plt.subplots_adjust(wspace=0, hspace=0)
# ax[0].set_yscale("log")
# plt.title(f'Node {nid} CPU usage jobid {jid}')
ax[0].set_title('Workflow Performance')
ax[1].set_ylabel("CPU Usage [%]")
ax[0].set_ylabel("Memory Usage [GB]")
plt.show()

In [None]:
# Display the current process-name -> color mapping.
colors

In [None]:
# Keep the samples ordered by index before plotting.
total_data = total_data.sort_index()

# remove duplicate legends
# https://stackoverflow.com/a/56253636
def legend_without_duplicate_labels(ax):
    """Draw a legend on ``ax`` that lists every distinct label only once.

    For a label shared by several artists, the first artist's handle is used.
    The legend is anchored inside the axes near the left edge, in three
    columns, and its handles are made fully opaque so translucent lines stay
    readable.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes whose labelled artists should appear in the legend.

    Returns
    -------
    None
        Nothing is drawn when ``ax`` has no labelled artists.
    """
    handles, labels = ax.get_legend_handles_labels()

    # A dict keeps the first handle seen per label and preserves insertion
    # order, replacing the quadratic "label not in labels[:i]" scan.
    first_handle = {}
    for handle, label in zip(handles, labels):
        first_handle.setdefault(label, handle)

    # Nothing labelled: skip the legend instead of drawing an empty one.
    if not first_handle:
        return

    leg = ax.legend(list(first_handle.values()), list(first_handle.keys()),
                    loc='center left', bbox_to_anchor=(0.05, 0.75), ncol=3, fancybox=True)

    # Set legend to alpha=1.0
    # https://stackoverflow.com/a/42403471
    # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9
    # removed the old name; fall back to it only on older releases.
    legend_handles = getattr(leg, 'legend_handles', None)
    if legend_handles is None:
        legend_handles = leg.legendHandles
    for lh in legend_handles:
        lh.set_alpha(1)

# NOTE(review): assumes total_data has a 'node' column — confirm.
nodes = total_data.node.unique()
colors = {}
# Give each process a unique color. `cmap` must already be defined when this
# cell runs.
for n, color in zip(name, cmap.colors):
    if n in colors:
        continue
    print(n)
    colors[n] = color

# Programs left out of the figures (hoisted: the list is the same for every node).
bad_prog = ["hisat2-align-s", 'featureCounts', 'data_plugin', 'curl_plugin', 'mv', 'cp']

# Time-based rolling windows and the date-string comparison below only work on
# a DatetimeIndex. Use the frame as is when it already has one; otherwise
# build a time-indexed view from the '@datetime' column.
if isinstance(total_data.index, pd.DatetimeIndex):
    plot_data = total_data
else:
    plot_data = total_data.set_index(
        pd.DatetimeIndex(
            pd.to_datetime(total_data['@datetime']).dt.tz_convert('US/Pacific'),
            name='timestamp',
        )
    ).sort_index(kind='stable')

for nid in nodes:
    fig, ax = plt.subplots(figsize=(12,8))

    # Plot the memory usage over time including all processes
    for n in name:
        if n in bad_prog:
            continue
        # `name` holds values of the 'cmd' column, so filter on that column.
        data = plot_data[plot_data['cmd'] == n]
        # NOTE(review): with the per-node filter disabled, every iteration of
        # the outer loop draws the same data — confirm this is intended.
        # data = data[data.node == nid]
        # NOTE(review): hard-coded cutoff; the timestamps displayed when the
        # data is loaded are from 2024 and would all be excluded — confirm.
        data = data[data.index < "2022-07-13 14:00:00.000000"]

        for pid in data.pid.unique():
            process_data = data[data.pid == pid]
            # Skip processes with more than 10000 samples.
            if len(process_data) > 10000:
                continue
            # This only works because we get the derivative over 10 seconds
            # You need to change a bit for different time intervals
            # NOTE(review): assumes cumulative 'cpu_system', 'cpu_user' and
            # 'cpu_iowait' columns exist — confirm.
            total_cpu_time = (process_data.cpu_system + process_data.cpu_user + process_data.cpu_iowait)
            # raw=True hands each window to the lambda as a NumPy array, so
            # x[-1] and x[0] are plain positional lookups. On the default
            # Series window an integer key on a datetime index relies on a
            # deprecated positional fallback.
            cpu = (100/10)*(total_cpu_time).rolling('10s').apply(lambda x: x[-1] - x[0], raw=True)
            # Drop traces that peak above 400 % or average below 2 %.
            if np.max(cpu) > 400:
                continue
            if np.mean(cpu) < 2:
                continue
            if n in colors:
                cpu.plot(ax=ax, c=colors[n], alpha=0.8, label=n)
            else:
                # No color assigned: report the name and draw in black.
                print(repr(n))
                cpu.plot(ax=ax, c='black', alpha=0.8)

    # plt.ylim(0,120)
    legend_without_duplicate_labels(ax)

    date_form = DateFormatter("%H:%M")
    ax.xaxis.set_major_formatter(date_form)
    # Anchor the y axis at zero; the upper limit stays automatic.
    ax.set_ylim(0, None)
    ax.set_title(f'Workflow CPU usage over {len(nodes)} nodes')
    ax.set_ylabel("CPU Usage [%]")
    plt.show()