In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

In [41]:
# Load the SLURM usage export. The file is pipe-delimited, hence delimiter="|".
# No row limit (nrows) is passed, so the whole file is read into memory.
slurm = pd.read_csv("/mnt/research/CMSE495-SS24-ICER/slurm_usage/DID_FINAL_SLURM_OCT_2023.csv",delimiter="|")


In [42]:
# Drop the columns that are not needed (presumably index columns left over
# from earlier CSV round-trips -- TODO confirm).
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
slurm = slurm.drop(columns=["Unnamed: 0.1","Unnamed: 0"], errors="ignore")

In [44]:
# Report (rows, columns), then display the frame; pandas abbreviates the
# rendering of a frame this large, so only the first and last rows appear.
print("The size of SLURM dataset is:",slurm.shape)
slurm

The size of SLURM dataset is: (6376308, 26)


Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
0,31496544,user_679,group_121,2023-03-21T11:13:45,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
1,31497932,user_679,group_121,2023-03-21T11:31:18,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
2,31993628,user_105,group_114,2023-03-22T18:19:12,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_661,...,0,12,0,150G,,1,1,None assigned,"billing=23347,cpu=12,gres/gpu=8,mem=150G,node=1",
3,39087660,user_652,group_054,2023-04-04T13:09:10,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_557,...,0,640,0,20G,,10,10,None assigned,"billing=3112,cpu=640,mem=20G,node=10",
4,59062820,user_188,group_046,2023-05-08T09:58:20,2024-01-01T00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6376303,99254104,user_178,group_154,2023-10-31T23:59:41,2023-11-01T02:51:56,2023-11-01T02:53:44,00:01:48,COMPLETED,account_012,assocID_393,...,108,1,1,505202M,,1,1,amr-004,"billing=76790,cpu=1,mem=505202M,node=1","billing=76790,cpu=1,mem=505202M,node=1"
6376304,99254108,user_466,group_096,2023-10-31T23:59:45,2023-11-01T01:43:36,2023-11-01T01:45:09,00:01:33,COMPLETED,account_079,assocID_801,...,93,1,1,20G,,1,1,lac-398,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1"
6376305,99254112,user_466,group_096,2023-10-31T23:59:47,2023-11-01T01:43:36,2023-11-01T01:45:58,00:02:22,COMPLETED,account_079,assocID_801,...,142,1,1,20G,,1,1,lac-403,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1"
6376306,99254116,user_466,group_096,2023-10-31T23:59:52,2023-11-01T01:43:36,2023-11-01T01:44:50,00:01:14,COMPLETED,account_079,assocID_801,...,74,1,1,20G,,1,1,lac-426,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1"


In [45]:
slurm.columns

Index(['JobID', 'User', 'Group', 'Submit', 'Start', 'End', 'Elapsed', 'State',
       'Account', 'AssocID', 'Partition', 'Timelimit', 'UserCPU', 'SystemCPU',
       'TotalCPU', 'CPUTime', 'CPUTimeRAW', 'ReqCPUS', 'AllocCPUS', 'ReqMem',
       'MaxRSS', 'ReqNodes', 'NNodes', 'NodeList', 'ReqTRES', 'AllocTRES'],
      dtype='object')

## Identify ghostUsers

**Not having allocated memory makes sense, since a user can make a huge request that does not go through, resulting in a NaN allocation.
What I didn't understand was why there would be jobs with a NaN ReqTRES but a populated AllocTRES. I investigated further
and found that a single user ID is the cause of all of them. Users that are doing this: {'user_258'}.
Number of times AllocTRES was given to a NaN request: 369**

In [46]:
# Job ID alongside the resources requested (ReqTRES) and the resources allocated (AllocTRES) for each job
slurm[["JobID","ReqTRES","AllocTRES"]]

Unnamed: 0,JobID,ReqTRES,AllocTRES
0,31496544,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
1,31497932,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
2,31993628,"billing=23347,cpu=12,gres/gpu=8,mem=150G,node=1",
3,39087660,"billing=3112,cpu=640,mem=20G,node=10",
4,59062820,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
...,...,...,...
6376303,99254104,"billing=76790,cpu=1,mem=505202M,node=1","billing=76790,cpu=1,mem=505202M,node=1"
6376304,99254108,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1"
6376305,99254112,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1"
6376306,99254116,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1"


In [49]:
# How many rows have no ReqTRES recorded, and which distinct users own those rows
print("Count of NULL Resource requested =  NaN:", slurm["ReqTRES"].isnull().sum())
print("Users that are doing this:", set(slurm.loc[slurm["ReqTRES"].isnull(), "User"]))

Count of NULL Resource requested =  NaN: 4202408
Users that are doing this: {'user_258'}


Finding:
<!--  -->
**4,202,408 is the number of rows with a NaN ReqTRES, and all of them belong to a single user: user_258. That is about 66% of the SLURM data (4,202,408 of 6,376,308 rows) with no ReqTRES recorded.**

In [50]:
# function to identify users that have AllocTRES populating without ReqTRES: NaN
def identify_ghostUsers(data):
    """
    Print how many rows have a missing ReqTRES and which users own them.

    args:
        data (DataFrame) - frame with at least the 'ReqTRES' and 'User' columns

    returns:
        None - the two summary lines are written to stdout
    """
    # Build the null mask once and pull the matching users in a single lookup
    missing_request = data["ReqTRES"].isnull()
    ghost_users = data.loc[missing_request, "User"]

    print("Number of times allocationTRES was given to a NaN Request:", len(ghost_users))
    print("Users that are doing this:", set(ghost_users))


# testing function
identify_ghostUsers(slurm)

Number of times allocationTRES was given to a NaN Request: 4202408
Users that are doing this: {'user_258'}


In [52]:
# Scrape the entire dataset and get every job from that user,
# to 1) see how many there are in total and
# 2) maybe we can look at the timestamps to see if there is any repeatability.

# .copy() gives an independent frame, so later column assignments on
# jobs_by_user_258 do not trigger pandas' SettingWithCopyWarning.
jobs_by_user_258 = slurm[slurm['User']=="user_258"].copy()
jobs_by_user_258

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
202,87461884,user_258,group_010,2023-09-29T21:30:06,2023-09-29T21:30:06,2023-10-01T05:42:37,1-08:12:31,COMPLETED,account_011,assocID_575,...,1855216,16,16,,983172K,1,1,lac-084,,"cpu=16,gres/gpu=1,mem=8G,node=1"
203,87461884,user_258,group_010,2023-09-29T21:30:06,2023-09-29T21:30:06,2023-10-01T05:42:40,1-08:12:34,COMPLETED,account_011,assocID_575,...,7421056,64,64,,0,4,4,"lac-[084-086,193]",,"billing=4980,cpu=64,gres/gpu=4,mem=32G,node=4"
204,87461884,user_258,group_010,2023-09-30T05:08:57,2023-09-30T05:08:57,2023-10-01T05:42:40,1-00:33:43,COMPLETED,account_011,assocID_575,...,4244304,48,48,,4500K,3,3,"lac-[085-086,193]",,"cpu=48,gres/gpu=3,mem=24G,node=3"
206,87461892,user_258,group_010,2023-09-30T00:16:32,2023-09-30T00:16:32,2023-10-01T06:19:40,1-06:03:08,COMPLETED,account_011,assocID_575,...,1731008,16,16,,1076136K,1,1,lac-142,,"cpu=16,gres/gpu=1,mem=8G,node=1"
207,87461892,user_258,group_010,2023-09-30T00:16:32,2023-09-30T00:16:32,2023-10-01T06:19:40,1-06:03:08,COMPLETED,account_011,assocID_575,...,6924032,64,64,,0,4,4,"lac-[142,192,345,349]",,"billing=4980,cpu=64,gres/gpu=4,mem=32G,node=4"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6375732,99253828,user_258,group_010,2023-10-31T23:58:33,2023-10-31T23:58:33,2023-11-01T12:17:23,12:18:50,COMPLETED,account_017,assocID_749,...,88660,2,2,,0,2,2,"amr-[250,253]",,"billing=37355,cpu=2,mem=240G,node=2"
6375780,99254020,user_258,group_010,2023-10-31T23:59:11,2023-10-31T23:59:11,2023-11-01T12:03:08,12:03:57,COMPLETED,account_017,assocID_749,...,43437,1,1,,4873744K,1,1,amr-197,,"cpu=1,mem=120G,node=1"
6375781,99254020,user_258,group_010,2023-10-31T23:59:11,2023-10-31T23:59:11,2023-11-01T12:03:13,12:04:02,COMPLETED,account_017,assocID_749,...,86884,2,2,,0,2,2,"amr-[197,218]",,"billing=37355,cpu=2,mem=240G,node=2"
6375799,99254088,user_258,group_010,2023-10-31T23:59:39,2023-10-31T23:59:39,2023-11-01T11:41:15,11:41:36,COMPLETED,account_017,assocID_749,...,42096,1,1,,4040116K,1,1,amr-220,,"cpu=1,mem=120G,node=1"


In [55]:
# For every column of this user's jobs, report how many distinct non-null
# values it holds and show the distinct values themselves.
for col_name in jobs_by_user_258.columns:
    col_values = jobs_by_user_258[col_name]

    print(f"Column: {col_name}")
    print(f"Number of unique entries: {col_values.nunique()}")
    print(f"Unique values: {col_values.unique()}\n")

Column: JobID
Number of unique entries: 1396246
Unique values: [87461884 87461892 87461900 ... 99253828 99254020 99254088]

Column: User
Number of unique entries: 1
Unique values: ['user_258']

Column: Group
Number of unique entries: 1
Unique values: ['group_010']

Column: Submit
Number of unique entries: 281743
Unique values: ['2023-09-29T21:30:06' '2023-09-30T05:08:57' '2023-09-30T00:16:32' ...
 '2023-10-31T23:58:33' '2023-10-31T23:59:11' '2023-10-31T23:59:39']

Column: Start
Number of unique entries: 281743
Unique values: ['2023-09-29T21:30:06' '2023-09-30T05:08:57' '2023-09-30T00:16:32' ...
 '2023-10-31T23:58:33' '2023-10-31T23:59:11' '2023-10-31T23:59:39']

Column: End
Number of unique entries: 710874
Unique values: ['2023-10-01T05:42:37' '2023-10-01T05:42:40' '2023-10-01T06:19:40' ...
 '2023-11-01T12:17:20' '2023-11-01T12:03:08' '2023-11-01T12:03:13']

Column: Elapsed
Number of unique entries: 94328
Unique values: ['1-08:12:31' '1-08:12:34' '1-00:33:43' ... '20:51:58' '12:03:57'


Column: Partition
Number of unique entries: 1
Unique values: ['partitionGroup_050']

Column: Timelimit
Number of unique entries: 0
Unique values: [nan]

Column: UserCPU
Number of unique entries: 762450
Unique values: ['5-04:34:52' '00:00.004' '00:00.674' ... '17:58.333' '14:47.859'
 '13:39.558']

Column: SystemCPU
Number of unique entries: 360603
Unique values: ['04:15:15' '00:00.001' '00:03.317' ... '05:29.300' '11:42.826'
 '10:55.620']

Column: TotalCPU
Number of unique entries: 776981
Unique values: ['5-08:50:08' '00:00.006' '00:03.991' ... '22:56.790' '18:45.519'
 '17:31.694']

Column: CPUTime
Number of unique entries: 136305
Unique values: ['21-11:20:16' '85-21:24:16' '49-02:58:24' ... '1-18:23:48' '12:18:47'
 '12:03:57']

Column: CPUTimeRAW
Number of unique entries: 136305
Unique values: [1855216 7421056 4244304 ...  152628   44327   43437]

Column: ReqCPUS
Number of unique entries: 202
Unique values: [  16   64   48    2  128    1    4   20   32    3   50   25   60  180
  100   

In [57]:
# df["underutilizerCPUS"] = df['AllocCPUS'] - df['ReqCPUS']
# df["underutilizerNodes"] = df['NNodes'] - df['ReqNodes']
# df

In [58]:
def time_to_int(x):
    """
    Split a SLURM timestamp string into its numeric components.

    args:
        x (str) - timestamp laid out as 'YYYY-MM-DDTHH:MM:SS'; each field is
                  read from a fixed character position

    returns:
        list - [year, month, day, hour, minute, second] as integers

    Non-numeric text in any field (for example the 'Unknown' placeholder)
    makes int() raise ValueError.
    """
    # (start, stop) character offsets of each field, in output order
    field_bounds = [(0, 4), (5, 7), (8, 10), (11, 13), (14, 16), (17, 19)]

    return [int(x[start:stop]) for start, stop in field_bounds]

def elapsed_to_secs(x):
    """
    Converts a SLURM Elapsed value to a number of seconds

    args:
        x (str) - elapsed time as 'HH:MM:SS' or, for jobs that ran a day
                  or longer, 'D-HH:MM:SS' (e.g. '1-08:12:31')

    returns:
        seconds (int) - number of seconds elapsed since start of job

    raises:
        ValueError - if the hour, minute or second field is not numeric
    """
    # Jobs that run for a day or more carry a leading day count separated
    # by '-'; peel it off so the clock fields sit at fixed offsets again.
    day_part, dash, clock = x.partition("-")
    if dash and day_part.isdecimal():
        days = int(day_part)
    else:
        # No day prefix: the whole string is the clock part
        days, clock = 0, x

    hr = int(clock[0:2])
    mins = int(clock[3:5])
    sec = int(clock[6:8])

    return days*86400 + hr*3600 + mins*60 + sec

# change date columns types and extract time
jobs_by_user_258["Submit"] = jobs_by_user_258["Submit"]