In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from datetime import timedelta

In [2]:
slurm = pd.read_csv("/mnt/research/CMSE495-SS24-ICER/slurm_usage/DID_FINAL_SLURM_OCT_2023.csv",delimiter="|", nrows=100000)


In [3]:
slurm = slurm.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
slurm.head()

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
0,31496544,user_679,group_121,2023-03-21T11:13:45,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
1,31497932,user_679,group_121,2023-03-21T11:31:18,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
2,31993628,user_105,group_114,2023-03-22T18:19:12,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_661,...,0,12,0,150G,,1,1,None assigned,"billing=23347,cpu=12,gres/gpu=8,mem=150G,node=1",
3,39087660,user_652,group_054,2023-04-04T13:09:10,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_557,...,0,640,0,20G,,10,10,None assigned,"billing=3112,cpu=640,mem=20G,node=10",
4,59062820,user_188,group_046,2023-05-08T09:58:20,2024-01-01T00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"


In [4]:
slurm.columns

Index(['JobID', 'User', 'Group', 'Submit', 'Start', 'End', 'Elapsed', 'State',
       'Account', 'AssocID', 'Partition', 'Timelimit', 'UserCPU', 'SystemCPU',
       'TotalCPU', 'CPUTime', 'CPUTimeRAW', 'ReqCPUS', 'AllocCPUS', 'ReqMem',
       'MaxRSS', 'ReqNodes', 'NNodes', 'NodeList', 'ReqTRES', 'AllocTRES'],
      dtype='object')

In [5]:
slurm.shape

(100000, 26)

# Preprocessing Data

In [6]:
# # convert Submit and Start to datetime, and filter rows with 'Unknown' start

# # Filter out rows where 'State' is "Cancelled" or Unknown
slurm = slurm[slurm['State'] != 'Cancelled']
slurm = slurm[slurm['Start']!= 'Unknown']

slurm['Submit'] = pd.to_datetime(slurm['Submit'])
slurm['Start'] = pd.to_datetime(slurm['Start'])

slurm.head(5)

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
4,59062820,user_188,group_046,2023-05-08 09:58:20,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
5,59062828,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,240,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
6,59062836,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,240,40,40,8G,,1,1,skl-030,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
7,59062844,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
8,59062852,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"


In [7]:
# remove user_258 and clean jobs data
def AggSLURMDat(dat):
    '''
    Aggregates all submitted jobs together, removing all batch/extern 
    jobs and including said information into a single job. Excludes
    jobs that do not have a clear '.batch' and '.extern' files

    args:

        dat - the slurm dataset 
    
    returns:

        out_df - the aggregated version of the slurm dataset
    '''
    
    job_list = dat["JobID"].value_counts().index

    out_df = pd.DataFrame(columns=dat.keys())

    for job in job_list:

        jdat = dat[dat["JobID"] == job]

        cpu_time_list = jdat["CPUTimeRAW"].value_counts()
        cpu_time_list = cpu_time_list[cpu_time_list == 2].index

        for cpu_time in cpu_time_list:

            ajob = jdat[jdat["CPUTimeRAW"] == cpu_time]

            batch_job = ajob[ajob["User"] == "user_258"]

            ag_job = ajob[ajob["User"] != "user_258"]

            if len(ag_job["User"]) == 0:
#                 print("Weird Job",ajob["JobID"])
#                 print("No aggregate job")
                continue
            
            if len(ag_job["User"]) == 2:
#                 print("Weird Job",ajob["JobID"])
#                 print("2 copies of aggregate job")
                continue

            assert len(ag_job["User"]) == 1, "New edge case discovered!"

            ag_job.loc[ag_job.index[0],"MaxRSS"] = batch_job["MaxRSS"].values[0]

            out_df = pd.concat([out_df,ag_job])

    return out_df

In [8]:
# test
slurm_cleaned = AggSLURMDat(slurm)
slurm_cleaned

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
75766,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:11:08,00:02:38,COMPLETED,account_017,assocID_554,...,158,1,1,10G,348380K,1,1,lac-351,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
75880,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:09:18,00:00:48,FAILED,account_017,assocID_554,...,48,1,1,10G,8156K,1,1,lac-367,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
77026,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:10:33,00:02:02,FAILED,account_017,assocID_554,...,122,1,1,10G,344968K,1,1,lac-295,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
77281,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:11:31,00:03:00,COMPLETED,account_017,assocID_554,...,180,1,1,10G,489700K,1,1,lac-299,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
76963,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:09:36,00:01:05,FAILED,account_017,assocID_554,...,65,1,1,10G,270400K,1,1,lac-294,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29569,91740004,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:32,2023-10-01T00:46:35,00:07:03,COMPLETED,account_030,assocID_561,...,1692,4,4,8G,531208K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29572,91740008,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:38,2023-10-01T00:46:42,00:07:04,COMPLETED,account_030,assocID_561,...,1696,4,4,8G,528908K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29575,91740012,user_128,group_062,2023-10-01 00:38:00,2023-10-01 00:39:43,2023-10-01T00:45:59,00:06:16,COMPLETED,account_030,assocID_561,...,1504,4,4,8G,490392K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29530,91739952,user_128,group_062,2023-10-01 00:37:49,2023-10-01 00:38:49,2023-10-01T00:50:17,00:11:28,COMPLETED,account_030,assocID_561,...,2752,4,4,8G,747872K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"


 # Feature Engineering of Cleaned SLURM 
 - After Mapping user258 to find underutilization in Memery
 
 1. Convert memory units to all be in MB columns:ReqMem, MaxRSS
 2. Convert Timelimit and Elapsed columns from object to datetime

In [9]:
slurm_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15689 entries, 75766 to 29500
Data columns (total 26 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   JobID       15689 non-null  object        
 1   User        15689 non-null  object        
 2   Group       15689 non-null  object        
 3   Submit      15689 non-null  datetime64[ns]
 4   Start       15689 non-null  datetime64[ns]
 5   End         15689 non-null  object        
 6   Elapsed     15689 non-null  object        
 7   State       15689 non-null  object        
 8   Account     15689 non-null  object        
 9   AssocID     15689 non-null  object        
 10  Partition   15689 non-null  object        
 11  Timelimit   15689 non-null  object        
 12  UserCPU     15689 non-null  object        
 13  SystemCPU   15689 non-null  object        
 14  TotalCPU    15689 non-null  object        
 15  CPUTime     15689 non-null  object        
 16  CPUTimeRAW  15689 

In [10]:
# remove na rows in MaxRSS after cleaning data
slurm_cleaned= slurm_cleaned.dropna(subset=['MaxRSS'])
slurm_cleaned

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
75766,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:11:08,00:02:38,COMPLETED,account_017,assocID_554,...,158,1,1,10G,348380K,1,1,lac-351,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
75880,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:09:18,00:00:48,FAILED,account_017,assocID_554,...,48,1,1,10G,8156K,1,1,lac-367,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
77026,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:10:33,00:02:02,FAILED,account_017,assocID_554,...,122,1,1,10G,344968K,1,1,lac-295,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
77281,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:11:31,00:03:00,COMPLETED,account_017,assocID_554,...,180,1,1,10G,489700K,1,1,lac-299,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
76963,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:09:36,00:01:05,FAILED,account_017,assocID_554,...,65,1,1,10G,270400K,1,1,lac-294,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29569,91740004,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:32,2023-10-01T00:46:35,00:07:03,COMPLETED,account_030,assocID_561,...,1692,4,4,8G,531208K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29572,91740008,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:38,2023-10-01T00:46:42,00:07:04,COMPLETED,account_030,assocID_561,...,1696,4,4,8G,528908K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29575,91740012,user_128,group_062,2023-10-01 00:38:00,2023-10-01 00:39:43,2023-10-01T00:45:59,00:06:16,COMPLETED,account_030,assocID_561,...,1504,4,4,8G,490392K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29530,91739952,user_128,group_062,2023-10-01 00:37:49,2023-10-01 00:38:49,2023-10-01T00:50:17,00:11:28,COMPLETED,account_030,assocID_561,...,2752,4,4,8G,747872K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"


In [11]:
# no null values
# slurm_cleaned.info()

print(set(slurm_cleaned.ReqMem))
print(set(slurm_cleaned.MaxRSS))

{'5G', '50000M', '48000M', '20G', '300G', '16192M', '44G', '30G', '480G', '4500M', '96000M', '40G', '70G', '100G', '500G', '64G', '2.50G', '110G', '16G', '490G', '1T', '3000M', '2T', '24G', '128G', '512G', '1200G', '2500G', '8000M', '4G', '505202M', '285G', '900G', '2016G', '750M', '60G', '192G', '50G', '164G', '36G', '18G', '320G', '512M', '75G', '800G', '160G', '96G', '10G', '120G', '8G', '384G', '750G', '28G', '80G', '200G', '600G', '2G', '32G'}
{'2478648K', '1553816K', '1279692K', '21759916K', '359284K', '99324K', '23857216K', '1278768K', '1279252K', '3614608K', '1278728K', '380940K', '23532700K', '1274136K', '23971016K', '395596K', '6560K', '1272740K', '1271932K', '476256K', '780924K', '393744K', '1279080K', '93680K', '780584K', '56440K', '27004K', '93300K', '87792K', '1267060K', '1638980K', '780780K', '1340352K', '2338620K', '24043788K', '1194360K', '23729840K', '416548K', '2165352K', '2047764K', '1244M', '61628K', '92452K', '395016K', '115188K', '65423544K', '4231552K', '23716K'

In [12]:
# # convert ReqMem to a uniform measurement (assuming 'M' for MB and 'G' for GB and 'K' for KB)
def convert_memory(mem_str):
    '''
    Convert memory units to MegaBytes unit float.
    '''
    if mem_str.endswith('M'):
        return float(mem_str[:-1]) # remove 'M' and convert to float
    elif mem_str.endswith('K'):
        return float(mem_str[:-1]) / 1000
    elif mem_str.endswith('G'):
        return float(mem_str[:-1]) * 1e3  # convert MB to KB
    elif mem_str.endswith('T'):
        return float(mem_str[:-1]) * 1e6 # convert MB to T
    
    
# Correcting the warning by using .loc to assign new values
slurm_cleaned.loc[:, 'ReqMem_MB'] = slurm_cleaned['ReqMem'].apply(convert_memory).values
slurm_cleaned.loc[:, 'MaxRSS_MB'] = slurm_cleaned['MaxRSS'].apply(convert_memory).values

slurm_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slurm_cleaned.loc[:, 'ReqMem_MB'] = slurm_cleaned['ReqMem'].apply(convert_memory).values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slurm_cleaned.loc[:, 'MaxRSS_MB'] = slurm_cleaned['MaxRSS'].apply(convert_memory).values


Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,ReqMem_MB,MaxRSS_MB
75766,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:11:08,00:02:38,COMPLETED,account_017,assocID_554,...,1,10G,348380K,1,1,lac-351,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,348.380
75880,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:09:18,00:00:48,FAILED,account_017,assocID_554,...,1,10G,8156K,1,1,lac-367,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,8.156
77026,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:10:33,00:02:02,FAILED,account_017,assocID_554,...,1,10G,344968K,1,1,lac-295,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,344.968
77281,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:11:31,00:03:00,COMPLETED,account_017,assocID_554,...,1,10G,489700K,1,1,lac-299,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,489.700
76963,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:09:36,00:01:05,FAILED,account_017,assocID_554,...,1,10G,270400K,1,1,lac-294,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,270.400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29569,91740004,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:32,2023-10-01T00:46:35,00:07:03,COMPLETED,account_030,assocID_561,...,4,8G,531208K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,531.208
29572,91740008,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:38,2023-10-01T00:46:42,00:07:04,COMPLETED,account_030,assocID_561,...,4,8G,528908K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,528.908
29575,91740012,user_128,group_062,2023-10-01 00:38:00,2023-10-01 00:39:43,2023-10-01T00:45:59,00:06:16,COMPLETED,account_030,assocID_561,...,4,8G,490392K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,490.392
29530,91739952,user_128,group_062,2023-10-01 00:37:49,2023-10-01 00:38:49,2023-10-01T00:50:17,00:11:28,COMPLETED,account_030,assocID_561,...,4,8G,747872K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,747.872


In [13]:
# Function to parse time data columns: TimeLimit and Elapsed
def parse_time_string(time_str):
    """Convert a time string into a timedelta object."""
    days = 0
    if '-' in time_str:
        days, time_str = time_str.split('-')
        days = int(days)

    parts = time_str.split(':')
    hours, minutes, seconds = map(int, parts) if len(parts) == 3 else (int(parts[0]), int(parts[1]), 0)
    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)


# # Convert Timelimit and Elapsed to timedeltas
slurm_cleaned.loc[:, 'Timelimit'] = slurm_cleaned['Timelimit'].apply(parse_time_string)
slurm_cleaned.loc[:, 'Elapsed'] = slurm_cleaned['Elapsed'].apply(parse_time_string)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slurm_cleaned.loc[:, 'Timelimit'] = slurm_cleaned['Timelimit'].apply(parse_time_string)
  slurm_cleaned.loc[:, 'Timelimit'] = slurm_cleaned['Timelimit'].apply(parse_time_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  slurm_cleaned.loc[:, 'Elapsed'] = slurm_cleaned['Elapsed'].apply(parse_time_string)
  slurm_cleaned.loc[:, 'Elapsed'] = slurm_cleaned['Elapsed'].apply(parse_time_string)


In [14]:
slurm_cleaned.head()

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,ReqMem_MB,MaxRSS_MB
75766,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:11:08,0 days 00:02:38,COMPLETED,account_017,assocID_554,...,1,10G,348380K,1,1,lac-351,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,348.38
75880,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:09:18,0 days 00:00:48,FAILED,account_017,assocID_554,...,1,10G,8156K,1,1,lac-367,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,8.156
77026,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:10:33,0 days 00:02:02,FAILED,account_017,assocID_554,...,1,10G,344968K,1,1,lac-295,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,344.968
77281,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:11:31,0 days 00:03:00,COMPLETED,account_017,assocID_554,...,1,10G,489700K,1,1,lac-299,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,489.7
76963,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:09:36,0 days 00:01:05,FAILED,account_017,assocID_554,...,1,10G,270400K,1,1,lac-294,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,270.4


# Function to list users that underutilize CPU, Time, and Nodes 

In [23]:
# The higher the number is the more the user underutilizes

def FindUnterutilizerSLURM(data, time_threshold, cpu_threshold, nodes_threshold, memory_threshold_percent):
    """
    Identifies SLURM job records that underutilize allocated resources compared to requested ones.
    Each resource (time, CPU, nodes, memory) is evaluated against its threshold; jobs that exceed
    any threshold are included in the output. The result is a DataFrame listing underutilizing jobs
    along with the amount by which they underutilize each resource.

    Parameters:
    :param data: DataFrame with SLURM job records.
    :param time_threshold: Jobs underutilizing time beyond this threshold (hours) are flagged.
    :param cpu_threshold: Jobs underutilizing CPUs beyond this threshold are flagged.
    :param nodes_threshold: Jobs underutilizing nodes beyond this threshold are flagged.
    :param memory_threshold: Jobs underutilizing memory beyond this threshold (MB) are flagged.
    
    Returns:
    DataFrame with columns: 'User', 'JobID', 'Group', 'State', 'Account', 'UnderUtilizeCPU', 
    'UnderUtilizeNodes', 'UnderUtilizeTime (Hours)', 'UnderUtilizeMemory (MB)'.
    """
    
    # Underutilized resources calculation
    data['UnderUtilizeCPU'] = data.ReqCPUS - data.AllocCPUS    
    data['UnderUtilizeNodes'] = data.ReqNodes - data.NNodes  
    data['UnderUtilizeTime (Hours)'] = (data['Timelimit'] - data['Elapsed']) / pd.Timedelta(hours=1)   # Convert UnderUtilizeTime to hours for comparison
    data['UnderUtilizeMemory (MB)'] = 1 - (data['MaxRSS_MB'] / data['ReqMem_MB']) 
    
    # Conditions for underutilization
    data['UnderUtilizedTime'] = data['UnderUtilizeTime (Hours)'] > time_threshold
    data['UnderUtilizedCPU'] = data['UnderUtilizeCPU'] > cpu_threshold
    data['UnderUtilizedNodes'] = data['UnderUtilizeNodes'] > nodes_threshold
    data = data[data['UnderUtilizeMemory (MB)'] > memory_threshold_percent]
    
    # Filter rows where any underutilization condition is met
    underutilized = data[
        data['UnderUtilizedTime'] |
        data['UnderUtilizedCPU'] |
        data['UnderUtilizedNodes'] |
        data['UnderUtilizeMemory (MB)']
        ]
    
    return underutilized[['User','JobID','Group','State','Account','UnderUtilizeCPU','UnderUtilizeNodes',
                          'UnderUtilizeTime (Hours)','UnderUtilizeMemory (MB)']]


In [26]:
# test on 1000 rows
FindUnterutilizerSLURM(slurm_cleaned,7,0,0,0.95)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['UnderUtilizeCPU'] = data.ReqCPUS - data.AllocCPUS
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['UnderUtilizeNodes'] = data.ReqNodes - data.NNodes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['UnderUtilizeTime (Hours)'] = (data['Timelimit'] - data['Elapsed']) / pd.Timedelta(hours

Unnamed: 0,User,JobID,Group,State,Account,UnderUtilizeCPU,UnderUtilizeNodes,UnderUtilizeTime (Hours),UnderUtilizeMemory (MB)
75766,user_316,91806820,group_009,COMPLETED,account_017,0,0,0.272778,0.965162
75880,user_316,91806820,group_009,FAILED,account_017,0,0,0.303333,0.999184
77026,user_316,91806820,group_009,FAILED,account_017,0,0,0.282778,0.965503
77281,user_316,91806820,group_009,COMPLETED,account_017,0,0,0.266667,0.951030
76963,user_316,91806820,group_009,FAILED,account_017,0,0,0.298611,0.972960
...,...,...,...,...,...,...,...,...,...
29085,user_128,91739060,group_062,COMPLETED,account_030,0,0,23.967500,0.963855
29088,user_128,91739064,group_062,COMPLETED,account_030,0,0,23.966667,0.964559
29094,user_128,91739072,group_062,COMPLETED,account_030,0,0,23.966667,0.964011
29100,user_128,91739080,group_062,COMPLETED,account_030,0,0,23.966944,0.964412


In [None]:
# remove warnings, add table