In [75]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from datetime import timedelta

In [57]:
slurm = pd.read_csv("/mnt/research/CMSE495-SS24-ICER/slurm_usage/DID_FINAL_SLURM_OCT_2023.csv",delimiter="|",nrows=1e7)


In [58]:
slurm = slurm.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
slurm.head()

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
0,31496544,user_679,group_121,2023-03-21T11:13:45,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
1,31497932,user_679,group_121,2023-03-21T11:31:18,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
2,31993628,user_105,group_114,2023-03-22T18:19:12,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_661,...,0,12,0,150G,,1,1,None assigned,"billing=23347,cpu=12,gres/gpu=8,mem=150G,node=1",
3,39087660,user_652,group_054,2023-04-04T13:09:10,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_557,...,0,640,0,20G,,10,10,None assigned,"billing=3112,cpu=640,mem=20G,node=10",
4,59062820,user_188,group_046,2023-05-08T09:58:20,2024-01-01T00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"


In [59]:
slurm.columns

Index(['JobID', 'User', 'Group', 'Submit', 'Start', 'End', 'Elapsed', 'State',
       'Account', 'AssocID', 'Partition', 'Timelimit', 'UserCPU', 'SystemCPU',
       'TotalCPU', 'CPUTime', 'CPUTimeRAW', 'ReqCPUS', 'AllocCPUS', 'ReqMem',
       'MaxRSS', 'ReqNodes', 'NNodes', 'NodeList', 'ReqTRES', 'AllocTRES'],
      dtype='object')

# Preprocessing Data

In [74]:
# # convert Submit and Start to datetime, and filter rows with 'Unknown' start

# # Filter out rows where 'State' is "Cancelled" or Unknown
slurm = slurm[slurm['State'] != 'Cancelled']
slurm = slurm[slurm['Start']!= 'Unknown']
slurm = slurm.dropna(subset=['Start'])
slurm = slurm.dropna(subset=['Timelimit'])


slurm['Submit'] = pd.to_datetime(slurm['Submit'])
slurm['Start'] = pd.to_datetime(slurm['Start'])


# Dropping user_258 (job batches)
# slurm = slurm[slurm['User']!='user_258'] 
# user 258 includes information about the used memery column (MaxRSS) which means we can't drop user258

slurm.head(500)

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,UnderUtilizeCPU,UnderUtilizeNodes
4,59062820,user_188,group_046,2023-05-08 09:58:20,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0
5,59062828,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0
6,59062836,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,skl-030,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0
7,59062844,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0
8,59062852,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 12:59:42,2023-10-01T13:03:43,00:04:01,COMPLETED,account_002,assocID_790,...,20,160G,,1,1,nvl-003,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1",0,0
1084,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 13:04:39,2023-10-01T13:12:07,00:07:28,COMPLETED,account_002,assocID_790,...,20,160G,,1,1,nvl-002,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1",0,0
1087,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 13:08:44,2023-10-01T13:11:56,00:03:12,COMPLETED,account_002,assocID_790,...,20,160G,,1,1,nvl-004,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1",0,0
1090,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 13:08:45,2023-10-01T13:12:36,00:03:51,COMPLETED,account_002,assocID_790,...,20,160G,,1,1,nvl-003,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1",0,0


In [61]:
# observing different units in Memory
print(set(slurm['ReqMem']))
print(set(slurm['MaxRSS']))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Feature Engineering After Mapping user258 to find underutilization in Memery

In [None]:
slurm = slurm.dropna(subset=['ReqMem'])     # Dropping nan ReqMem as they all belong to user_258

# convert ReqMem to a uniform measurement (assuming 'M' for MB and 'G' for GB and 'K' for KB)
def convert_memory(mem_str):
    if mem_str.endswith('K'):
        return float(mem_str[:-1]) # remove 'K' and convert to float
    elif mem_str.endswith('M'):
        return float(mem_str[:-1]) * 1000
    elif mem_str.endswith('G'):
        return float(mem_str[:-1]) * 1000000  # convert GB to KB
    elif mem_str.endswith('T'):
        return float(mem_str[:-1]) * 1000000000 # convert Tb to KB
    
    

slurm['ReqMem_KB'] = slurm['ReqMem'].apply(convert_memory)

In [87]:
def FindUnterutilizerSLURM(data, thresholds):
    """
    Identifies users who are requesting more resources than they use.

    :param data: DataFrame containing SLURM job records.
    :param thresholds: Dictionary with keys 'cpu' and 'mem' for utilization thresholds.
    :return: List of underutilizing users.
    """
    # Function to parse timedata columns: TimeLimit and Elapsed
    def parse_time_string(time_str):
        """Convert a time string into a timedelta object."""
        days = 0
        if '-' in time_str:
            days, time_str = time_str.split('-')
            days = int(days)
    
        hours, minutes, seconds = map(int, time_str.split(':'))
        return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

    slurm['Timelimit_td'] = slurm['Timelimit'].apply(parse_time_string)
    slurm['Elapsed_td'] = slurm['Elapsed'].apply(parse_time_string)

    
    
    data['UnderUtilizeCPU'] = data.ReqCPUS - data.AllocCPUS    # Subtract AllocCPUS fromReqCPUS
    data['UnderUtilizeNodes'] = data.ReqNodes - data.NNodes   # Subtract NNodes from ReqNodes
    data['UnderUtilizeTime'] = data['Timelimit_td'] - slurm['Elapsed_td']   # Subtract Elapsed from Timelimit

In [89]:
slurm

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,UnderUtilizeCPU,UnderUtilizeNodes,Timelimit_td,Elapsed_td,UnderUtilizeTime
4,59062820,user_188,group_046,2023-05-08 09:58:20,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09,1 days 23:59:51
5,59062828,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:06,1 days 23:59:54
6,59062836,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,1,1,skl-030,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:06,1 days 23:59:54
7,59062844,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09,1 days 23:59:51
8,59062852,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09,1 days 23:59:51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6376303,99254104,user_178,group_154,2023-10-31 23:59:41,2023-11-01 02:51:56,2023-11-01T02:53:44,00:01:48,COMPLETED,account_012,assocID_393,...,1,1,amr-004,"billing=76790,cpu=1,mem=505202M,node=1","billing=76790,cpu=1,mem=505202M,node=1",0,0,0 days 01:00:00,0 days 00:01:48,0 days 00:58:12
6376304,99254108,user_466,group_096,2023-10-31 23:59:45,2023-11-01 01:43:36,2023-11-01T01:45:09,00:01:33,COMPLETED,account_079,assocID_801,...,1,1,lac-398,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:01:33,0 days 00:08:27
6376305,99254112,user_466,group_096,2023-10-31 23:59:47,2023-11-01 01:43:36,2023-11-01T01:45:58,00:02:22,COMPLETED,account_079,assocID_801,...,1,1,lac-403,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:02:22,0 days 00:07:38
6376306,99254116,user_466,group_096,2023-10-31 23:59:52,2023-11-01 01:43:36,2023-11-01T01:44:50,00:01:14,COMPLETED,account_079,assocID_801,...,1,1,lac-426,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:01:14,0 days 00:08:46


In [85]:
# The higher the number is the more the user underutilizes

slurm['UnderUtilizeCPU'] = slurm.ReqCPUS - slurm.AllocCPUS
slurm['UnderUtilizeNodes'] = slurm.ReqNodes - slurm.NNodes

In [67]:
# set(slurm.UnderUtilizeCPU)
# set(slurm.UnderUtilizeNodes)

In [81]:
# set(slurm.Timelimit)

In [82]:
def parse_time_string(time_str):
    """Convert a time string into a timedelta object."""
    days = 0
    if '-' in time_str:
        days, time_str = time_str.split('-')
        days = int(days)
    
    hours, minutes, seconds = map(int, time_str.split(':'))
    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

slurm['Timelimit_td'] = slurm['Timelimit'].apply(parse_time_string)
slurm['Elapsed_td'] = slurm['Elapsed'].apply(parse_time_string)
slurm

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,UnderUtilizeCPU,UnderUtilizeNodes,Timelimit_td,Elapsed_td
4,59062820,user_188,group_046,2023-05-08 09:58:20,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09
5,59062828,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:06
6,59062836,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,,1,1,skl-030,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:06
7,59062844,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09
8,59062852,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6376303,99254104,user_178,group_154,2023-10-31 23:59:41,2023-11-01 02:51:56,2023-11-01T02:53:44,00:01:48,COMPLETED,account_012,assocID_393,...,,1,1,amr-004,"billing=76790,cpu=1,mem=505202M,node=1","billing=76790,cpu=1,mem=505202M,node=1",0,0,0 days 01:00:00,0 days 00:01:48
6376304,99254108,user_466,group_096,2023-10-31 23:59:45,2023-11-01 01:43:36,2023-11-01T01:45:09,00:01:33,COMPLETED,account_079,assocID_801,...,,1,1,lac-398,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:01:33
6376305,99254112,user_466,group_096,2023-10-31 23:59:47,2023-11-01 01:43:36,2023-11-01T01:45:58,00:02:22,COMPLETED,account_079,assocID_801,...,,1,1,lac-403,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:02:22
6376306,99254116,user_466,group_096,2023-10-31 23:59:52,2023-11-01 01:43:36,2023-11-01T01:44:50,00:01:14,COMPLETED,account_079,assocID_801,...,,1,1,lac-426,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:01:14


In [83]:
# Subtract Elapsed from Timelimit
slurm['UnderUtilizeTime'] = slurm['Timelimit_td'] - slurm['Elapsed_td']

In [84]:
slurm

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,UnderUtilizeCPU,UnderUtilizeNodes,Timelimit_td,Elapsed_td,UnderUtilizeTime
4,59062820,user_188,group_046,2023-05-08 09:58:20,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09,1 days 23:59:51
5,59062828,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:06,1 days 23:59:54
6,59062836,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,1,1,skl-030,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:06,1 days 23:59:54
7,59062844,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09,1 days 23:59:51
8,59062852,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",0,0,2 days 00:00:00,0 days 00:00:09,1 days 23:59:51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6376303,99254104,user_178,group_154,2023-10-31 23:59:41,2023-11-01 02:51:56,2023-11-01T02:53:44,00:01:48,COMPLETED,account_012,assocID_393,...,1,1,amr-004,"billing=76790,cpu=1,mem=505202M,node=1","billing=76790,cpu=1,mem=505202M,node=1",0,0,0 days 01:00:00,0 days 00:01:48,0 days 00:58:12
6376304,99254108,user_466,group_096,2023-10-31 23:59:45,2023-11-01 01:43:36,2023-11-01T01:45:09,00:01:33,COMPLETED,account_079,assocID_801,...,1,1,lac-398,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:01:33,0 days 00:08:27
6376305,99254112,user_466,group_096,2023-10-31 23:59:47,2023-11-01 01:43:36,2023-11-01T01:45:58,00:02:22,COMPLETED,account_079,assocID_801,...,1,1,lac-403,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:02:22,0 days 00:07:38
6376306,99254116,user_466,group_096,2023-10-31 23:59:52,2023-11-01 01:43:36,2023-11-01T01:44:50,00:01:14,COMPLETED,account_079,assocID_801,...,1,1,lac-426,"billing=3112,cpu=1,mem=20G,node=1","billing=3112,cpu=1,mem=20G,node=1",0,0,0 days 00:10:00,0 days 00:01:14,0 days 00:08:46
