# Can we predict how long a submitted job will sit in the queue? (And which characteristics of a job matter most in making it sit in the queue?)

- Response: Queue time = Start – Submit 

- Predictors:  cpu, memory, node (billing?) 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the first 1000 rows of the pipe-delimited Slurm usage export.
SLURM_CSV = "/mnt/research/CMSE495-SS24-ICER/slurm_usage/DID_FINAL_SLURM_OCT_2023.csv"
slurm = pd.read_csv(SLURM_CSV, sep="|", nrows=1000)


In [3]:
# Remove the two 'Unnamed' columns (presumably index columns left over from
# earlier CSV round-trips -- confirm against how the export was produced).
# errors='ignore' keeps this cell idempotent: re-running it once the columns
# are already gone no longer raises KeyError.
slurm = slurm.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
slurm.head()

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
0,31496544,user_679,group_121,2023-03-21T11:13:45,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
1,31497932,user_679,group_121,2023-03-21T11:31:18,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
2,31993628,user_105,group_114,2023-03-22T18:19:12,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_661,...,0,12,0,150G,,1,1,None assigned,"billing=23347,cpu=12,gres/gpu=8,mem=150G,node=1",
3,39087660,user_652,group_054,2023-04-04T13:09:10,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_557,...,0,640,0,20G,,10,10,None assigned,"billing=3112,cpu=640,mem=20G,node=10",
4,59062820,user_188,group_046,2023-05-08T09:58:20,2024-01-01T00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"


In [4]:
# Show the column labels of the loaded frame to see which fields are available.
slurm.columns

Index(['JobID', 'User', 'Group', 'Submit', 'Start', 'End', 'Elapsed', 'State',
       'Account', 'AssocID', 'Partition', 'Timelimit', 'UserCPU', 'SystemCPU',
       'TotalCPU', 'CPUTime', 'CPUTimeRAW', 'ReqCPUS', 'AllocCPUS', 'ReqMem',
       'MaxRSS', 'ReqNodes', 'NNodes', 'NodeList', 'ReqTRES', 'AllocTRES'],
      dtype='object')

# Preprocessing Data

In [25]:
# Keep only jobs that were not cancelled and that actually started, then parse
# the Submit and Start timestamps so a queue time can be computed from them.

# The State column holds upper-case values (e.g. 'PENDING', 'COMPLETED',
# 'FAILED'), so comparing against 'Cancelled' never matched anything. Match
# case-insensitively on the prefix; sacct can also report the state as
# "CANCELLED by <uid>", which the prefix match covers as well.
is_cancelled = slurm['State'].astype(str).str.upper().str.startswith('CANCELLED')

# A job that never started has Start == 'Unknown' (or a missing value).
has_start = slurm['Start'].notna() & (slurm['Start'] != 'Unknown')

# .copy() yields an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
slurm = slurm[~is_cancelled & has_start].copy()

slurm['Submit'] = pd.to_datetime(slurm['Submit'])
slurm['Start'] = pd.to_datetime(slurm['Start'])

In [26]:
# Queue time = Start - Submit, expressed in minutes.
queue_delta = slurm['Start'] - slurm['Submit']
slurm["QueueTime"] = queue_delta.dt.total_seconds().div(60)


In [27]:
# Drop jobs with no ReqMem value, then list the distinct ReqMem strings that
# remain so the unit suffixes in use can be inspected.
# (The earlier bare `set(slurm.ReqMem)` was removed: only the last expression
# of a cell is displayed, so its result was silently discarded.)
slurm = slurm.dropna(subset=['ReqMem'])
set(slurm.ReqMem)


{'160G',
 '16166464M',
 '18G',
 '192G',
 '1T',
 '200G',
 '2500G',
 '32G',
 '360G',
 '44G',
 '50G',
 '8083232M',
 '8G',
 '900G'}

# Feature Engineering

In [28]:
# convert ReqMem to a uniform measurement in MB ('K', 'M', 'G', 'T' suffixes)
def convert_memory(mem_str):
    """Convert a ReqMem string such as ``'8G'`` or ``'16166464M'`` to megabytes.

    Parameters
    ----------
    mem_str : str
        A number followed by an optional unit suffix ``K``, ``M``, ``G`` or
        ``T`` (case-insensitive, surrounding whitespace ignored). A value with
        no suffix is treated as already being in MB -- presumably matching
        Slurm's default unit; verify against the data source.

    Returns
    -------
    float
        The amount of memory in MB, using factors of 1000 between units.

    Raises
    ------
    ValueError
        If the value has an unrecognised suffix or a non-numeric magnitude.
        (Previously such values fell through and silently produced ``None``.)
    """
    # MB per unit. The decimal factors match the original conversion.
    # NOTE(review): Slurm may interpret these suffixes as multiples of 1024
    # rather than 1000 -- confirm before relying on absolute values.
    mb_per_unit = {'K': 0.001, 'M': 1.0, 'G': 1000.0, 'T': 1000000.0}

    text = str(mem_str).strip()
    suffix = text[-1:].upper()
    try:
        if suffix in mb_per_unit:
            return float(text[:-1]) * mb_per_unit[suffix]
        # No recognised suffix: accept a bare number as MB.
        return float(text)
    except ValueError:
        raise ValueError(f"Unrecognised ReqMem value: {mem_str!r}") from None
    
    

# Add a numeric column holding each job's requested memory in MB.
requested_mem = slurm['ReqMem']
slurm['ReqMem_MB'] = requested_mem.map(convert_memory)

In [29]:
slurm
# TODO: build an outlier detector for QueueTime.
# While looking at how long each job sits in the queue, we discovered a data
# anomaly: some users' jobs sat in the queue for a really long time even though
# their ReqTRES was not a valid reason for the job to wait that long.
# We talked to Dirk, and he suggested the reason may be that the user got
# blocked and later somehow got unblocked.

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,QueueTime,ReqMem_MB
4,59062820,user_188,group_046,2023-05-08 09:58:20,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",342180.616667,8000.0
5,59062828,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",342182.483333,8000.0
6,59062836,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,skl-030,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",342182.483333,8000.0
7,59062844,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",342180.600000,8000.0
8,59062852,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1",342180.600000,8000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,90363856,user_082,group_127,2023-09-25 14:33:58,2023-09-28 07:33:43,2023-10-02T09:48:30,4-02:14:47,FAILED,account_017,assocID_153,...,25,200G,,1,1,amr-204,"billing=31129,cpu=5,mem=200G,node=1","billing=31129,cpu=25,mem=200G,node=1",3899.750000,200000.0
988,90363924,user_082,group_127,2023-09-25 14:34:09,2023-09-28 07:39:31,2023-10-02T09:51:03,4-02:11:32,FAILED,account_017,assocID_153,...,25,200G,,1,1,acm-028,"billing=31129,cpu=5,mem=200G,node=1","billing=31129,cpu=25,mem=200G,node=1",3905.366667,200000.0
991,90363928,user_082,group_127,2023-09-25 14:34:17,2023-09-29 00:46:25,2023-10-02T09:48:19,3-09:01:54,FAILED,account_017,assocID_153,...,25,200G,,1,1,amr-221,"billing=31129,cpu=5,mem=200G,node=1","billing=31129,cpu=25,mem=200G,node=1",4932.133333,200000.0
994,90366424,user_082,group_127,2023-09-25 14:41:57,2023-09-29 00:54:50,2023-10-02T09:48:18,3-08:53:28,FAILED,account_017,assocID_153,...,25,200G,,1,1,amr-234,"billing=31129,cpu=5,mem=200G,node=1","billing=31129,cpu=25,mem=200G,node=1",4932.883333,200000.0


# Predictive modeling

**Linear Regression**

# Random Forest Model

In [30]:
# Features are the requested resources; the target is the queue time in minutes.
feature_cols = ['ReqCPUS','ReqMem_MB','ReqNodes']
X = slurm[feature_cols]
y = slurm['QueueTime']

# Hold out 20% of the jobs for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training data.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Score the held-out jobs and report the mean absolute error.
y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error", mae)

Mean Absolute Error 3456.9430797447767
