In [90]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from datetime import timedelta

In [91]:
slurm = pd.read_csv("/mnt/research/CMSE495-SS24-ICER/slurm_usage/DID_FINAL_SLURM_OCT_2023.csv",delimiter="|",nrows=1e7)


In [92]:
slurm = slurm.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
slurm.head()

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
0,31496544,user_679,group_121,2023-03-21T11:13:45,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
1,31497932,user_679,group_121,2023-03-21T11:31:18,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_489,...,0,28,0,21000M,,1,1,None assigned,"billing=3192,cpu=28,gres/gpu=4,mem=21000M,node=1",
2,31993628,user_105,group_114,2023-03-22T18:19:12,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_661,...,0,12,0,150G,,1,1,None assigned,"billing=23347,cpu=12,gres/gpu=8,mem=150G,node=1",
3,39087660,user_652,group_054,2023-04-04T13:09:10,Unknown,Unknown,00:00:00,PENDING,account_017,assocID_557,...,0,640,0,20G,,10,10,None assigned,"billing=3112,cpu=640,mem=20G,node=10",
4,59062820,user_188,group_046,2023-05-08T09:58:20,2024-01-01T00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"


In [93]:
slurm.columns

Index(['JobID', 'User', 'Group', 'Submit', 'Start', 'End', 'Elapsed', 'State',
       'Account', 'AssocID', 'Partition', 'Timelimit', 'UserCPU', 'SystemCPU',
       'TotalCPU', 'CPUTime', 'CPUTimeRAW', 'ReqCPUS', 'AllocCPUS', 'ReqMem',
       'MaxRSS', 'ReqNodes', 'NNodes', 'NodeList', 'ReqTRES', 'AllocTRES'],
      dtype='object')

# Preprocessing Data

In [94]:
# # convert Submit and Start to datetime, and filter rows with 'Unknown' start

# # Filter out rows where 'State' is "Cancelled" or Unknown
slurm = slurm[slurm['State'] != 'Cancelled']
slurm = slurm[slurm['Start']!= 'Unknown']
slurm = slurm.dropna(subset=['Start'])
slurm = slurm.dropna(subset=['Timelimit'])


slurm['Submit'] = pd.to_datetime(slurm['Submit'])
slurm['Start'] = pd.to_datetime(slurm['Start'])


# Dropping user_258 (job batches)
# slurm = slurm[slurm['User']!='user_258'] 
# user 258 includes information about the used memery column (MaxRSS) which means we can't drop user258

slurm.head(500)

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
4,59062820,user_188,group_046,2023-05-08 09:58:20,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
5,59062828,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,240,40,40,8G,,1,1,skl-029,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
6,59062836,user_188,group_046,2023-05-08 09:58:20,2024-01-01 01:00:49,2024-01-01T01:00:55,00:00:06,COMPLETED,account_017,assocID_676,...,240,40,40,8G,,1,1,skl-030,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
7,59062844,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
8,59062852,user_188,group_046,2023-05-08 09:58:21,2024-01-01 00:58:57,2024-01-01T00:59:06,00:00:09,COMPLETED,account_017,assocID_676,...,360,40,40,8G,,1,1,amr-207,"billing=1245,cpu=40,mem=8G,node=1","billing=1245,cpu=40,mem=8G,node=1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 12:59:42,2023-10-01T13:03:43,00:04:01,COMPLETED,account_002,assocID_790,...,4820,20,20,160G,,1,1,nvl-003,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1"
1084,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 13:04:39,2023-10-01T13:12:07,00:07:28,COMPLETED,account_002,assocID_790,...,8960,20,20,160G,,1,1,nvl-002,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1"
1087,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 13:08:44,2023-10-01T13:11:56,00:03:12,COMPLETED,account_002,assocID_790,...,3840,20,20,160G,,1,1,nvl-004,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1"
1090,90397252,user_606,group_000,2023-09-25 17:30:39,2023-10-01 13:08:45,2023-10-01T13:12:36,00:03:51,COMPLETED,account_002,assocID_790,...,4620,20,20,160G,,1,1,nvl-003,"billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1","billing=24903,cpu=20,gres/gpu=1,mem=160G,node=1"


In [61]:
# observing different units in Memory
print(set(slurm['ReqMem']))
print(set(slurm['MaxRSS']))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Feature Engineering After Mapping user258 to find underutilization in Memery

In [None]:
# slurm = slurm.dropna(subset=['ReqMem'])     # Dropping nan ReqMem as they all belong to user_258

# # convert ReqMem to a uniform measurement (assuming 'M' for MB and 'G' for GB and 'K' for KB)
# def convert_memory(mem_str):
#     if mem_str.endswith('K'):
#         return float(mem_str[:-1]) # remove 'K' and convert to float
#     elif mem_str.endswith('M'):
#         return float(mem_str[:-1]) * 1000
#     elif mem_str.endswith('G'):
#         return float(mem_str[:-1]) * 1000000  # convert GB to KB
#     elif mem_str.endswith('T'):
#         return float(mem_str[:-1]) * 1000000000 # convert Tb to KB
    
    

# slurm['ReqMem_KB'] = slurm['ReqMem'].apply(convert_memory)

# Function to list users that underutilize CPU, Time, and Nodes 

In [120]:
# The higher the number is the more the user underutilizes

def FindUnterutilizerSLURM(data, time_threshold, cpu_threshold, nodes_threshold):
    """
    Identifies users who are requesting more resources than they use.

    :param data: DataFrame containing SLURM job records.
    :param thresholds: Dictionary with keys 'cpu' and 'mem' for utilization thresholds.
    :return: DataFrame of underutilizing users and their job details.
    """
    
    # Function to parse time data columns: TimeLimit and Elapsed
    def parse_time_string(time_str):
        """Convert a time string into a timedelta object."""
        days = 0
        if '-' in time_str:
            days, time_str = time_str.split('-')
            days = int(days)
        
        parts = time_str.split(':')
        hours, minutes, seconds = map(int, parts) if len(parts) == 3 else (int(parts[0]), int(parts[1]), 0)
        return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

    # # Convert Timelimit and Elapsed to timedeltas
    slurm['Timelimit_td'] = slurm['Timelimit'].apply(parse_time_string)
    slurm['Elapsed_td'] = slurm['Elapsed'].apply(parse_time_string)

    
    # Underutilized columns calculation 
    data['UnderUtilizeCPU'] = data.ReqCPUS - data.AllocCPUS    
    data['UnderUtilizeNodes'] = data.ReqNodes - data.NNodes  
    data['UnderUtilizeTime'] = (data['Timelimit_td'] - data['Elapsed_td']) / pd.Timedelta(hours=1)   # Convert UnderUtilizeTime to hours for comparison
    
    # Filter based on the specific thresholds
    underutilized = data[(data['UnderUtilizeTime'] > time_threshold) | 
                         (data['UnderUtilizeCPU'] > cpu_threshold) | 
                         (data['UnderUtilizeNodes'] > nodes_threshold)]

    
    return underutilized[['User','JobID','Group','State','Account','UnderUtilizeCPU','UnderUtilizeCPU','UnderUtilizeTime']]

In [121]:
FindUnterutilizerSLURM(slurm,7,0,0)

Unnamed: 0,Timelimit_td,User,JobID,Group,State,Account,UnderUtilizeCPU,UnderUtilizeCPU.1,UnderUtilizeTime
4,2 days 00:00:00,user_188,59062820,group_046,COMPLETED,account_017,0,0,47.997500
5,2 days 00:00:00,user_188,59062828,group_046,COMPLETED,account_017,0,0,47.998333
6,2 days 00:00:00,user_188,59062836,group_046,COMPLETED,account_017,0,0,47.998333
7,2 days 00:00:00,user_188,59062844,group_046,COMPLETED,account_017,0,0,47.997500
8,2 days 00:00:00,user_188,59062852,group_046,COMPLETED,account_017,0,0,47.997500
...,...,...,...,...,...,...,...,...,...
6375757,7 days 00:00:00,user_683,99253928,group_096,PREEMPTED,account_002,0,0,159.340278
6375758,7 days 00:00:00,user_186,99253932,group_003,COMPLETED,account_017,0,0,167.994722
6375764,7 days 00:00:00,user_683,99253956,group_096,PREEMPTED,account_002,0,0,159.340556
6375779,4 days 04:00:00,user_060,99254020,group_154,COMPLETED,account_017,0,0,87.934167
