## This notebook walks you through the preprocessing steps for the GPFS and SLURM datasets

In [15]:
import pandas as pd
from datetime import timedelta

# GPFS

In [2]:
# Column labels, in file order, for the GPFS dump (the file itself has no header row).
column_names = [
    "Inode (file unique ID)",
    "KB Allocated",
    "File Size",
    "Creation Time in days from today",
    "Change Time in days from today",
    "Modification time in days from today",
    "Acces time in days from today",
    "GID numeric ID for the group owner of the file",
    "UID numeric ID for the owner of the file",
]

# Read only the first 1000 space-separated records.
GPFS = pd.read_csv(
    "/mnt/research/CMSE495-SS24-ICER/file_system_usage/gpfs-stats/inode-size-age-jan-23",
    sep=" ",
    header=None,
    names=column_names,
    nrows=1000,
)
GPFS

Unnamed: 0,Inode (file unique ID),KB Allocated,File Size,Creation Time in days from today,Change Time in days from today,Modification time in days from today,Acces time in days from today,GID numeric ID for the group owner of the file,UID numeric ID for the owner of the file
0,100663296,0,8,1447,1447,3131,1447,2035,762231
1,100663297,0,188,1447,1447,1937,1447,2010,614955
2,100663301,0,567,1447,1447,3142,1447,2035,762231
3,100663304,0,87,1447,1447,3142,1447,2035,762231
4,100663306,0,1689,1447,1447,1937,1447,2010,614955
...,...,...,...,...,...,...,...,...,...
995,100664765,128,78949,1447,1447,2961,1447,2392,831677
996,100664766,64,24666,1447,1447,1632,1447,2023,881083
997,100664767,0,530,1447,1093,2143,1050,2022,500120
998,100664768,0,99,1447,1447,1937,1447,2010,614955


In [4]:
# Now, let's export this subset to a CSV file
# GPFS.to_csv('/mnt/home/alkhali7/ICER-UserDataAnalytics/ICER_user_data_analytics/data/gpfs_sample_data.csv')

# SLURM data, rows=100K

In [6]:
slurm = pd.read_csv(
    "/mnt/research/CMSE495-SS24-ICER/slurm_usage/DID_FINAL_SLURM_OCT_2023.csv",
    delimiter="|",
    nrows=100000,
)

# data preprocessing
# Drop the two 'Unnamed' columns (presumably index columns written by earlier saves - confirm).
slurm = slurm.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

# Filter out cancelled jobs and jobs whose 'Start' is 'Unknown'.
# The State values in this dataset are upper case (e.g. COMPLETED, FAILED) and
# sacct can append a suffix such as "CANCELLED by <uid>", so an exact comparison
# with 'Cancelled' never matched; compare case-insensitively on the prefix.
# Rows with a missing State are kept, as before.
# .copy() makes the filtered frame independent so the column assignments below
# do not write into a slice of the original frame.
slurm = slurm[
    ~slurm['State'].str.upper().str.startswith('CANCELLED', na=False)
    & (slurm['Start'] != 'Unknown')
].copy()

slurm['Submit'] = pd.to_datetime(slurm['Submit'])
slurm['Start'] = pd.to_datetime(slurm['Start'])

In [7]:
# Merge each job's paired rows: copy MaxRSS from the user_258 row into the job row, then drop the user_258 row
def AggSLURMDat(dat, batch_user="user_258"):
    '''
    Collapses each job's paired rows into a single row per job run.

    Within every JobID, rows are paired by CPUTimeRAW: only CPUTimeRAW
    values that occur exactly twice for that job are considered. A pair
    is kept when exactly one of its two rows has User == batch_user; the
    other row is emitted with its MaxRSS replaced by the MaxRSS of the
    batch_user row. Pairs in which both rows or neither row belong to
    batch_user are skipped.

    NOTE(review): the batch_user rows are presumably the '.batch' /
    '.extern' step records of the job - confirm against how the dataset
    was anonymised.

    args:

        dat - the slurm dataset; must contain the columns "JobID",
              "CPUTimeRAW", "User" and "MaxRSS". It is not modified.

        batch_user - value of "User" marking the row whose MaxRSS is
                     copied into the job row (default "user_258")

    returns:

        out_df - the aggregated version of the slurm dataset, with the
                 same columns as dat and the original row labels
    '''

    # Group once instead of rescanning the whole frame for every job.
    job_groups = dat.groupby("JobID", sort=False)

    # Collect the kept rows and concatenate a single time at the end;
    # concatenating inside the loop copies the growing frame on every step.
    kept_rows = []

    # Visit jobs in value_counts order so the output row order matches
    # the order produced by looping over value_counts().index.
    for job in dat["JobID"].value_counts().index:

        jdat = job_groups.get_group(job)

        pair_counts = jdat["CPUTimeRAW"].value_counts()

        for cpu_time in pair_counts[pair_counts == 2].index:

            ajob = jdat[jdat["CPUTimeRAW"] == cpu_time]
            is_batch = ajob["User"] == batch_user

            # ajob has exactly two rows here, so anything other than one
            # batch row means "no aggregate job" (2) or "two copies of the
            # aggregate job" (0); both cases are skipped.
            if is_batch.sum() != 1:
                continue

            # Work on a copy so the write below cannot touch a slice of dat.
            ag_job = ajob[~is_batch].copy()
            ag_job.loc[ag_job.index[0], "MaxRSS"] = ajob.loc[is_batch, "MaxRSS"].values[0]

            kept_rows.append(ag_job)

    if not kept_rows:
        # Nothing qualified: return an empty frame that still has dat's columns.
        return pd.DataFrame(columns=dat.keys())

    return pd.concat(kept_rows)

In [8]:
# Collapse the filtered SLURM rows with AggSLURMDat (defined above) into slurm_clean.
slurm_clean = AggSLURMDat(slurm)

In [10]:
# Keep only the rows whose MaxRSS is present after the aggregation step.
slurm_clean = slurm_clean[slurm_clean['MaxRSS'].notna()]
slurm_clean

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,CPUTimeRAW,ReqCPUS,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES
75766,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:11:08,00:02:38,COMPLETED,account_017,assocID_554,...,158,1,1,10G,348380K,1,1,lac-351,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
75880,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:09:18,00:00:48,FAILED,account_017,assocID_554,...,48,1,1,10G,8156K,1,1,lac-367,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
77026,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:10:33,00:02:02,FAILED,account_017,assocID_554,...,122,1,1,10G,344968K,1,1,lac-295,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
77281,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:11:31,00:03:00,COMPLETED,account_017,assocID_554,...,180,1,1,10G,489700K,1,1,lac-299,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
76963,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:09:36,00:01:05,FAILED,account_017,assocID_554,...,65,1,1,10G,270400K,1,1,lac-294,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29569,91740004,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:32,2023-10-01T00:46:35,00:07:03,COMPLETED,account_030,assocID_561,...,1692,4,4,8G,531208K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29572,91740008,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:38,2023-10-01T00:46:42,00:07:04,COMPLETED,account_030,assocID_561,...,1696,4,4,8G,528908K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29575,91740012,user_128,group_062,2023-10-01 00:38:00,2023-10-01 00:39:43,2023-10-01T00:45:59,00:06:16,COMPLETED,account_030,assocID_561,...,1504,4,4,8G,490392K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"
29530,91739952,user_128,group_062,2023-10-01 00:37:49,2023-10-01 00:38:49,2023-10-01T00:50:17,00:11:28,COMPLETED,account_030,assocID_561,...,2752,4,4,8G,747872K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1"


In [11]:
# remove na rows in MaxRSS after cleaning data
# (repeats the MaxRSS null filter of the previous cell; on that cell's output it removes nothing further)
slurm_clean= slurm_clean.dropna(subset=['MaxRSS'])

In [12]:
# Convert ReqMem and MaxRSS to a uniform unit, MB (suffixes: 'K' for KB, 'M' for MB, 'G' for GB, 'T' for TB)
def convert_memory(mem_str):
    '''
    Convert a memory string with a unit suffix to a float number of MegaBytes.

    The last character selects the unit, using decimal (powers of 1000)
    factors: 'K' kilobytes, 'M' megabytes, 'G' gigabytes, 'T' terabytes.

    NOTE(review): Slurm may report these sizes in binary multiples (1024);
    the decimal factors used so far are kept - confirm which is intended.

    args:

        mem_str - memory amount such as '348380K', '10G' or '512M';
                  surrounding whitespace is ignored

    returns:

        the amount in MB as a float, or None when mem_str is not a string
        (e.g. a missing value) or does not end in K, M, G or T

    raises:

        ValueError - if the text before the unit suffix is not a number
    '''
    # Missing values arrive as NaN/None rather than str; report them as
    # "no value" instead of failing on a missing .endswith attribute.
    if not isinstance(mem_str, str):
        return None

    mem_str = mem_str.strip()
    unit, number = mem_str[-1:], mem_str[:-1]

    if unit == 'M':
        return float(number)  # already in MB
    if unit == 'K':
        return float(number) / 1000  # KB -> MB
    if unit == 'G':
        return float(number) * 1e3  # GB -> MB
    if unit == 'T':
        return float(number) * 1e6  # TB -> MB

    # No recognised unit suffix (including the empty string).
    return None
    

In [13]:
# Add an '<column>_MB' float column next to each memory column.
for memory_col in ('ReqMem', 'MaxRSS'):
    slurm_clean[f'{memory_col}_MB'] = slurm_clean[memory_col].apply(convert_memory)

In [16]:
# Function to parse the duration columns: Timelimit and Elapsed
def parse_time_string(time_str):
    """Convert a time string into a timedelta object.

    Accepted layouts are ``H:M:S`` and ``H:M`` (seconds default to 0),
    each optionally prefixed with ``D-`` for a day count, for example
    ``'00:02:38'`` or ``'1-12:30:00'``. With any field count other than
    three, the first two fields are read as hours and minutes.

    A value that is already a ``timedelta`` is returned unchanged, so
    applying this function to a column that was converted earlier (for
    example when the notebook cell is run a second time) is a no-op
    instead of a TypeError.

    Raises:
        ValueError: if a field is not an integer.
        IndexError: if there are fewer than two ':'-separated fields.
    """
    if isinstance(time_str, timedelta):
        return time_str

    days = 0
    if '-' in time_str:
        days, time_str = time_str.split('-', 1)
        days = int(days)

    parts = time_str.split(':')
    if len(parts) == 3:
        hours, minutes, seconds = map(int, parts)
    else:
        # NOTE(review): Slurm also documents a bare 'minutes:seconds' form;
        # the existing hours:minutes reading is kept here - confirm.
        hours, minutes, seconds = int(parts[0]), int(parts[1]), 0

    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

# Replace the Timelimit and Elapsed strings with timedelta values
for duration_col in ('Timelimit', 'Elapsed'):
    slurm_clean[duration_col] = slurm_clean[duration_col].apply(parse_time_string)

In [17]:
slurm_clean

Unnamed: 0,JobID,User,Group,Submit,Start,End,Elapsed,State,Account,AssocID,...,AllocCPUS,ReqMem,MaxRSS,ReqNodes,NNodes,NodeList,ReqTRES,AllocTRES,ReqMem_MB,MaxRSS_MB
75766,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:11:08,0 days 00:02:38,COMPLETED,account_017,assocID_554,...,1,10G,348380K,1,1,lac-351,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,348.380
75880,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:30,2023-10-01T10:09:18,0 days 00:00:48,FAILED,account_017,assocID_554,...,1,10G,8156K,1,1,lac-367,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,8.156
77026,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:10:33,0 days 00:02:02,FAILED,account_017,assocID_554,...,1,10G,344968K,1,1,lac-295,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,344.968
77281,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:11:31,0 days 00:03:00,COMPLETED,account_017,assocID_554,...,1,10G,489700K,1,1,lac-299,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,489.700
76963,91806820,user_316,group_009,2023-10-01 10:08:04,2023-10-01 10:08:31,2023-10-01T10:09:36,0 days 00:01:05,FAILED,account_017,assocID_554,...,1,10G,270400K,1,1,lac-294,"billing=1556,cpu=1,mem=10G,node=1","billing=1556,cpu=1,mem=10G,node=1",10000.0,270.400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29569,91740004,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:32,2023-10-01T00:46:35,0 days 00:07:03,COMPLETED,account_030,assocID_561,...,4,8G,531208K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,531.208
29572,91740008,user_128,group_062,2023-10-01 00:37:59,2023-10-01 00:39:38,2023-10-01T00:46:42,0 days 00:07:04,COMPLETED,account_030,assocID_561,...,4,8G,528908K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,528.908
29575,91740012,user_128,group_062,2023-10-01 00:38:00,2023-10-01 00:39:43,2023-10-01T00:45:59,0 days 00:06:16,COMPLETED,account_030,assocID_561,...,4,8G,490392K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,490.392
29530,91739952,user_128,group_062,2023-10-01 00:37:49,2023-10-01 00:38:49,2023-10-01T00:50:17,0 days 00:11:28,COMPLETED,account_030,assocID_561,...,4,8G,747872K,1,1,amr-162,"billing=1245,cpu=4,mem=8G,node=1","billing=1245,cpu=4,mem=8G,node=1",8000.0,747.872


In [18]:
# Now, let's export this subset to a CSV file
# slurm_clean.to_csv('/mnt/home/alkhali7/ICER-UserDataAnalytics/ICER_user_data_analytics/data/slurm_sample_data.csv')