In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [14]:
# Labels for the nine fields of the headerless, space-delimited GPFS dump,
# listed in the order the fields appear on each line of the file.
column_names = [
    "Inode (file unique ID)",
    "KB Allocated",
    "File Size",
    "Creation Time in days from today",
    "Change Time in days from today",
    "Modification time in days from today",
    "Acces time in days from today",
    "GID numeric ID for the group owner of the file",
    "UID numeric ID for the owner of the file",
]

# Load only the first ten million records; the file carries no header row,
# so the labels above are supplied explicitly.
GPFS = pd.read_csv(
    "/mnt/research/CMSE495-SS24-ICER/file_system_usage/gpfs-stats/inode-size-age-jan-23",
    sep=" ",
    header=None,
    names=column_names,
    nrows=10_000_000,
)
GPFS

Unnamed: 0,Inode (file unique ID),KB Allocated,File Size,Creation Time in days from today,Change Time in days from today,Modification time in days from today,Acces time in days from today,GID numeric ID for the group owner of the file,UID numeric ID for the owner of the file
0,100663296,0,8,1447,1447,3131,1447,2035,762231
1,100663297,0,188,1447,1447,1937,1447,2010,614955
2,100663301,0,567,1447,1447,3142,1447,2035,762231
3,100663304,0,87,1447,1447,3142,1447,2035,762231
4,100663306,0,1689,1447,1447,1937,1447,2010,614955
...,...,...,...,...,...,...,...,...,...
9999995,124522107,64,5739,15,15,146,2,2005,6052828
9999996,124522107,64,5739,15,15,146,2,2005,6052828
9999997,124522108,0,1051,15,15,161,15,2005,6052828
9999998,124522108,0,1051,15,15,161,15,2005,6052828


In [15]:
# Number of distinct (non-null) values in each column of the loaded frame.
GPFS.nunique(axis=0, dropna=True)

Inode (file unique ID)                            9283343
KB Allocated                                         3907
File Size                                          528762
Creation Time in days from today                     1172
Change Time in days from today                       1176
Modification time in days from today                 6995
Acces time in days from today                        1259
GID numeric ID for the group owner of the file         40
UID numeric ID for the owner of the file               52
dtype: int64

# Identifying files (by unique ID) that underutilize their allocated file space

In [16]:
# underutilization = file size - KB allocated (both expressed in bytes)
# the larger this number, the more allocated space is considered not used

# file size is in bytes, KB Allocated is in kilobytes
# GPFS["underutilizerBytes"] = GPFS['File Size'] - (GPFS['KB Allocated'] + 3000)/1000
# GPFS

In [17]:
# Identify the UIDs (file owners) that underutilize storage, based on file size and KB allocated
# threshold = 2 million bytes
# GPFS[GPFS.underutilizerBytes>2000000].head()


In [18]:
# function 
def FindUnterutilizerGPFS(data: pd.DataFrame, threshold: float,
                          bytes_per_kb: int = 1000,
                          overhead_bytes: int = 3000) -> pd.DataFrame:
    """
    Flag GPFS file records whose size differs from their allocated space by
    more than a threshold.

    For every row the metric is computed as

        underutilized (Bytes) = File Size - (KB Allocated * bytes_per_kb + overhead_bytes)

    i.e. the file size in bytes minus the allocated space converted to bytes,
    with a fixed byte overhead added to the allocation. Rows whose metric is
    strictly greater than ``threshold`` are returned.

    Parameters:
        data (pd.DataFrame): Frame with at least the columns 'File Size'
                             (bytes) and 'KB Allocated' (kilobytes).
        threshold (int or float): Cut-off, in bytes, for the
                                  'underutilized (Bytes)' value. Only rows
                                  strictly above it are returned.
        bytes_per_kb (int): Factor used to convert 'KB Allocated' to bytes.
                            Defaults to 1000.
        overhead_bytes (int or float): Fixed overhead, in BYTES, added to the
                                       converted allocation. Defaults to 3000.

    Returns:
        pd.DataFrame: The rows of ``data`` (all original columns plus
                      'underutilized (Bytes)') whose metric exceeds
                      ``threshold``. Empty if no row qualifies.

    Raises:
        KeyError: If 'File Size' or 'KB Allocated' is missing from ``data``.

    Note:
        - Side effect: the column 'underutilized (Bytes)' is written into
          ``data`` itself (created or overwritten), so the caller's frame
          carries the metric after the call.
        - NOTE(review): the 'KB Allocated' values shown in this notebook
          (e.g. 64, 16384, 32768) suggest the unit may be KiB; if so, pass
          ``bytes_per_kb=1024`` -- confirm against the GPFS documentation.
    """

    # Fail early with a message that names the missing column(s) instead of
    # pandas' bare KeyError from deep inside the indexing machinery.
    missing = [col for col in ("File Size", "KB Allocated") if col not in data.columns]
    if missing:
        raise KeyError(f"FindUnterutilizerGPFS: missing required column(s): {missing}")

    # Allocation expressed in bytes, including the fixed overhead.
    allocated_bytes = data["KB Allocated"] * bytes_per_kb + overhead_bytes

    # Stored on the input frame on purpose (see Note in the docstring).
    data["underutilized (Bytes)"] = data["File Size"] - allocated_bytes

    # Boolean-mask selection: keep only rows strictly above the threshold.
    return data[data["underutilized (Bytes)"] > threshold]


In [19]:
# Try the function on the loaded frame with a cut-off of 2e7 (twenty million) bytes.
UNDERUTILIZATION_THRESHOLD_BYTES = 2e7
FindUnterutilizerGPFS(data=GPFS, threshold=UNDERUTILIZATION_THRESHOLD_BYTES)

Unnamed: 0,Inode (file unique ID),KB Allocated,File Size,Creation Time in days from today,Change Time in days from today,Modification time in days from today,Acces time in days from today,GID numeric ID for the group owner of the file,UID numeric ID for the owner of the file,underutilized (Bytes)
1057,100664883,32768,68635285,1447,1447,2525,1447,2069,1000000092,35864285
4710,100669478,24576,47933838,1447,1447,2516,1447,2069,1000000092,23354838
4832,100669753,16384,37347520,1447,1447,2516,1447,2069,1000000092,20960520
8323,100674343,32768,111389468,239,239,239,239,2023,881083,78618468
8352,100674372,32768,111450902,239,239,239,239,2023,881083,78679902
...,...,...,...,...,...,...,...,...,...,...
9894092,123933851,24576,60585589,911,911,1030,911,2005,758827,36006589
9894142,123933901,16384,58007891,911,911,1182,911,2005,758827,41620891
9992619,124488521,690752,1026060288,911,911,916,911,2005,758827,335305288
9992626,124488528,14557184,18643510325,911,911,916,911,2005,758827,4086323325
