In [1]:
## Data Exploration with skx_normal queue data for stampede2 Feb 1, 2022 - August 1, 2022 (not including Aug 1,2022)
import sys
# Make the project root, its `src` package and `src/data` importable from this notebook.
sys.path.extend([
    '/home/jovyan/work/',
    '/home/jovyan/work/src',
    '/home/jovyan/work/src/data',
])

In [2]:
from data import create_input_data
# Load the skx normal-queue job records. Columns 4, 5 and 6 are handed to
# read_data as the date columns to parse.
df_feb = create_input_data.read_data(
    csv_file_name="../data/raw/skx_anon_jobs_1Feb2022_1Aug2022_normal_sorted.csv",
    parse_dates_col=[4, 5, 6],
)

In [3]:
# Sanity-check the load: column names, non-null counts and dtypes.
df_feb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199744 entries, 0 to 199743
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   jobid             199744 non-null  int64         
 1   user              199744 non-null  object        
 2   account           199744 non-null  object        
 3   state             199744 non-null  object        
 4   submit            199744 non-null  datetime64[ns]
 5   start             199744 non-null  datetime64[ns]
 6   end               199744 non-null  datetime64[ns]
 7   reqcpus           199744 non-null  int64         
 8   nnodes            199744 non-null  int64         
 9   max_minutes       199744 non-null  int64         
 10  queue_minutes     199744 non-null  int64         
 11  backlog_minutes   199744 non-null  int64         
 12  backlog_num_jobs  199744 non-null  int64         
 13  running_num_jobs  199744 non-null  int64         
 14  runn

In [4]:
import numpy as np 
# Start from the loaded frame. `df` is only ever rebound to new frames below
# (never modified in place), so `df_feb` stays untouched and this cell can be
# re-run on its own with the same result.
df = df_feb

# --- How many jobs sat in queue for 0 (or only a few) minutes? ---
nbr_jobs = len(df)
nbr_zero_queue_min_jobs = int((df.queue_minutes == 0).sum())
small_minutes_threshhold = 5
nbr_small_queue_min_jobs = int((df.queue_minutes <= small_minutes_threshhold).sum())
print("Initial total number of jobs: ", nbr_jobs)
print("Jobs with 0 queue minutes: ", nbr_zero_queue_min_jobs)
print(f"Jobs with queue minutes leq {small_minutes_threshhold}: ", nbr_small_queue_min_jobs)
print(f"Percentage of jobs with 0 queue minutes: {nbr_zero_queue_min_jobs / nbr_jobs:.2%}")

# Optionally, drop the rows (jobs) that sat in queue for 0 minutes
DROP_ZERO_QUEUE_MINUTES = True
if DROP_ZERO_QUEUE_MINUTES:
    df = df[~(df.queue_minutes == 0)]
    min_queue_minutes = df['queue_minutes'].min()
    print("After dropping zero minute jobs, shortest queue time is: ", min_queue_minutes)

# --- How many jobs sat in queue for at least NBR_DAYS_HIGH days? ---
NBR_DAYS_HIGH = 2
nbr_minutes_high = NBR_DAYS_HIGH * 24 * 60
nbr_high_queue_min_jobs = int((df.queue_minutes >= nbr_minutes_high).sum())
print("HIGH number of days threshold: ", NBR_DAYS_HIGH, " (", nbr_minutes_high, " minutes)")
print("Jobs with HIGH queue minutes: ", nbr_high_queue_min_jobs)
# The denominator is the initial job count, i.e. before any rows were dropped.
print(f"Percentage of jobs with HIGH queue minutes: {nbr_high_queue_min_jobs / nbr_jobs:.2%}")

# Optionally, drop the rows (jobs) that sat in queue for nbr_minutes_high minutes or more
DROP_HIGH_QUEUE_MINUTES = True
if DROP_HIGH_QUEUE_MINUTES:
    df = df[~(df.queue_minutes >= nbr_minutes_high)]

print("Final total number of jobs: ", len(df))

# Add a new column, queue_minutes_bin: the 0-indexed bin of width `bin_size`
# minutes that the job's queue time falls into.
bin_threshhold = 10
bin_size_factor = 6 # implies bins of size 60 min
bin_size = bin_threshhold * bin_size_factor
nbr_bins = 5

# Jobs whose bin number would exceed nbr_bins - 1 are all placed in the last bin.
# `assign` returns a new frame, so the source frame is never written to.
df = df.assign(
    queue_minutes_bin=(df['queue_minutes'] / bin_size)
    .astype('int')
    .clip(upper=nbr_bins - 1)
)
# show the final bin counts
df['queue_minutes_bin'].value_counts()

Initial total number of jobs:  199744
Jobs with 0 queue minutes:  57935
Jobs with queue minutes leq 5:  89998
Jobs with queue minutes:  57935
Percentage of jobs with 0 queue minutes:  0.2900462592117911
After dropping zero minute jobs, shortest queue time is:  1
Jobs with 0 queue minutes:  57935
HIGH number of days threshold:  2  ( 2880  minutes)
Jobs with HIGH queue minutes:  5831
Percentage of jobs with HIGH queue minutes:  0.029192366228772828
Final total number of jobs:  135978


queue_minutes_bin
0    75668
4    34888
1    13259
2     7206
3     4957
Name: count, dtype: int64

In [12]:
import numpy as np 
def split_df_current_future(df, cutoff_fraction=0.75, cutoff_datetime=None):
    """
    Split a jobs dataframe into a current and a future set on its 'submit' column.

    Parameters:
      * df: dataframe with a datetime 'submit' column.
      * cutoff_fraction (float): the fraction of jobs to use for current; must be
        strictly between 0 and 1. The cutoff is the corresponding quantile of
        'submit'.
      * cutoff_datetime, if supplied, is the date_time to use as the cutoff point.
        When supplied, cutoff_fraction is ignored (and not validated).

    Rows with submit <= cutoff go to current; rows with submit > cutoff go to future.

    Returns two dataframes, (current, future). If cutoff_fraction is needed but
    invalid, a message is printed and (None, None) is returned.
    """
    if cutoff_datetime is not None:
        # an explicit cutoff always wins over the fraction
        cutoff = cutoff_datetime
    else:
        if not 0 < cutoff_fraction < 1:
            print("Invalid cutoff_fraction; values should be between 0 and 1.")
            return None, None
        # Series.quantile understands datetime columns; compute the cutoff once
        # and reuse it for both halves of the split.
        cutoff = df['submit'].quantile(cutoff_fraction)

    current = df[df['submit'] <= cutoff]
    future = df[df['submit'] > cutoff]
    return current, future

In [13]:
# Split on a fixed date; cutoff_fraction is ignored when cutoff_datetime is given,
# so it is left at its default.
current, future = split_df_current_future(df, cutoff_datetime="2022-07-01")
print(f"Current:  {len(current)}  Future:  {len(future)}")
print(f"Max current:  {current['submit'].max()!s}  Min future:  {future['submit'].min()!s}")

Current:  117359  Future:  18619
Max current:  2022-06-30 23:42:18  Min future:  2022-07-01 00:42:53


In [14]:
# These are the columns of the independent and dependent vars, respectively:
X_cols = ['nnodes', 
          'max_minutes', 
          'backlog_minutes', 
          'backlog_num_jobs', 
          'running_num_jobs', 
          'running_minutes',
         ]
y_col = 'queue_minutes_bin'

# Whether to train on the full dataset (`df`) or only on the "current" split.
# NOTE(review): `future` is a row subset of `df`, so with TRAIN_ON_FULL = True the
# training data can include rows that also appear in X_future / y_future.
TRAIN_ON_FULL = True 

# Train-test split --------
from sklearn.model_selection import train_test_split
# Create independent and dependent vars
X = df[X_cols]
X_current = current[X_cols]
X_future = future[X_cols]
y = df[y_col]
y_current = current[y_col]
y_future = future[y_col]

if TRAIN_ON_FULL:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
else:
    # stratify must be the labels of the data being split (y_current, not y);
    # passing y here fails because its length differs from X_current's.
    X_train, X_test, y_train, y_test = train_test_split(X_current, y_current, test_size=0.2, stratify=y_current, random_state=1)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

lr = LogisticRegression(random_state=1, max_iter=1000)
# Standardize the features, then fit the logistic regression on the scaled values.
p = pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('lr', lr),
])

# fit the pipeline 
p.fit(X_train, y_train)
model = p 
# or, fit the LG model by hand with no standardization 
# model = lr.fit(X_train, y_train)

# print the report
# zero_division=0: a class the model never predicts has undefined precision;
# report it as 0.0 (the same number the default shows) without emitting an
# UndefinedMetricWarning for every such class and average.
print(f"Performance on TEST\n*******************\n{classification_report(y_test, model.predict(X_test), zero_division=0)}")
print(f"Performance on TRAIN\n********************\n{classification_report(y_train, model.predict(X_train), zero_division=0)}")

Performance on TEST
*******************
              precision    recall  f1-score   support

           0       0.70      0.97      0.81     15134
           1       0.00      0.00      0.00      2652
           2       0.00      0.00      0.00      1441
           3       0.00      0.00      0.00       991
           4       0.81      0.72      0.76      6978

    accuracy                           0.72     27196
   macro avg       0.30      0.34      0.31     27196
weighted avg       0.60      0.72      0.65     27196

Performance on TRAIN
********************
              precision    recall  f1-score   support

           0       0.70      0.97      0.81     60534
           1       0.00      0.00      0.00     10607
           2       0.00      0.00      0.00      5765
           3       0.00      0.00      0.00      3966
           4       0.81      0.72      0.76     27910

    accuracy                           0.72    108782
   macro avg       0.30      0.34      0.31    10

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
