In [10]:
# Numpy
import numpy as np
import pandas as pd
%matplotlib nbagg
import matplotlib.pyplot as plt
from matplotlib import gridspec
import scipy.stats

# Machine learning
import tensorflow as tf
import keras
import sklearn
import sklearn.preprocessing, sklearn.base, sklearn.utils, sklearn.model_selection
import sklearn.metrics

# Various Python tricks and libraries
import requests
import time
import functools
import operator
import collections
from tqdm import tqdm, tqdm_notebook, tnrange
import dill as pickle
import IPython

# Parallel
import joblib

In [12]:
def read_log(path='ml_queue.log'):
    """Load the whitespace-separated queue log and return typed, sorted job rows.

    Parameters
    ----------
    path : str or file-like, optional
        Location of the log. Defaults to ``'ml_queue.log'``, the file the
        notebook originally read.

    Returns
    -------
    pandas.DataFrame
        One row per job, sorted by ``Submit``. ``Submit``/``Start``/``End`` are
        parsed as datetimes, ``NNodes`` as (downcast) integers, ``Timelimit`` as
        timedeltas, and a new ``Waited`` column holds ``Start - Submit`` in
        hours. Rows with any missing or unparseable value are dropped.
    """
    # The second physical line of the log (index 1) is skipped. The listed
    # placeholder tokens and user names are read as NaN, so the dropna() that
    # follows removes every row containing one of them.
    raw = pd.read_csv(
        filepath_or_buffer=path,
        sep=r'\s+',
        header='infer',
        skiprows=[1],
        na_values=['UNLIMITED', 'Unknown', 'kijana', 'root', 'crabb', 'ericfadel'],
    ).dropna()

    # Build the converted columns with assign() rather than
    # ``df.loc[:, col] = ...``: the latter writes into the existing (object)
    # column on recent pandas, so the converted dtype would not stick.
    jobs = raw.assign(
        Submit=pd.to_datetime(raw['Submit'], errors='coerce'),
        Start=pd.to_datetime(raw['Start'], errors='coerce'),
        End=pd.to_datetime(raw['End'], errors='coerce'),
        NNodes=pd.to_numeric(raw['NNodes'], errors='coerce', downcast='integer'),
        # 'D-HH:MM:SS' is rewritten to 'Dday HH:MM:SS' before timedelta parsing.
        Timelimit=pd.to_timedelta(
            raw['Timelimit'].str.replace('-', 'day '), errors='coerce'),
    )

    # Queue waiting time expressed in hours.
    jobs = jobs.assign(
        Waited=(jobs['Start'] - jobs['Submit']) / pd.Timedelta('1h'))

    # errors='coerce' turned unparseable cells into NaT/NaN; drop those rows.
    return jobs.dropna().sort_values(by='Submit')

In [13]:
def sample_intelliremove_numerize(self, jobs, i):
    """Build one training sample describing the queue when job ``i`` was submitted.

    This is a module-level function (not a method) that takes the sampler
    object as its first argument; ``Sampler_intelliremover_numerizer.transform``
    dispatches it through joblib.

    Parameters
    ----------
    self : object
        Must expose ``TIMESTEPS`` (minimum row position that is sampled) and
        ``U_lbe`` (an encoder whose ``transform`` maps user names to integers).
    jobs : pandas.DataFrame
        Needs the columns ``User``, ``Submit``, ``Start``, ``End``, ``NNodes``,
        ``Timelimit`` and ``Waited``.
    i : int
        Positional index of the job to sample.

    Returns
    -------
    tuple or None
        ``(U, X, L, y)`` where ``U`` holds the encoded users of the jobs that
        were submitted but not yet started at submission time, ``X`` is a
        float array with columns (hours since submission, NNodes, time limit
        in hours) for those jobs, ``L`` is the number of such jobs and ``y``
        is the ``Waited`` value of job ``i``. ``None`` when the sample is
        rejected by one of the filters below.
    """
    HOUR = pd.Timedelta('1 hour')
    # Samples whose queued jobs request this many nodes or fewer in total are
    # discarded. NOTE(review): 31 presumably relates to the cluster size -
    # confirm before changing.
    NODE_THRESHOLD = 31

    # Filter i) skip the first TIMESTEPS rows. Checked first because it needs
    # no DataFrame work at all.
    if i < self.TIMESTEPS:
        return None

    this_job = jobs.iloc[i, :]
    now = this_job.Submit

    # Filter ii) skip submissions that precede the earliest recorded job end.
    if now < jobs.End.min():
        return None

    # Jobs already submitted at `now` that have not started before `now`.
    queued = jobs.loc[(jobs.Submit <= now) & (jobs.Start >= now)]
    if len(queued) == 0:
        return None

    # Filter iii) skip lightly loaded queues.
    if queued.NNodes.sum() <= NODE_THRESHOLD:
        return None

    # Numerize. The features are assembled as plain arrays instead of being
    # written into a scratch DataFrame, which previously required overwriting
    # a timedelta column with floats.
    U = np.asarray(self.U_lbe.transform(queued.User))
    X = np.column_stack([
        ((now - queued.Submit) / HOUR).values,
        queued.NNodes.values,
        (queued.Timelimit / HOUR).values,
    ]).astype(float)
    L = len(X)
    y = this_job.Waited

    return U, X, L, y

class Sampler_intelliremover_numerizer():
    """Pipeline stage that turns a jobs table into per-job queue samples.

    ``fit`` learns the integer encoding of user names; ``transform`` calls
    ``sample_intelliremove_numerize`` once per row (in parallel through
    joblib) and keeps the samples that were not rejected.
    """

    def __init__(self, TIMESTEPS, n_jobs=20):
        """
        Parameters
        ----------
        TIMESTEPS : int
            Rows at positions below this value are never sampled.
        n_jobs : int, optional
            Number of joblib workers used by ``transform``. Defaults to 20,
            the value that used to be hard-coded.
        """
        self.TIMESTEPS = TIMESTEPS
        self.n_jobs = n_jobs
        self.U_lbe = sklearn.preprocessing.LabelEncoder()

    def fit(self, jobs):
        """Fit the user-name label encoder on ``jobs.User``; returns ``self``."""
        self.U_lbe.fit(jobs.User.values)
        return self

    def transform(self, jobs):
        """Sample every row of ``jobs`` and return ``(U, X, L, y)`` tuples.

        Each returned element is a tuple with one entry per accepted sample.

        Raises
        ------
        ValueError
            If every row was rejected, so there is nothing to return.
        """
        samples = joblib.Parallel(n_jobs=self.n_jobs)(
            joblib.delayed(sample_intelliremove_numerize)(self, jobs, i)
            for i in tnrange(0, len(jobs), 1, desc=self.__class__.__name__))

        # Rejected rows come back as None; test for that explicitly instead of
        # relying on truthiness.
        samples = [sample for sample in samples if sample is not None]
        if not samples:
            # zip(*[]) would otherwise fail with an opaque unpacking error.
            raise ValueError(
                'no usable samples: every job was rejected by the sampler filters')

        U, X, L, y = zip(*samples)
        return U, X, L, y

In [14]:
class Outlierremover():
    """Pipeline stage that drops samples whose target lies in the outer 5% tails.

    ``fit`` stores the 5th and 95th percentiles of ``y``; ``transform`` keeps
    only the samples whose ``y`` lies strictly between those two values.
    """

    @staticmethod
    def _object_array(sequences):
        """Return a 1-D object array with one element per entry of ``sequences``.

        ``np.array`` cannot be used directly: the per-sample arrays have
        different lengths, which recent NumPy versions refuse to convert
        unless the object array is built explicitly.
        """
        boxed = np.empty(len(sequences), dtype=object)
        for position, sequence in enumerate(sequences):
            boxed[position] = sequence
        return boxed

    def fit(self, U, X, L, y):
        """Record the 0.05 and 0.95 quantiles of ``y``; returns ``self``.

        ``U``, ``X`` and ``L`` are accepted for pipeline compatibility and are
        not used here.
        """
        pd_y = pd.to_numeric(pd.Series(y))

        self.y_quantile_05 = pd_y.quantile(0.05)
        self.y_quantile_95 = pd_y.quantile(0.95)

        return self

    def transform(self, U, X, L, y):
        """Filter all four sequences with the mask derived from ``y``.

        Returns
        -------
        tuple
            ``(o_U, o_X, o_L, o_y)``: ``o_U`` and ``o_X`` are object arrays of
            the surviving per-sample arrays, ``o_L`` is an int32 array and
            ``o_y`` a float32 array.
        """
        array_U = self._object_array(U)
        array_X = self._object_array(X)
        int_L = np.asarray(L, dtype=np.int32)
        float_y = np.asarray(y, dtype=np.float32)

        # Strict inequalities: values equal to either quantile are dropped.
        o_mask = np.logical_and(self.y_quantile_05 < float_y,
                                float_y < self.y_quantile_95)

        return array_U[o_mask], array_X[o_mask], int_L[o_mask], float_y[o_mask]

In [15]:
class Scaler_padder_reshaper():
    """Pipeline stage that standardises, pads and reshapes the samples.

    Features and targets each get their own ``StandardScaler``. The
    variable-length sequences are padded at the end to a common length.
    """

    def __init__(self):
        make_scaler = sklearn.preprocessing.StandardScaler
        self.X_scaler = make_scaler()
        self.y_scaler = make_scaler()

    def fit(self, U, X, L, y):
        """Fit both scalers; ``U`` and ``L`` are unused. Returns ``self``."""
        # All per-sample feature rows are stacked so the scaler sees one
        # (total_rows, n_features) matrix.
        stacked = np.concatenate(X, axis=0)
        n_features = stacked.shape[-1]
        self.X_scaler.fit(stacked.reshape(-1, n_features))

        # The target is presented to the scaler as a single column.
        self.y_scaler.fit(y.reshape(-1, 1))
        return self

    def transform(self, U, X, L, y):
        """Return ``(padded U, padded scaled X, flattened L, scaled y)``."""
        pad = keras.preprocessing.sequence.pad_sequences

        # Scale each sequence with the fitted feature scaler, then the target.
        scaled_X = []
        for sequence in X:
            scaled_X.append(self.X_scaler.transform(sequence))
        scaled_y = self.y_scaler.transform(y.reshape(-1, 1))

        # Pad after the real entries ('post') up to the longest sequence.
        padded_U = pad(U, maxlen=None, dtype='int', padding='post')
        padded_X = pad(scaled_X, maxlen=None, dtype='float', padding='post')

        return padded_U, padded_X, L.reshape(-1), scaled_y

    def inverse_transform(self, yhat):
        """Map scaled predictions back to the original target scale (1-D)."""
        column = np.float32(yhat).reshape(-1, 1)
        return self.y_scaler.inverse_transform(column).reshape(-1)
    

In [16]:
class Regressor():
    """Stacked-GRU regressor (TensorFlow 1.x graph mode) with checkpointing.

    Every call to ``fit`` or ``transform`` resets the default graph, rebuilds
    the network and exchanges weights through the checkpoint at ``CKPT_PATH``.
    """

    # Checkpoint prefix shared by fit() (save, and restore when cont=True) and
    # transform() (restore).
    CKPT_PATH = './ckpt'

    def graph(self, _U, _X, _L, _y, keep_prob, vocabulary_size = 9, embeddings_size = 3):
        """Build the network in the current default graph.

        Parameters
        ----------
        _U, _L : unused here; accepted so the signature mirrors fit/transform.
        _X : array whose last dimension gives the number of input features.
        _y : array whose last dimension gives the width of the target.
        keep_prob : float
            Keep probability handed to the dropout wrappers.
        vocabulary_size : int
            Number of rows in the user embedding table.
            NOTE(review): encoded user ids >= vocabulary_size would fall
            outside the table - confirm the number of distinct users.
        embeddings_size : int
            Width of each user embedding.

        Returns
        -------
        tuple
            ``(U, X, L, y, yhat, training_op)``: the four placeholders, the
            prediction tensor and the Adam training op.
        """
        U = tf.placeholder(name="U", dtype=tf.int32, shape=(None, None))
        U_embeddings = tf.get_variable(name="U_embeddings", shape=[vocabulary_size, embeddings_size])
        U_embedded = tf.nn.embedding_lookup(U_embeddings, U, name="U_embedded")

        X = tf.placeholder(name="X", dtype=tf.float32, shape=(None, None, _X.shape[-1]))
        # Per-step features and the user embedding are concatenated along the
        # feature axis.
        XU = tf.concat([X,U_embedded], axis=-1, name="XU")
        XU.set_shape([None, None, X.shape[-1] + embeddings_size])

        L = tf.placeholder(name="L", dtype=tf.int32, shape=(None,))

        y = tf.placeholder(name="y", dtype=tf.float32, shape=(None, _y.shape[-1]))

        # Three stacked GRU layers (8, 6 and 4 units). sequence_length = L
        # tells dynamic_rnn the true length of each padded sequence.
        h1s, h1 = tf.nn.dynamic_rnn(
            tf.contrib.rnn.MultiRNNCell([
                tf.contrib.rnn.DropoutWrapper(
                    tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(8),
                    input_keep_prob = 1,
                    output_keep_prob = keep_prob,
                    state_keep_prob = keep_prob
                ),
                tf.contrib.rnn.DropoutWrapper(
                    tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(6),
                    input_keep_prob = 1,
                    output_keep_prob = 1,
                    state_keep_prob = keep_prob
                ),
                tf.contrib.rnn.DropoutWrapper(
                    tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell(4),
                    input_keep_prob = 1,
                    output_keep_prob = 1,
                    state_keep_prob = keep_prob
                )
            ]),
            XU,
            sequence_length = L,
            dtype=tf.float32, scope="h1")
        # h1 holds one final state per layer; the last layer's state feeds a
        # linear output unit.
        yhat = tf.layers.dense(h1[-1], units=1, activation=None, name="yhat")

        # Mean squared error.
        loss = tf.reduce_mean(tf.square(yhat - y), keepdims=False, name="loss")
        training_op = tf.train.AdamOptimizer(learning_rate=0.005).minimize(loss, name="training_op")

        return U, X, L, y, yhat, training_op

    def fit(self, _U, _X, _L, _y, cont, keep_prob, n_epochs = 100, batch_size = 256):
        """Train on random mini-batches and save a checkpoint.

        Parameters
        ----------
        _U, _X, _L, _y : arrays indexable by an integer index array
            (padded users, padded features, sequence lengths, targets).
        cont : bool
            True restores the weights from ``CKPT_PATH`` before training;
            False re-initialises all variables.
        keep_prob : float
            Dropout keep probability used while training.
        n_epochs : int
            Number of passes; the step count is
            ``n_epochs * len(_X) // batch_size``.
        batch_size : int
            Samples drawn (without replacement) per step.

        Returns
        -------
        Regressor
            ``self``.
        """
        tf.reset_default_graph()
        U, X, L, y, yhat, training_op = self.graph(_U, _X, _L, _y, keep_prob = keep_prob)

        n_samples = len(_X)
        # Floor division keeps the step count an integer on Python 3 as well.
        n_steps = n_epochs * n_samples // batch_size
        # Sampling without replacement cannot draw more rows than exist.
        draw = min(batch_size, n_samples)

        # The context manager closes the session even if training is
        # interrupted or raises.
        with tf.Session() as sess:
            if cont:
                tf.train.Saver().restore(sess, self.CKPT_PATH)
            else:
                tqdm.write('Re-initializing variables...')
                sess.run(tf.global_variables_initializer())
            for step in tnrange(n_steps, desc=self.__class__.__name__):
                index = np.random.choice(n_samples, draw, replace = False)
                sess.run(training_op, feed_dict = {U: _U[index], X: _X[index], L: _L[index], y: _y[index]})
            tf.train.Saver().save(sess, self.CKPT_PATH)

        return self

    def transform(self, _U, _X, _L, _y):
        """Predict for all samples using the weights stored at ``CKPT_PATH``.

        The graph is rebuilt with ``keep_prob = 1.0`` (dropout disabled).
        ``_y`` is fed to its placeholder but the prediction does not depend
        on it. Returns the evaluated ``yhat`` array.
        """
        tf.reset_default_graph()
        U, X, L, y, yhat, training_op = self.graph(_U, _X, _L, _y, keep_prob = 1.0)

        with tf.Session() as sess:
            tf.train.Saver().restore(sess, self.CKPT_PATH)
            yhat_ = sess.run(yhat, feed_dict = {U: _U, X: _X, L:_L, y: _y})
        return yhat_

In [17]:
class Pipeline():
    """Chain of stages; each stage's ``transform`` output is unpacked as the
    positional arguments of the next stage."""

    def __init__(self, ETs):
        self.ETs = ETs

    def _walk(self, args, refit):
        """Yield the output of every stage in order.

        With ``refit`` true each stage is fitted on its inputs right before it
        transforms them; otherwise only ``transform`` is called.
        """
        for stage in self.ETs:
            if refit:
                stage = stage.fit(*args)
            args = stage.transform(*args)
            yield args

    def fit(self, *args):
        '''Note: does not take keyworded input.'''
        for _stage_output in self._walk(args, True):
            pass
        return self

    def transform(self, *args):
        """Run every stage's ``transform`` and return the last output."""
        for args in self._walk(args, False):
            pass
        return args

    def transforms(self, *args):
        """Like ``transform`` but return the list of all stage outputs."""
        return list(self._walk(args, False))

    def fit_transform(self, *args):
        """Fit and transform stage by stage; return the last output."""
        for args in self._walk(args, True):
            pass
        return args

    def fit_transforms(self, *args):
        """Like ``fit_transform`` but return the list of all stage outputs."""
        return list(self._walk(args, True))

In [9]:
# Experiment settings: the first TRAIN_ROWS jobs (by submission time) are used
# for fitting, the following TEST_ROWS jobs for evaluation.
TRAIN_ROWS = 5000
TEST_ROWS = 1000
N_ROUNDS = 40          # train/evaluate/plot cycles
EPOCHS_PER_ROUND = 10

jobs = read_log()

sin = Sampler_intelliremover_numerizer(TIMESTEPS = 250)
o  = Outlierremover()
spr = Scaler_padder_reshaper()
p = Pipeline([sin, o, spr])
r = Regressor()

# fit_transforms/transforms return one output per stage; the sampler output is
# not needed, the outlier-filtered (o_*) and fully preprocessed (p_*) ones are.
_sampled1, (o_U1, o_X1, o_L1, o_y1), (p_U1, p_X1, p_L1, p_y1) = p.fit_transforms(jobs.iloc[:TRAIN_ROWS])
_sampled2, (o_U2, o_X2, o_L2, o_y2), (p_U2, p_X2, p_L2, p_y2) = p.transforms(jobs.iloc[TRAIN_ROWS:TRAIN_ROWS + TEST_ROWS])

# diagnostic log
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4.2))
ax2.set_aspect('equal', adjustable='datalim')
line11, line12 = [], []

for round_idx in tnrange(N_ROUNDS):

    # The first round starts from fresh weights; later rounds continue from
    # the checkpoint written by the previous round.
    r.fit(p_U1, p_X1, p_L1, p_y1, cont = (round_idx > 0), keep_prob=.98,
          n_epochs = EPOCHS_PER_ROUND, batch_size=256)
    spr_yhat1 = spr.inverse_transform(r.transform(p_U1, p_X1, p_L1, p_y1))
    spr_yhat2 = spr.inverse_transform(r.transform(p_U2, p_X2, p_L2, p_y2))

    # diagnostic log: R^2 per round (left) and actual vs predicted (right)
    line11.append(sklearn.metrics.r2_score(o_y1, spr_yhat1))
    line12.append(sklearn.metrics.r2_score(o_y2, spr_yhat2))
    ax1.clear()
    ax1.plot(line11, label='P(E1)=%.2f'%line11[-1])
    ax1.plot(line12, label='P(E2)=%.2f'%line12[-1])
    ax1.set_xlabel('training round')
    ax1.set_ylabel('R^2 score')
    ax1.legend(loc='best')
    ax2.clear()
    ax2.scatter(o_y1, spr_yhat1, color='green', s=1, alpha=0.5, label='E1')
    ax2.scatter(o_y2, spr_yhat2, color='red', s=1, alpha=0.5, label='E2')
    ax2.set_xlabel('actual wait (hours)')
    ax2.set_ylabel('predicted wait (hours)')
    ax2.legend(loc='best')
    fig.canvas.draw()
    
    

<IPython.core.display.Javascript object>

Re-initializing variables...


Exception in thread Thread-12:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/usr/local/lib/python2.7/dist-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python2.7/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



INFO:tensorflow:Restoring parameters from ./ckpt
INFO:tensorflow:Restoring parameters from ./ckpt
INFO:tensorflow:Restoring parameters from ./ckpt


INFO:tensorflow:Restoring parameters from ./ckpt
INFO:tensorflow:Restoring parameters from ./ckpt
INFO:tensorflow:Restoring parameters from ./ckpt


INFO:tensorflow:Restoring parameters from ./ckpt
INFO:tensorflow:Restoring parameters from ./ckpt
INFO:tensorflow:Restoring parameters from ./ckpt


KeyboardInterrupt: 