In [1]:
# Numpy
import numpy as np
import pandas as pd
import scipy.stats, scipy.interpolate, scipy.spatial

# matplotlib
%matplotlib nbagg
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.patches as patches

# plotly
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

# Machine learning
import tensorflow as tf
import keras
import sklearn
import sklearn.preprocessing, sklearn.base, sklearn.utils, sklearn.model_selection, sklearn.gaussian_process, sklearn.linear_model
import optunity
import statsmodels.nonparametric.smoothers_lowess

# Various Python tricks and libraries
import re
import requests
import time
import functools
import operator
import collections
from tqdm import tqdm, tqdm_notebook, tnrange
import dill as pickle
import IPython
import gc
import json
from collections import OrderedDict

# Parallel
import joblib
import multiprocessing


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.


In [2]:
class Minibatch(object):
    '''
    Makes batches from dataframes.
    Executes n_epochs before raising StopIteration.
    Allows manually setting aside a test set, or automatically randomly selecting one.
    Progress bar.
    Note: "train set" is preferred to "training set", e.g. train_index, X_train

    The last column of the dataframe is used as the target y, all other
    columns as the features X.

    Variables:
    n_minibatches, i_minibatch
    n_epochs, @property i_epoch
    '''

    @staticmethod
    def _is_given(index):
        # An index counts as "given" when it is a non-empty sequence. Testing
        # len() instead of truthiness keeps the old behaviour for lists and
        # also accepts numpy arrays, whose truth value is ambiguous.
        return index is not None and len(index) > 0

    def __init__(self, df, minibatch_size, n_epochs, train_index=None, test_index=None, train_split=None, test_split=None, tqdm=True):
        '''
        df: dataframe whose last column is the target.
        minibatch_size: number of rows drawn (without replacement) per minibatch.
        n_epochs: number of passes over the train set before StopIteration.
        train_index / test_index: positional row indices; if only one is
            given, the other is the complement.
        train_split / test_split: fractions of the rows to draw at random;
            if only one is given, the other set is the complement.
        tqdm: whether to show a notebook progress bar.

        Raises ValueError when neither an index nor a split is specified.
        '''
        self.minibatch_size = minibatch_size
        self.n_epochs = n_epochs
        N = len(df)
        all_index = range(N)

        has_train_index = self._is_given(train_index)
        has_test_index = self._is_given(test_index)

        # determine train_index and test_index
        # given index
        if has_train_index and has_test_index:
            pass
        elif has_train_index:
            test_index = sorted(set(all_index) - set(train_index))
        elif has_test_index:
            train_index = sorted(set(all_index) - set(test_index))
        # given split percentage
        elif train_split and test_split:
            train_index = np.random.choice(N, int(N * train_split), replace=False)
            remaining_index = sorted(set(all_index) - set(train_index))
            test_index = np.random.choice(remaining_index, int(N * test_split), replace=False)
        elif train_split:
            train_index = np.random.choice(N, int(N * train_split), replace=False)
            test_index = sorted(set(all_index) - set(train_index))
        elif test_split:
            test_index = np.random.choice(N, int(N * test_split), replace=False)
            train_index = sorted(set(all_index) - set(test_index))
        else:
            raise ValueError("Either specify index, or specify split.")

        # generate train_df / test_df (positional selection)
        self.train_df = df.iloc[list(train_index)]
        self.test_df = df.iloc[list(test_index)]
        self.df = df

        # minibatch counter; floor division keeps the count an integer on
        # both Python 2 and Python 3
        self.i_minibatch = 0
        self.n_minibatches = n_epochs * len(self.train_df) // minibatch_size

        self.tqdm = tqdm_notebook(total=self.n_minibatches, leave=False) if tqdm else None

    def minibatch(self):
        '''
        Return one random minibatch (X, y) drawn from the train set.
        Raises StopIteration, and resets the counter, once n_minibatches
        batches have been served.
        '''
        # ">=" so that exactly n_minibatches batches are served; ">" served
        # one extra batch and pushed the progress bar past its total.
        if self.i_minibatch >= self.n_minibatches:
            self.i_minibatch = 0
            raise StopIteration
        self.i_minibatch += 1

        if getattr(self, 'tqdm', None) is not None:
            self.tqdm.update(1)

        index = np.random.choice(len(self.train_df), self.minibatch_size, replace=False)
        return self.train_df.iloc[index, :-1].values, self.train_df.iloc[index, -1].values.reshape(-1, 1)

    def train_set(self):
        '''Return the whole train set as (X, y), with y of shape (n, 1).'''
        return self.train_df.iloc[:, :-1].values, self.train_df.iloc[:, -1].values.reshape(-1, 1)

    def test_set(self):
        '''Return the whole test set as (X, y), with y of shape (n, 1).'''
        return self.test_df.iloc[:, :-1].values, self.test_df.iloc[:, -1].values.reshape(-1, 1)

    @property
    def i_epoch(self):
        # the (fractional) number of epochs completed so far
        if not self.n_minibatches:
            return 0.0
        return float(self.i_minibatch) * self.n_epochs / self.n_minibatches

In [3]:
class BetterYhatLive(object):
    '''
    Live two-panel training monitor.
    Left panel: (i_epoch, r2) curves for the train and test sets.
    Right panel: (y, yhat) scatter for the train and test sets.
    '''

    def __init__(self, smoothen):
        # smoothen: when truthy, the r2 curves are LOWESS-smoothed before plotting
        figure, axes = plt.subplots(1, 2, figsize=(14, 4.2))
        self.fig = figure
        self.ax_decay, self.ax_corr = axes
        self.ax_corr.set_aspect('equal', adjustable='datalim')
        self.i_epochs = []
        self.line_decay_train = []
        self.line_decay_test = []
        self.smoothen = smoothen

    def update(self, i_epoch, y_train, yhat_train, y_test, yhat_test):
        '''Record the r2 scores for this step and redraw both panels.'''
        self.i_epochs.append(i_epoch)
        self.line_decay_train.append(sklearn.metrics.r2_score(y_train, yhat_train))
        self.line_decay_test.append(sklearn.metrics.r2_score(y_test, yhat_test))

        # curves to draw: raw history, or a LOWESS-smoothed version of it
        if not self.smoothen:
            curve_train = self.line_decay_train
            curve_test = self.line_decay_test
        else:
            steps = range(len(self.line_decay_train))
            curve_train, curve_test = [
                statsmodels.nonparametric.smoothers_lowess.lowess(history, steps, is_sorted=True, frac=0.25, it=1, return_sorted=False)
                for history in (self.line_decay_train, self.line_decay_test)
            ]

        # legends always show the latest unsmoothed score
        legend_train = '$r^2_{train}$=%.2f' % self.line_decay_train[-1]
        legend_test = '$r^2_{test}$=%.2f' % self.line_decay_test[-1]

        # left panel: r2 against epoch
        decay_axis = self.ax_decay
        decay_axis.clear()
        decay_axis.plot(self.i_epochs, curve_train, label=legend_train)
        decay_axis.plot(self.i_epochs, curve_test, label=legend_test)
        decay_axis.legend(loc='best')

        # right panel: prediction against truth (clear() resets the aspect, so set it again)
        corr_axis = self.ax_corr
        corr_axis.clear()
        corr_axis.set_aspect('equal', adjustable='datalim')
        corr_axis.scatter(y_train, yhat_train, color='green', s=1, alpha=0.2, label='train')
        corr_axis.scatter(y_test, yhat_test, color='red', s=1, alpha=0.2, label='test')
        corr_axis.legend(loc='best')

        self.fig.canvas.draw()

In [4]:
class LambdaGraph(object):
    '''
    Collects the r2 histories of several BetterYhatLive runs and plots,
    for each run, train r2 (x axis) against test r2 (y axis) with plotly.
    '''

    def __init__(self):
        # label -> float32 array of shape (n_updates, 3) with columns
        # (i_epoch, r2_train, r2_test), kept in insertion order
        self.bs = OrderedDict()

    def add(self, b, label): # b is a BetterYhatLive instance
        # zip() is a lazy iterator on Python 3, so materialise it before
        # handing it to numpy; reshape keeps an empty history 2-D.
        rows = list(zip(b.i_epochs, b.line_decay_train, b.line_decay_test))
        self.bs[label] = np.array(rows, dtype=np.float32).reshape(-1, 3)

    def draw(self):
        '''Draw one marker trace per stored run.'''
        traces = [
            go.Scatter(
                x = b[:, 1],
                y = b[:, 2],
                mode = 'markers',
                name = label
            )
            # .items() exists on both Python 2 and 3 (.iteritems() is Python 2 only)
            for label, b in self.bs.items()
        ]
        py.iplot(traces)

In [5]:
# Boilerplate for plotting a dataframe. Do not attempt a function.
# traces = [
#     go.Scatter(x = df.index, y = df.loc[:, column], mode = 'markers+lines', name = column) 
#     for column in df.columns
# ]
# py.iplot(traces, filename='threshold_errors')   

# Get and tame the data

Read the log file and make it compatible with the logical definitions used below.

In [6]:
def read_log(path='ml_queue.nanaimo.20170101-20180704.log'):
    '''
    Read a whitespace-separated job log and return a cleaned job table.

    path: log file to read; defaults to the nanaimo log used in this notebook.

    The second physical line of the file is skipped, 'UNLIMITED' and
    'Unknown' are treated as missing, and any row with a missing or
    unparseable value is dropped. The original row index is kept, so it is
    not contiguous after rows are dropped.

    Returned columns (besides any untouched ones such as State):
    nodes (int), eta (timedelta), submit / begin / end (datetime),
    user, wait (hours between submit and begin).
    '''
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    raw = pd.read_csv(filepath_or_buffer=path, sep=r'\s+', header='infer', skiprows=[1], na_values=['UNLIMITED', 'Unknown']).dropna()
    jobs = raw.copy()

    jobs['nodes'] = pd.to_numeric(jobs['NNodes'], errors='coerce', downcast='integer')

    # job_name not implemented

    # 'D-HH:MM:SS' -> 'Dday HH:MM:SS', which pd.to_timedelta understands
    jobs['eta'] = pd.to_timedelta(jobs['Timelimit'].str.replace('-', 'day '), errors='coerce')

    jobs['submit'] = pd.to_datetime(jobs['Submit'], errors='coerce')
    jobs['begin'] = pd.to_datetime(jobs['Start'], errors='coerce')
    jobs['end'] = pd.to_datetime(jobs['End'], errors='coerce')

    jobs['user'] = jobs['User'] # uid not implemented

    # subfile not implemented

    # scancelled not implemented

    # waiting time in hours
    jobs['wait'] = (jobs['begin'] - jobs['submit']).dt.total_seconds() / 3600.0

    # drop rows where any conversion above was coerced to NaN/NaT
    jobs = jobs.dropna()

    jobs = jobs.drop(columns=['User', 'NNodes', 'Timelimit', 'Submit', 'Start', 'End'])

    return jobs

In [7]:
# Load the cleaned job table; SQ, should_not_wait and did_not_wait read it as the global `jobs`.
jobs = read_log()

In [8]:
# Preview the first rows to sanity-check the parsed columns.
jobs.head()

Unnamed: 0,State,nodes,eta,submit,begin,end,user,wait
0,CANCELLED+,1,2 days,2017-08-01 09:42:07,2017-08-01 09:42:08,2017-08-02 23:03:56,nicola,0.000278
1,CANCELLED+,1,2 days,2017-08-01 09:43:03,2017-08-01 09:43:04,2017-08-02 23:03:57,nicola,0.000278
2,CANCELLED+,1,2 days,2017-08-01 09:43:51,2017-08-01 09:43:51,2017-08-02 23:03:59,nicola,0.0
3,CANCELLED+,1,2 days,2017-08-01 09:44:35,2017-08-01 09:44:35,2017-08-02 23:04:01,nicola,0.0
4,CANCELLED+,1,2 days,2017-08-01 09:45:28,2017-08-01 09:45:28,2017-08-02 23:04:04,nicola,0.0


# Ready to start

`i` is implemented as the original index of the log. Note that after some jobs have been dropped, `i` is no longer contiguous.

Anything that happens at the exact moment of submission is assumed to be unknown to the job being submitted.

For the first 100 jobs, the queue Q is considered impossible to analyze.

In [12]:
def SQ(i):
    '''
    Return (S, Q) as seen at the moment job i was submitted.
    S: jobs in service (began strictly before, ended at or after the submit time).
    Q: jobs in the wait queue (submitted strictly before, began at or after the submit time).
    Reads the global `jobs` table. Raises Exception for i < 100.
    '''
    if i < 100:
        raise Exception

    t_submit = jobs.loc[i].submit

    # in service at t_submit
    started_before = jobs.begin < t_submit
    still_running = jobs.end >= t_submit
    in_service = jobs.loc[started_before & still_running]

    # waiting at t_submit
    submitted_before = jobs.submit < t_submit
    not_yet_started = jobs.begin >= t_submit
    waiting = jobs.loc[submitted_before & not_yet_started]

    return in_service, waiting

In [13]:
def should_not_wait(i, C):
    '''
    True when the nodes in service plus the nodes queued plus the nodes
    requested by job i fit within C nodes at job i's submit time.
    '''
    in_service, waiting = SQ(i)
    nodes_demanded = in_service.nodes.sum() + waiting.nodes.sum() + jobs.loc[i].nodes
    return nodes_demanded <= C

def did_not_wait(i, threshold):
    '''True when job i's recorded `wait` is strictly below `threshold`.'''
    observed_wait = jobs.loc[i].wait
    return observed_wait < threshold

Infer Threshold.

In [None]:
# Sweep the "did not wait" threshold at a fixed capacity C and count how
# often the capacity model and the observed wait disagree.
array_threshold_errors = []
C = 32

# Neither the set of analysable jobs nor the capacity prediction depends on
# the threshold, so both are computed once instead of once per threshold.
valid_indices = [idx for idx in jobs.index if idx > 100]

# One pool for the whole cell, always shut down afterwards; creating a new
# Pool per iteration without closing it leaks worker processes.
pool = multiprocessing.Pool(processes=20)
try:
    array_should_not_wait = pool.map(
        functools.partial(should_not_wait, C=C),
        valid_indices
    )
finally:
    pool.close()
    pool.join()
jobs.loc[valid_indices, 'should_not_wait'] = array_should_not_wait
predicted_free = np.asarray(array_should_not_wait, dtype=bool)

for threshold in tqdm_notebook([0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1]):

    jobs.loc[valid_indices, 'did_not_wait'] = jobs.loc[valid_indices, 'wait'] < threshold
    observed_free = (jobs.loc[valid_indices, 'wait'] < threshold).values

    # model says "no wait", job waited
    should_not_wait_but_waited = np.sum(predicted_free & ~observed_free)

    # model says "wait", job did not
    should_wait_but_did_not = np.sum(~predicted_free & observed_free)

    # model and observation agree on "no wait"
    should_not_wait_did_not_wait = np.sum(predicted_free & observed_free)

    array_threshold_errors.append([threshold, should_not_wait_but_waited, should_wait_but_did_not, should_not_wait_did_not_wait])

threshold_errors = pd.DataFrame(array_threshold_errors, columns=['threshold', 'should_not_wait_but_waited', 'should_wait_but_did_not', 'should_not_wait_did_not_wait'])
threshold_errors = threshold_errors.set_index('threshold')


In [15]:
# Plot every error-count column against the threshold index, one trace per column.
df = threshold_errors
traces = [
    go.Scatter(x = df.index, y = df.loc[:, column], mode = 'markers+lines', name = column) 
    for column in df.columns
]
py.iplot(traces, filename='ml_queue_0803.sec2.fig1')   

Infer C.

In [16]:
# Sweep the capacity C at a fixed threshold and count how often the
# capacity model and the observed wait disagree.
array_C_errors = []
threshold = 0.0008

# The analysable jobs and the observed "did not wait" flag do not depend on
# C, so they are computed once, outside the loop.
valid_indices = [idx for idx in jobs.index if idx > 100]
jobs.loc[valid_indices, 'did_not_wait'] = jobs.loc[valid_indices, 'wait'] < threshold
observed_free = (jobs.loc[valid_indices, 'wait'] < threshold).values

# One pool for the whole sweep, always shut down afterwards; creating a new
# Pool per iteration without closing it leaks worker processes.
pool = multiprocessing.Pool(processes=20)
try:
    for C in tnrange(28, 38):

        array_should_not_wait = pool.map(
            functools.partial(should_not_wait, C=C),
            valid_indices
        )
        jobs.loc[valid_indices, 'should_not_wait'] = array_should_not_wait
        predicted_free = np.asarray(array_should_not_wait, dtype=bool)

        # model says "no wait", job waited
        should_not_wait_but_waited = np.sum(predicted_free & ~observed_free)

        # model says "wait", job did not
        should_wait_but_did_not = np.sum(~predicted_free & observed_free)

        # model and observation agree on "no wait"
        should_not_wait_did_not_wait = np.sum(predicted_free & observed_free)

        array_C_errors.append([C, should_not_wait_but_waited, should_wait_but_did_not, should_not_wait_did_not_wait])
finally:
    pool.close()
    pool.join()

C_errors = pd.DataFrame(array_C_errors, columns=['C', 'should_not_wait_but_waited', 'should_wait_but_did_not', 'should_not_wait_did_not_wait'])
C_errors = C_errors.set_index('C')




In [17]:
# Plot every error-count column against the capacity C index, one trace per column.
df = C_errors
traces = [
    go.Scatter(x = df.index, y = df.loc[:, column], mode = 'markers+lines', name = column) 
    for column in df.columns
]
py.iplot(traces, filename='ml_queue_0803.sec2.fig2')   

Understand the two types of errors.

In [None]:
# Stop at the first job where the capacity model and the observed wait disagree,
# leaving `i`, `S` and `Q` set for inspection in the cells below.
C = 32
threshold = 0.0008
exceptional_cases = [193, 128]  # indices excluded from the search

for i in tqdm_notebook(valid_indices):
    if i not in exceptional_cases and should_not_wait(i, C) != did_not_wait(i, threshold):
        S, Q = SQ(i)
        break

In [None]:
# Model prediction for job i (the job the search loop stopped on).
should_not_wait(i, C)

In [None]:
# Observed outcome for the same job i.
did_not_wait(i, threshold)

In [None]:
# Full record of job i.
jobs.loc[i]

In [None]:
# Jobs in service when job i was submitted (first value returned by SQ).
S

In [None]:
# Jobs in the wait queue when job i was submitted (second value returned by SQ).
Q

In [None]:
# Nodes already claimed by in-service and queued jobs, excluding job i itself; compare with C.
S.nodes.sum() + Q.nodes.sum()

From what I've seen above, most exceptions are confounding - in particular not scancelled - so this is good enough.

# Trash

In [None]:
def read_log(path='ml_queue.nanaimo.20170101-20180704.log'):
    '''
    Read a whitespace-separated job log, keeping the original column names.

    NOTE(review): this redefines read_log and shadows the earlier definition
    in this notebook, which returns lowercase columns (nodes, eta, submit,
    begin, end, user, wait) instead.

    path: log file to read; defaults to the nanaimo log used in this notebook.

    Submit / Start / End are converted to datetimes, NNodes to integers and
    Timelimit to timedeltas; Waited is the time from Submit to Start in
    hours. Rows with missing or unparseable values are dropped and the
    original row index is kept.
    '''
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    jobs = pd.read_csv(filepath_or_buffer=path, sep=r'\s+', header='infer', skiprows=[1], na_values=['UNLIMITED', 'Unknown']).dropna().copy()

    # Assign whole columns with jobs[col] = ... rather than jobs.loc[:, col] = ...:
    # pandas >= 2.0 performs the .loc form in place, which leaves the converted
    # values in an object-dtype column instead of datetime64 / timedelta64.
    jobs['Submit'] = pd.to_datetime(jobs['Submit'], errors='coerce')
    jobs['Start'] = pd.to_datetime(jobs['Start'], errors='coerce')
    jobs['End'] = pd.to_datetime(jobs['End'], errors='coerce')

    jobs['NNodes'] = pd.to_numeric(jobs['NNodes'], errors='coerce', downcast='integer')

    # 'D-HH:MM:SS' -> 'Dday HH:MM:SS', which pd.to_timedelta understands
    jobs['Timelimit'] = pd.to_timedelta(jobs['Timelimit'].str.replace('-', 'day '), errors='coerce')

    # waiting time in hours
    jobs['Waited'] = (jobs['Start'] - jobs['Submit']).dt.total_seconds() / 3600.0

    # drop rows where any conversion above was coerced to NaN/NaT
    jobs = jobs.dropna()

    return jobs

In [None]:
# NOTE(review): rebinds the global `jobs` using whichever read_log was defined last;
# the redefinition in this section keeps the original column names (Submit, Start, End, NNodes, ...).
jobs = read_log()

In [None]:
def sample(tuple_):
    '''
    Build the one-row feature frame for a single job.

    tuple_: (jobs, index, row) - the full job table, the index label of
        this job and its row. Packed into one argument so the function can
        be used with Pool.map.

    Returns None for jobs that are skipped (index < 100, or submitted
    before any job in the table has ended); otherwise a DataFrame indexed
    by [index] with columns NNodes, ETA, Total_squeue_nnodes,
    Total_squeue_eta, Self_job_in_squeue_percentage, Waited.
    '''
    jobs, index, row = tuple_

    this_job = row
    now = this_job.Submit

    # Skip jobs that cannot be analysed. Checked before the O(n) mask below
    # so that rejected jobs cost almost nothing.
    if index < 100 or now < jobs.End.min():
        return None

    # excludes this job, includes jobs submitted simultaneously but ranked earlier
    related = jobs.loc[np.logical_and.reduce([jobs.index != index, jobs.Submit <= now, jobs.End > now])]

    # 0. If there is free capacity the job does not need to wait.
    ## Deliberately not trimming these cases; the results look better this way.
    # if related.NNodes.sum() + this_job.NNodes <= 36:
    #     # almost always this_job.Waited < 0.005
    #     return None

    # 1. Descriptors
    n_related = len(related)
    if n_related == 0:
        own_share = 0
    else:
        # fraction of the related jobs that belong to this job's user
        own_share = float(np.sum(related.User == this_job.User)) / n_related

    column_order = ['NNodes', 'ETA', 'Total_squeue_nnodes', 'Total_squeue_eta', 'Self_job_in_squeue_percentage', 'Waited']
    record = {
        'NNodes': this_job.NNodes,
        'ETA': this_job.Timelimit,
        'Total_squeue_nnodes': related.NNodes.sum(),
        'Total_squeue_eta': related.Timelimit.sum(),
        'Self_job_in_squeue_percentage': own_share,
        'Waited': this_job.Waited,
    }
    # `columns=` fixes the column order regardless of dict ordering
    return pd.DataFrame(record, index=[index], columns=column_order)
    
    
# Run sample() for every job in worker processes. The pool is always shut
# down afterwards so its worker processes do not outlive the cell.
pool = multiprocessing.Pool(processes=20)
try:
    L = pool.map(sample, [(jobs, index, row.copy()) for index, row in tqdm(jobs.iterrows(), total=len(jobs))])
finally:
    pool.close()
    pool.join()
analyses = pd.concat(L) # None is automatically ignored

In [None]:
# Trim the tail: keep only rows whose Waited is below the 97th percentile.
analyses = analyses[analyses.Waited < analyses.Waited.quantile(q=0.97)].copy()

In [None]:
# Convert every column to numeric, standardise with sklearn.preprocessing.scale,
# and wrap the result back into a frame with the original index and column names.
normalize_analyses = pd.DataFrame(
    sklearn.preprocessing.scale(
        analyses.apply(pd.to_numeric).values
    ), 
    index=analyses.index, 
    columns=analyses.columns
)


In [None]:
# Train a small fully connected regressor on the normalised features and
# monitor train/test r2 live.
minibatch_size = 256
n_epochs = 256
ns_units = [25, 25]  # hidden layer widths
lr = 1E-3

# Graph
tf.reset_default_graph()

# Minibatch treats the last column as the target, so every other column is a feature.
n_features = normalize_analyses.shape[1] - 1
X = tf.placeholder(name="X", dtype=tf.float32, shape=[None, n_features])

h = X
for n_units in ns_units:
    h = tf.layers.dense(h, units=n_units, activation=tf.nn.elu)
h = tf.layers.dense(h, units=1, activation=None)
yhat = h

y = tf.placeholder(name="y", dtype=tf.float32, shape=[None, 1])

# mean squared error
loss = tf.reduce_mean(tf.square(yhat - y), keepdims=False)
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# NOTE(review): the fixed index ranges require at least 20000 rows in normalize_analyses.
m = Minibatch(normalize_analyses, minibatch_size, n_epochs, train_index=range(10000), test_index=range(10000, 20000))
b = BetterYhatLive(smoothen=False)

# Session; try/finally releases it even if training raises or the cell is interrupted
sess = tf.Session()
try:
    sess.run(tf.global_variables_initializer())

    while True:
        # only minibatch() signals the end of training, so only it is guarded
        try:
            _X, _y = m.minibatch()
        except StopIteration:
            break

        sess.run(train_op, feed_dict = {X: _X, y: _y})

        # refresh the live plot every 20 minibatches
        if m.i_minibatch % 20 == 0:
            X_train, y_train = m.train_set()
            yhat_train = sess.run(yhat, feed_dict = {X: X_train})
            X_test, y_test = m.test_set()
            yhat_test = sess.run(yhat, feed_dict = {X: X_test})
            b.update(m.i_epoch, y_train, yhat_train, y_test, yhat_test)
finally:
    sess.close()

In [None]:
# Register the monitor `b` under two labels and draw them.
# NOTE(review): both labels are given the same `b`, so the two traces coincide.
l = LambdaGraph()
l.add(b, label='25-25 $\\mu$')
l.add(b, label='25-25 $\\xi$')
l.draw()

In [None]:
# Flag every analysable job with the capacity model's prediction and the
# observed outcome.
# NOTE(review): this cell reads the lowercase 'wait' column, which only the
# first read_log definition in this notebook produces.
C = 36

valid_indices = [idx for idx in jobs.index if idx > 100]

# The pool is always shut down afterwards so its worker processes do not
# outlive the cell.
pool = multiprocessing.Pool(processes=20)
try:
    array_should_not_wait = pool.map(
        functools.partial(should_not_wait, C=C),
        valid_indices
    )
finally:
    pool.close()
    pool.join()
jobs.loc[valid_indices, 'should_not_wait'] = array_should_not_wait

# a job "did not wait" when its wait is below 0.005
jobs.loc[valid_indices, 'did_not_wait'] = jobs.loc[valid_indices, 'wait'] < 0.005

In [None]:
# Inspect the rows whose index labels lie between 101 and 110 (label slice, both ends included).
jobs.loc[101:110]

In [None]:
# Count jobs the model says should not wait but that did wait.
# The flag columns hold NaN for rows outside valid_indices and are therefore
# object dtype; cast to bool first, because `~` on Python bool objects is a
# bitwise integer inversion (~True == -2) rather than a logical negation.
(jobs.loc[valid_indices, 'should_not_wait'].astype(bool) & ~jobs.loc[valid_indices, 'did_not_wait'].astype(bool)).sum()

In [None]:
# Count jobs the model says should wait but that did not wait.
# The flag columns hold NaN for rows outside valid_indices and are therefore
# object dtype; cast to bool first, because `~` on Python bool objects is a
# bitwise integer inversion (~True == -2) rather than a logical negation.
(~jobs.loc[valid_indices, 'should_not_wait'].astype(bool) & jobs.loc[valid_indices, 'did_not_wait'].astype(bool)).sum()

In [None]:
# Count jobs where the model and the observation agree that the job did not have to wait.
(jobs.loc[valid_indices, 'should_not_wait'] & jobs.loc[valid_indices, 'did_not_wait']).sum()