In [98]:
# Numpy
import numpy as np
import pandas as pd
%matplotlib nbagg
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.patches as patches
import scipy.stats, scipy.interpolate, scipy.spatial

# Machine learning
import tensorflow as tf
import keras
import sklearn
import sklearn.preprocessing, sklearn.base, sklearn.utils, sklearn.model_selection, sklearn.gaussian_process, sklearn.linear_model
import optunity
import statsmodels.nonparametric.smoothers_lowess

# Various Python tricks and libraries
import re
import requests
import time
import functools
import operator
import collections
from tqdm import tqdm, tqdm_notebook, tnrange
import dill as pickle
import IPython
import gc
import json

# Parallel
import joblib
import multiprocessing

# ML visualization

In [26]:
def read_log(path='ml_queue.nanaimo.20170101-20180704.log'):
    '''
    Load the scheduler job log into a cleaned DataFrame.

    Parameters
    ----------
    path : str, optional
        Whitespace-delimited log file to read. Defaults to the file this
        notebook was written against.

    Returns
    -------
    pandas.DataFrame
        One row per job, with every row containing a missing or unparsable
        value dropped. ``Submit``/``Start``/``End`` are datetimes, ``NNodes``
        is numeric, ``Timelimit`` is a timedelta and the added ``Waited``
        column is ``Start - Submit`` expressed in hours.
    '''
    jobs = pd.read_csv(
        filepath_or_buffer=path,
        sep=r'\s+',
        header='infer',
        skiprows=[1],  # the second physical line of the file is not data
        na_values=['UNLIMITED', 'Unknown'],
    ).dropna().copy()  # explicit copy: the column assignments below must own their data

    # Whole-column assignment, so each column really takes the converted dtype.
    for column in ('Submit', 'Start', 'End'):
        jobs[column] = pd.to_datetime(jobs[column], errors='coerce')

    jobs['NNodes'] = pd.to_numeric(jobs['NNodes'], errors='coerce', downcast='integer')

    # Turn the '-' separator into 'day ' before parsing
    # (presumably limits look like 'D-HH:MM:SS' -- confirm against the log).
    jobs['Timelimit'] = pd.to_timedelta(jobs['Timelimit'].str.replace('-', 'day '), errors='coerce')

    # Queue wait in hours.
    jobs['Waited'] = (jobs['Start'] - jobs['Submit']).values / pd.Timedelta('1h')

    # jobs = jobs[(np.abs(scipy.stats.zscore(jobs.loc[:,'Wait'].values / pd.Timedelta('1h'))) <3)]

    # Rows whose conversion was coerced to NaT/NaN above are discarded here.
    jobs = jobs.dropna()

    # jobs = jobs.sort_values(by = 'Submit')
    # (almost always already sorted)

    return jobs

In [27]:
# Parse the job log once; the cells below read from this `jobs` frame.
jobs = read_log()

In [28]:
def sample(tuple_):
    '''
    Describe the state of the queue at the moment one job was submitted.

    Parameters
    ----------
    tuple_ : tuple
        ``(jobs, index, row)``: the full job table, the index label of the job
        under study, and that job's row. Packed into a single argument so the
        function can be handed to ``Pool.map``.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame indexed by ``index`` with the columns ``NNodes``,
        ``ETA``, ``Total_squeue_nnodes``, ``Total_squeue_eta``,
        ``Self_job_in_squeue_percentage`` and ``Waited`` (in that order; the
        last column is the regression target downstream). ``None`` when the
        job is skipped.
    '''
    jobs, index, row = tuple_
    now = row.Submit

    # Skip jobs submitted before any job in the table has ended, and the first
    # 100 index labels (presumably a warm-up window -- confirm). Done first so
    # skipped jobs do not pay for the full-table mask below.
    if now < jobs.End.min() or index < 100:
        return None

    # Jobs submitted at or before `now` that had not ended yet, excluding this
    # job itself. Jobs submitted at exactly the same instant are included.
    in_queue = np.logical_and.reduce([jobs.index != index, jobs.Submit <= now, jobs.End > now])
    related = jobs.loc[in_queue]

    # Step 0: if there is free room, the job does not have to wait.
    ## Deliberately NOT filtering these out; the results look better this way xD
    # if related.NNodes.sum() + row.NNodes <= 36:
    #     # almost always row.Waited < 0.005
    #     return None

    # Step 1: descriptors.
    n_related = len(related)
    if n_related == 0:
        own_share = 0.0
    else:
        # fraction of the queued jobs that belong to the submitting user
        own_share = float(np.sum(related.User == row.User)) / n_related

    # Build the row in one go with an explicit column order.
    return pd.DataFrame(
        {
            'NNodes': row.NNodes,
            'ETA': row.Timelimit,
            'Total_squeue_nnodes': related.NNodes.sum(),
            'Total_squeue_eta': related.Timelimit.sum(),
            'Self_job_in_squeue_percentage': own_share,
            'Waited': row.Waited,
        },
        index=[index],
        columns=[
            'NNodes',
            'ETA',
            'Total_squeue_nnodes',
            'Total_squeue_eta',
            'Self_job_in_squeue_percentage',
            'Waited',
        ],
    )
    
    
# Fan the per-job feature extraction out over worker processes.
# NOTE(review): every task tuple carries the whole `jobs` frame, so it is
# pickled once per job; consider sharing it with the workers instead.
tasks = [(jobs, index, row.copy()) for index, row in tqdm(jobs.iterrows(), total=len(jobs))]

pool = multiprocessing.Pool(processes=20)
try:
    L = pool.map(sample, tasks)
except BaseException:
    # includes KeyboardInterrupt: kill the workers instead of leaving them blocked
    pool.terminate()
    raise
else:
    pool.close()
finally:
    pool.join()

analyses = pd.concat(L) # None is automatically ignored

100%|██████████| 24290/24290 [00:04<00:00, 5646.07it/s]
Process PoolWorker-20:
Process PoolWorker-10:
Process PoolWorker-13:
Process PoolWorker-9:
Process PoolWorker-16:
Process PoolWorker-14:
Process PoolWorker-11:
Process PoolWorker-3:
Traceback (most recent call last):
Process PoolWorker-7:
Process PoolWorker-18:
Process PoolWorker-5:
Process PoolWorker-2:
Process PoolWorker-1:
Traceback (most recent call last):
Process PoolWorker-17:
Process PoolWorker-6:
Traceback (most recent call last):
Process PoolWorker-4:
Traceback (most recent call last):
Process PoolWorker-19:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process PoolWorker-12:
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Traceback (most recent call last):
Process PoolWorker-15:
Traceback (most recent call last):
Process PoolWorker-8:
Traceback (most recent call last):
  File "/usr/lib/python2.7/m

  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 378, in get
    racquire()
    racquire()
    racquire()
    racquire()
    racquire()
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    racquire()
    racquire()
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    racquire()
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    racquire()
    racquire()
KeyboardInterrupt
    return recv()
KeyboardInterrupt
    racquire()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
    racquire()
KeyboardInterrupt
    racquire()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
    racquire()
KeyboardInterrupt
KeyboardInterrupt
    racquire()
K

In [29]:
# Trim the tail: keep only the rows whose wait is below the 97th percentile.
# NOTE: this rebinds `analyses`, so re-running the cell trims again.
waited_cutoff = analyses.Waited.quantile(q=0.97)
analyses = analyses[analyses.Waited < waited_cutoff].copy()

In [30]:
# Convert every column to a plain number, then standardise column-wise while
# keeping the original index and column labels.
# NOTE(review): the statistics are computed over all rows, including those
# later used as the test set -- confirm this is intended.
scaled_values = sklearn.preprocessing.scale(analyses.apply(pd.to_numeric).values)
normalize_analyses = pd.DataFrame(scaled_values, index=analyses.index, columns=analyses.columns)




In [85]:
class Minibatch(object):
    '''
    Makes batches from dataframes.
    Executes n_epochs before raising StopIteration.
    Allows manually setting aside a test set, or automatically randomly selecting one.
    Progress bar.
    Note: "train set" is preferred to "training set", e.g. train_index, X_train

    The last column of the dataframe is the target; all other columns are features.

    Variables:
    n_minibatches, i_minibatch
    n_epochs, @property i_epoch
    '''

    def __init__(self, df, minibatch_size, n_epochs, train_index=None, test_index=None, train_split=None, test_split=None, tqdm=True):
        '''
        Parameters
        ----------
        df : pandas.DataFrame
            Features in all columns but the last, target in the last column.
        minibatch_size : int
            Rows per minibatch (sampled without replacement from the train set).
        n_epochs : int
            Number of passes over the train set before StopIteration.
        train_index, test_index : sequence of int, optional
            Positional row indices. If only one is given the other is its complement.
        train_split, test_split : float, optional
            Fractions of the rows to draw at random; used when no index is given.
            If only one is given the other set is its complement.
        tqdm : bool
            Show a notebook progress bar.

        Raises
        ------
        Exception
            If neither an index nor a split is specified.
        '''
        self.minibatch_size = minibatch_size
        self.n_epochs = n_epochs
        N = len(df)
        all_index = range(N)

        def given(index):
            # len() rather than truthiness so numpy arrays are accepted;
            # an empty index still counts as "not given", as before.
            return index is not None and len(index) > 0

        def complement(index):
            return list(set(all_index) - set(index))

        # determine train_index and test_index
        has_train, has_test = given(train_index), given(test_index)
        # given index
        if has_train and has_test:
            pass
        elif has_train:
            test_index = complement(train_index)
        elif has_test:
            train_index = complement(test_index)
        # given split percentage
        elif train_split and test_split:
            train_index = np.random.choice(N, int(N * train_split), replace=False)
            test_index = np.random.choice(complement(train_index), int(N * test_split), replace=False)
        elif train_split:
            train_index = np.random.choice(N, int(N * train_split), replace=False)
            test_index = complement(train_index)
        elif test_split:
            test_index = np.random.choice(N, int(N * test_split), replace=False)
            train_index = complement(test_index)
        else:
            raise Exception("Either specify index, or specify split.")

        # generate train_df
        self.train_df = df.iloc[train_index]
        self.test_df = df.iloc[test_index]
        self.df = df

        # minibatch counter; floor division keeps the count an integer on Python 3 too
        self.i_minibatch = 0
        self.n_minibatches = n_epochs * len(self.train_df) // minibatch_size

        self.tqdm = None
        if tqdm:
            self.tqdm = tqdm_notebook(total=self.n_minibatches, leave=False)

    def minibatch(self):
        '''
        Return the next random minibatch as (X, y) arrays; y has shape (minibatch_size, 1).

        Raises StopIteration once exactly n_minibatches batches have been served,
        after resetting the counter and closing the progress bar.
        Raises ValueError (from numpy) if minibatch_size exceeds the train set size.
        '''
        # ">=" so that exactly n_minibatches batches are produced
        if self.i_minibatch >= self.n_minibatches:
            self.i_minibatch = 0
            if self.tqdm is not None:
                self.tqdm.close()
                self.tqdm = None
            raise StopIteration
        self.i_minibatch += 1

        if self.tqdm is not None:
            self.tqdm.update(1)

        index = np.random.choice(len(self.train_df), self.minibatch_size, replace=False)
        return self.train_df.iloc[index, :-1].values, self.train_df.iloc[index, -1].values.reshape(-1, 1)

    def train_set(self):
        '''Return the whole train set as (X, y) arrays; y is a column vector.'''
        return self.train_df.iloc[:, :-1].values, self.train_df.iloc[:, -1].values.reshape(-1, 1)

    def test_set(self):
        '''Return the whole test set as (X, y) arrays; y is a column vector.'''
        return self.test_df.iloc[:, :-1].values, self.test_df.iloc[:, -1].values.reshape(-1, 1)

    @property
    def i_epoch(self):
        # the (fractional) number of epochs completed so far
        if not self.n_minibatches:
            return 0.0
        return float(self.i_minibatch) * self.n_epochs / self.n_minibatches

In [86]:
class BetterYhatLive(object):
    '''
    Live two-panel training monitor.
    Left panel: r2 on the train and test sets against i_epoch.
    Right panel: scatter of (y, yhat) for the train and test sets.
    '''

    def __init__(self, smoothen):
        # smoothen: when truthy, the r2 curves are LOWESS-smoothed before plotting
        figure, axes = plt.subplots(1, 2, figsize=(14, 4.2))
        self.fig = figure
        self.ax_decay, self.ax_corr = axes
        self.ax_corr.set_aspect('equal', adjustable='datalim')
        self.i_epochs = []
        self.line_decay_train = []
        self.line_decay_test = []
        self.smoothen = smoothen

    def update(self, i_epoch, y_train, yhat_train, y_test, yhat_test):
        '''Record the r2 scores for this evaluation point and redraw both panels.'''
        r2 = sklearn.metrics.r2_score
        self.i_epochs.append(i_epoch)
        self.line_decay_train.append(r2(y_train, yhat_train))
        self.line_decay_test.append(r2(y_test, yhat_test))

        # curves to draw: raw history, or its LOWESS-smoothed version
        if self.smoothen:
            positions = range(len(self.line_decay_train))
            lowess = statsmodels.nonparametric.smoothers_lowess.lowess

            def smooth(series):
                return lowess(series, positions, is_sorted=True, frac=0.25, it=1, return_sorted=False)

            curve_train = smooth(self.line_decay_train)
            curve_test = smooth(self.line_decay_test)
        else:
            curve_train = self.line_decay_train
            curve_test = self.line_decay_test

        # legends always show the latest unsmoothed score
        label_train = '$r^2_{train}$=%.2f'%self.line_decay_train[-1]
        label_test = '$r^2_{test}$=%.2f'%self.line_decay_test[-1]

        # left panel: r2 history
        decay = self.ax_decay
        decay.clear()
        decay.plot(self.i_epochs, curve_train, label=label_train)
        decay.plot(self.i_epochs, curve_test, label=label_test)
        decay.legend(loc='best')

        # right panel: truth against prediction
        corr = self.ax_corr
        corr.clear()
        corr.set_aspect('equal', adjustable='datalim')
        corr.scatter(y_train, yhat_train, color='green', s=1, alpha=0.2, label='train')
        corr.scatter(y_test, yhat_test, color='red', s=1, alpha=0.2, label='test')
        corr.legend(loc='best')

        self.fig.canvas.draw()

In [87]:
import colorsys

def RGBToHex(r, g, b):
    '''Format three integer channel values as a '#rrggbb' hex colour string.'''
    return '#' + ''.join('%02x' % channel for channel in (r, g, b))
 
def HSVToRGB(h, s, v):
    '''Convert an HSV colour to a '#rrggbb' hex string via colorsys and RGBToHex.'''
    # colorsys returns floats; scale to 0-255 and truncate to int
    channels = [int(255 * component) for component in colorsys.hsv_to_rgb(h, s, v)]
    return RGBToHex(*channels)
 
def getDistinctColors(n):
    '''Return n fully saturated, full-brightness hex colours with hues spaced 1/(n+1) apart.'''
    hue_step = 1.0 / (n + 1)
    return [HSVToRGB(hue_step * k, 1.0, 1.0) for k in range(n)]

In [100]:
# Hyper-parameters
minibatch_size = 256
n_epochs = 256
ns_units = [25, 25]   # width of each hidden layer
lr = 1E-3
eval_every = 20       # evaluate and redraw every this many minibatches

# Minibatch treats the last column as the target, so every other column is a feature.
n_features = normalize_analyses.shape[1] - 1

# Graph
tf.reset_default_graph()

X = tf.placeholder(name="X", dtype=tf.float32, shape=[None, n_features])

h = X
for n_units in ns_units:
    h = tf.layers.dense(h, units=n_units, activation=tf.nn.elu)
h = tf.layers.dense(h, units=1, activation=None)
yhat = h

y = tf.placeholder(name="y", dtype=tf.float32, shape=[None, 1])

# mean squared error
loss = tf.reduce_mean(tf.square(yhat - y), keepdims=False)
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Session
sess = tf.Session()
try:
    sess.run(tf.global_variables_initializer())

    # NOTE: fixed positional split; requires at least 20000 rows in normalize_analyses.
    m = Minibatch(normalize_analyses, minibatch_size, n_epochs, train_index=range(10000), test_index=range(10000, 20000))
    b = BetterYhatLive(smoothen=False)
    # NOTE: LambdaGraph is defined in a later cell of this notebook; run that cell first.
    l = LambdaGraph(m, b, system="waittime_propto_queuelen2")

    while True:
        try:
            _X, _y = m.minibatch()
        except StopIteration:
            break
        sess.run(train_op, feed_dict = {X: _X, y: _y})

        if m.i_minibatch % eval_every == 0:
            X_train, y_train = m.train_set()
            yhat_train = sess.run(yhat, feed_dict = {X: X_train})
            X_test, y_test = m.test_set()
            yhat_test = sess.run(yhat, feed_dict = {X: X_test})
            b.update(m.i_epoch, y_train, yhat_train, y_test, yhat_test)
finally:
    # release the session even if training is interrupted
    sess.close()

<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [105]:
# Interactively choose which recorded (r2_train, r2_test) points to append to the run log under this label.
# NOTE(review): the label (layers [128, 64, 32], minibatch=1024, epochs=6400) does not match the training
# cell's settings (ns_units=[25, 25], minibatch_size=256, n_epochs=256) -- confirm which one is right.
l.update(run="[128, 64, 32], elu, Adam, lr=0.001, minibatch=1024, epochs=6400, repeat 2")

<IPython.core.display.Javascript object>

Indicate which points to save by entering x or (x,y). Enter None to exit. Enter Auto-5 to perform 5-point analysis.
Point: Auto-5
Recorded: r2_train 0.126, r2_test 0.090
Recorded: r2_train 0.205, r2_test 0.190
Recorded: r2_train 0.261, r2_test 0.204
Recorded: r2_train 0.366, r2_test 0.125
Recorded: r2_train 0.314, r2_test 0.162
Point: None
Finished.


In [106]:
# Convert the accumulated "<system>.LambdaGraph.txt" records into "<system>.LambdaGraph.json".
l.finish()

# Trash

In [99]:
# old LambdaGraph

class LambdaGraph(object):
    '''
    Records selected (r2_train, r2_test) points of a training run and exports them.

    init():
    Minibatch m, BetterYhatLive y, system name
    --
    update():
    Plots (P1, P2).
    Inputs which points to save, finds closest point, reports, saves to txt.
    --
    finish():
    Saves to txt and json (for ogma).
    '''

    def __init__(self, m, y, system):
        self.m = m
        self.y = y
        self.system = system  # used as the file-name prefix of the txt/json outputs

    def update(self, run):
        '''
        Plot r2_test against r2_train for the points recorded so far, then prompt
        for points to append to "<system>.LambdaGraph.txt" under the label `run`.

        Accepted input: "x", "(x, y)", "Auto-5" or "None" (exit). Input that
        cannot be parsed is reported and the prompt is repeated.
        '''
        import ast  # local import: only needed to parse "(x, y)" input without eval

        # Init Data; keep only points whose two r2 values both lie in [0, 1]
        es = np.array(self.y.i_epochs)
        xs = np.array(self.y.line_decay_train)
        ys = np.array(self.y.line_decay_test)
        mask = (xs>=0) & (xs<=1) & (ys>=0) & (ys<=1)
        es = es[mask]
        xs = xs[mask]
        ys = ys[mask]
        if len(xs) == 0:
            # nothing to plot or query; KDTree and argmax would fail on empty data
            print('No recorded point has both r2 values in [0, 1]; nothing to save.')
            return
        # column_stack instead of np.array(zip(...)), which breaks on Python 3
        coors = np.column_stack((xs, ys))
        tree = scipy.spatial.KDTree(coors)

        # Plots (P1, P2). Equal aspect. Grid.
        fig, ax = plt.subplots()
        ax.scatter(xs, ys, alpha=0.5)
        ax.set_aspect('equal', adjustable='datalim')
        start, end = ax.get_xlim()
        ax.xaxis.set_ticks(np.arange(start, end, (end-start)/10.))
        ax.grid()
        fig.canvas.draw()

        # Inputs which points to save
        def save(i):
            with open("%s.LambdaGraph.txt"%self.system, "a") as f:
                f.write("%s | %s | %.3f | %.3f\n" %(run, es[i], xs[i], ys[i]))
            print("Recorded: r2_train %.3f, r2_test %.3f" %(xs[i], ys[i]))

        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input      # Python 3

        print('Indicate which points to save by entering x or (x,y). Enter None to exit. Enter Auto-5 to perform 5-point analysis.')
        while True:
            inp = read_line('Point: ').strip()
            if inp == 'None': # exit
                break
            elif inp == 'Auto-5': # 5-point analysis
                # first point, last point, best test score, and the points whose
                # train score is closest to halfway between each end and the best
                head = 0
                tail = len(xs) - 1
                imax = ys.argmax()
                headhalf = np.abs(xs - (xs[head]+xs[imax])/2).argmin()
                tailhalf = np.abs(xs - (xs[tail]+xs[imax])/2).argmin()
                # collapse an end onto the best point when their test scores are within 0.01
                if np.abs(ys[head] - ys[imax]) < 0.01:
                    head = headhalf = imax
                if np.abs(ys[tail] - ys[imax]) < 0.01:
                    tail = tailhalf = imax
                # de-duplicate; sorted so the records are written in a stable order
                for i in sorted({head, headhalf, imax, tailhalf, tail}):
                    save(i)
            elif '(' in inp: # tuple
                try:
                    x, y = ast.literal_eval(inp)
                except (ValueError, SyntaxError, TypeError):
                    print('Could not read %r as (x, y).' % inp)
                    continue
                # find closest point
                i = tree.query([x, y])[1]
                # reports, saves to txt.
                save(i)
            else: # float
                try:
                    x = float(inp)
                except ValueError:
                    print('Could not read %r as a number.' % inp)
                    continue
                # find closest point
                i = np.abs(xs - x).argmin()
                # reports, saves to txt.
                save(i)
        print('Finished.')

    def finish(self):
        '''
        Read "<system>.LambdaGraph.txt" and write "<system>.LambdaGraph.json"
        with one node per record, coloured by run label.
        '''
        # Saves to txt and json (for ogma).
        with open("%s.LambdaGraph.txt"%self.system, "r") as f:
            lines = [line.strip() for line in f if line.strip()]

        # nodes[] = [label, e, x, y]. need to color code labels.
        nodes = []
        labels = []
        for line in lines:
            # split from the right so a '|' inside the run label cannot break the parse
            label, e, x, y = [part.strip() for part in line.rsplit('|', 3)]
            nodes.append([label, e, float(x), float(y)])
            labels.append(label)

        # color code labels
        le = sklearn.preprocessing.LabelEncoder()
        transformed_labels = le.fit_transform(labels)
        colors = getDistinctColors(len(le.classes_))

        # creates nodes
        jdict = {'nodes': []}
        for i, (label, e, x, y) in enumerate(nodes):
            jdict['nodes'].append({
                'attributes':{
                    'x': x,
                    'y': y,
                    # e is stored as a float string such as "2.0"; int("2.0") would raise
                    'text': "%s - %s" %(label, int(float(e))),
                    'icon': {
                        'color': colors[transformed_labels[i]]
                    }
                }
            })

        # save data
        with open("%s.LambdaGraph.json"%self.system, "w") as f:
            json.dump(jdict, f)