Get the data.

In [1]:
# Numpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import scipy.stats

# Machine learning
import os
import tensorflow as tf
import keras
import sklearn
import sklearn.preprocessing, sklearn.base, sklearn.utils, sklearn.model_selection, sklearn.gaussian_process

import optunity

# Various Python tricks and libraries
import requests
import time
import functools
import operator
import collections
from tqdm import tqdm, tqdm_notebook, tnrange
import dill as pickle
import IPython
import gc
import math

# Parallel
import joblib
import multiprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Imports.

In [None]:
def r2_score(ytrue, ypred): # sklearn.metrics.r2_score in tensorflow. 1 output only. 

    ytrue_mean = tf.reduce_mean(ytrue, name="ytrue_mean")
    r2_score = tf.subtract(1., tf.truediv(tf.reduce_mean((ytrue - ypred) ** 2), tf.reduce_mean((ytrue - ytrue_mean) ** 2)), name="r2_score")
    return r2_score

class TqdmProgBar(keras.callbacks.Callback):
    '''features:
    1. tqdm ETA bar
    2. logs[field] plotted for each field in fields
    '''
    
    def __init__(self, n_epochs, fields, interval=10): 
        self.n_epochs = n_epochs
        self.fields = fields
        self.interval = interval
        
        self.fields_history = dict((field, []) for field in fields)
        self.fig, self.ax = plt.subplots(1, 1)
        
    def on_train_begin(self, logs):
        self.pbar = tqdm_notebook(total=self.n_epochs, leave=False)
        
    def on_train_end(self, logs):
        self.pbar.close()
        
    def on_epoch_end(self, epoch, logs):
        if epoch % self.interval == 0:
            self.pbar.update(self.interval)

            for field in self.fields:
                self.fields_history[field].append(logs[field])

            self.ax.clear()
            for field in self.fields:
                self.ax.plot(self.fields_history[field], label="%s=%.2f" %(field, self.fields_history[field][-1]))
            self.ax.legend(loc='best')
            self.fig.canvas.draw()

In [None]:
def list_to_csline(l):
    '''converts a list to a comma-separated line'''
    return ',\t'.join(map(str, l)) + '\n'

In [None]:
class Minibatch(object):
    '''
    Makes batches from dataframes.
    Executes n_epochs before raising StopIteration and dying.
    Allows setting aside a test set.
    Progress bar.
    '''
    
    def __init__(self, df, minibatch_size, n_epochs, test_split, tqdm=None):
        self.minibatch_size = minibatch_size
        
        N = len(df)
        test_size = int(N * test_split)
        test_index = np.random.choice(N, test_size, replace=False)
        training_index = list(set(range(N)) - set(test_index))
        self.test_df = df.iloc[test_index]
        self.training_df = df.iloc[training_index]
        self.df = df

        self.i = 0
        self.iMAX = n_epochs * len(self.training_df) / minibatch_size
        
        if tqdm:
            self.tqdm = tqdm_notebook(total=self.iMAX, leave=False)
        
    def minibatch(self):
        if self.i > self.iMAX:
            self.i = 0
            raise StopIteration
        else:
            self.i += 1
            
        if getattr(self, 'tqdm', None):
            self.tqdm.update(1)
        
        index = np.random.choice(range(len(self.training_df)), self.minibatch_size, replace=False)
        return self.training_df.iloc[index, :-1].values, self.training_df.iloc[index, -1].values.reshape(-1, 1)
    
    def training_set(self):
        return self.training_df.iloc[:, :-1].values, self.training_df.iloc[:, -1].values.reshape(-1, 1)
    
    def test_set(self):
        return self.test_df.iloc[:, :-1].values, self.test_df.iloc[:, -1].values.reshape(-1, 1)

Get the data.

In [None]:
def read_log():
    jobs = pd.read_csv(filepath_or_buffer='ml_queue.log', sep='\s+', header='infer', skiprows=[1], na_values=['UNLIMITED','Unknown','kijana','root']).dropna()

    jobs.loc[:, 'Submit'] = pd.to_datetime(jobs.loc[:, 'Submit'].copy(), errors='coerce')
    jobs.loc[:, 'Start'] = pd.to_datetime(jobs.loc[:, 'Start'].copy(), errors='coerce')
    jobs.loc[:, 'End'] = pd.to_datetime(jobs.loc[:, 'End'].copy(), errors='coerce')

    jobs.loc[:, 'NNodes'] = pd.to_numeric(jobs.loc[:, 'NNodes'].copy(), errors='coerce', downcast='integer')

    jobs.loc[:, 'Timelimit'] = jobs.loc[:, 'Timelimit'].copy().str.replace('-','day ')
    jobs.loc[:, 'Timelimit'] = pd.to_timedelta(jobs.loc[:, 'Timelimit'].copy(), errors='coerce')
    
    jobs.loc[:, 'Waited'] = (jobs.Start - jobs.Submit).values / pd.Timedelta('1h')

#     jobs = jobs[(np.abs(scipy.stats.zscore(jobs.loc[:,'Wait'].values / pd.Timedelta('1h'))) <3)]
    jobs = jobs.dropna()
    
    jobs = jobs.sort_values(by = 'Submit')
    
    return jobs

In [None]:
jobs = read_log()

In [None]:
def sample(tuple_):
    
    jobs, index, row = tuple_
    
    tj = thisjob = row
    now = tj.Submit
    
    rj = relatedjobs = jobs.loc[np.logical_and.reduce([jobs.index != index, jobs.Submit <= now, jobs.End > now])] # excludes thisjob, includes jobs submitted simultaneously but ranked earlier
    
    # 无关人等滚开
    if now < jobs.End.min() or index < 100 or index == 10826:
        return None
        
    # 零、有空位，就不用等。
    if rj.NNodes.sum() + tj.NNodes <= 29 and np.sum(rj.User == tj.User)<10 and rj.loc[rj.User == tj.User].NNodes.sum() + tj.NNodes<16:
        assert tj.Waited <= 0.3
        
    # 一、各种descriptor
    analysis = pd.DataFrame([], index=[index])
    
    analysis.loc[:, 'NNodes'] = tj.NNodes
    analysis.loc[:, 'ETA']    = tj.Timelimit
    
    analysis.loc[:, 'Total_squeue_nnodes'] = rj.NNodes.sum()
    
    analysis.loc[:, 'Total_squeue_eta']    = rj.Timelimit.sum()
    
    analysis.loc[:, 'Self_job_in_squeue_percentage'] = 0 if len(rj)==0 else float(np.sum(rj.User == tj.User)) / len(rj)
    
    analysis.loc[:, 'Waited'] = tj.Waited
    
    return analysis
    
    
L = multiprocessing.Pool(processes=20).map(sample, [(jobs, index, row.copy()) for index, row in jobs.iterrows()])
analyses = pd.concat(L) # None is automatically ignored

In [None]:
normalize_analyses = pd.DataFrame(sklearn.preprocessing.scale(analyses.apply(pd.to_numeric).values), index=analyses.index, columns=analyses.columns)

Train.

In [None]:
def try_regress(activation, dropout, momentum, log2_minibatch_size, n_epochs, minuslog10_learning_rate, optimizer, n_layers,
               units1, units2=None, units3=None, units4=None, units5=None, units6=None,
               normalize_analyses=normalize_analyses):
    
    minibatch_size = int(2. ** log2_minibatch_size)
    n_epochs = int(n_epochs)
    learning_rate = 10. ** -minuslog10_learning_rate
    n_layers = int(n_layers)
    
    # graph
    tf.reset_default_graph()
    training = tf.placeholder(name="training", dtype=tf.bool)
    h = X = tf.placeholder(name="X", dtype=tf.float32, shape=[None, 5])
    y = tf.placeholder(name="y", dtype=tf.float32, shape=[None, 1])
    
    for units in [units1, units2, units3, units4, units5, units6]:
        if units:
            units = int(units)
            h = tf.layers.dense(h, units)
            h = tf.layers.batch_normalization(h, momentum=momentum, training=training)
            h = getattr(tf.nn, activation)(h)
            h = tf.layers.dropout(h, dropout, training=training)
            
    yhat = tf.layers.dense(h, units=1, name="yhat")
    
    loss = tf.reduce_mean(tf.square(yhat - y), keepdims=False)
    training_op = getattr(tf.train, optimizer)(learning_rate).minimize(loss)
    
    # sess
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    m = Minibatch(normalize_analyses, minibatch_size, n_epochs, test_split=0.2)
    while True:
        try:
            _X, _y = m.minibatch()
            sess.run(training_op, feed_dict={X: _X, y: _y, training: True})
        except StopIteration:
            break
            
    # evaluate on training set
    _X, _y = m.training_set()
    _yhat = sess.run(yhat, feed_dict={X: _X, y: _y, training: False})
    training_r2 = -10 if np.isnan(_yhat).any() else sklearn.metrics.r2_score(_y.reshape(-1), _yhat.reshape(-1))
    
    # evaluate on test set
    _X, _y = m.test_set()
    _yhat = sess.run(yhat, feed_dict={X: _X, y: _y, training: False})
    test_r2 = -10 if np.isnan(_yhat).any() else sklearn.metrics.r2_score(_y.reshape(-1), _yhat.reshape(-1))
    
    sess.close()
    
    # dump output
    with open('pso.csv', 'a') as f:
        f.write(list_to_csline([time.strftime("%Y-%m-%d %H:%M:%S"), activation, dropout, momentum, minibatch_size, n_epochs, learning_rate, optimizer, n_layers, units1, units2, units3, units4, units5, units6, training_r2, test_r2]))
    
    return test_r2

Optunity optimize.

In [None]:
search_space = {
    'activation': { # tf.nn.X
        'tanh': None,
        'elu': None,
        'relu': None
    },
    'dropout': [0, 1],
    'momentum': [0.5, 1],
    'log2_minibatch_size': [7, 13],
    'n_epochs': [200, 400],
    'minuslog10_learning_rate': [0, 5],
    'optimizer': { # tf.train.X
        'AdamOptimizer': None,
        'GradientDescentOptimizer': None,
        'RMSPropOptimizer': None
    }
}

max_units = 16
max_layers = 6
# we start from 1
search_space['n_layers'] = {
    str(n_layers): {
        'units'+str(layer): [1, max_units] 
        for layer in range(1, n_layers + 1) 
    } 
    for n_layers in range(1, max_layers + 1)
}

In [None]:
with open('pso.csv', 'w') as f:
    f.write(list_to_csline(['datetime', 'activation', 'dropout', 'momentum', 'minibatch_size', 'n_epochs', 'learning_rate', 'optimizer', 'n_layers', 'units1', 'units2', 'units3', 'units4', 'units5', 'units6', 'training_r2', 'test_r2']))
    
result = optunity.maximize_structured(try_regres, search_space=search_space, num_evals=10240)

with open('pso.result.pickle', 'wb') as f:
    pickle.dump(result, f)


是否限于步数未调试三层+？

In [None]:
max_units = 16
max_layers = 6

search_space_2 = {}
# we start from 1
search_space_2['n_layers'] = {
    str(n_layers): {
        'units'+str(layer): [1, max_units] 
        for layer in range(1, n_layers + 1) 
    } 
    for n_layers in range(1, max_layers + 1)
}

In [None]:
def try_regress_2(units1, units2=None, units3=None, units4=None, units5=None, units6=None, *args, **kwargs):
    
    L = [units1, units2, units3, units4, units5, units6]
    f = lambda x: (x - 12.)**2. if x else 0
    return sum(map(f, L))

In [None]:
result = optunity.maximize_structured(try_regress_2, search_space=search_space_2, num_evals=50)

“变量越少越好”?

组1

In [None]:
search1 = {
    'n_layers': { str(n_layers): {
        'units' + str(ilayer): [0, 16] for ilayer in range(n_layers)
    } for n_layers in range(1, 6+1) }
}

In [None]:
def f1(x):
    return (math.ceil(x) - 12.) ** 2. if x is not None else 0

def try_regress_1(n_layers, units0=None, units1=None, units2=None, units3=None, units4=None, units5=None):
    L = [units0, units1, units2, units3, units4, units5]
    
    return sum(map(f1, L))

In [None]:
L = []
for _ in range(20): # 20 trials average
    result = optunity.maximize_structured(try_regress_1, search_space=search1, num_evals=50)
    L.append(result[1].optimum)
print np.mean(L), np.std(L) / np.sqrt(10)

完毕。

组2

In [7]:
optunity = reload(optunity)
optunity.api = reload(optunity.api)

In [8]:
search2 = {
    'n_layers': { str(n_layers): {
        'units' + str(ilayer): {
            str(i): None for i in range(1, 3+1)
        } for ilayer in range(n_layers)
    } for n_layers in range(1, 3+1) }
}

In [9]:
pprint(search2)

Pretty printing has been turned ON


In [10]:
def f2(x):
    return (float(x) - 12.) ** 2. if x is not None else 0

def try_regress_2(units0=None, units1=None, units2=None, units3=None, units4=None, units5=None, *args, **kwargs):
    L = [units0, units1, units2, units3, units4, units5]
    
    return sum(map(f2, L))

In [None]:
L = []
for _ in range(20): # 20 trials average
    result = optunity.maximize_structured(try_regress_2, search_space=search2, num_evals=10)
    L.append(result[1].optimum)
print np.mean(L), np.std(L) / np.sqrt(10)

Python 2.7.12 (default, Dec  4 2017, 14:50:18) 
Type "copyright", "credits" or "license" for more information.

IPython 5.5.0 -- An enhanced Interactive Python.
?         -> Introduction and overview of IPython's features.
%quickref -> Quick reference.
help      -> Python's own help system.
object?   -> Details about 'object', use 'object??' for extra details.

In [1]: call_dict
Out[1]: 
{'args': {'': ['units0', 'units0', 'units1', 'units1', 'units2', 'units1'],
  '2': [None, None, None, None, None],
  '3': [None, None, None, None, None],
  'n_layers': ['1', '3', '2', '1', '3', '2', '3', '2'],
  'units0': ['2', '3', '1', '1', '3', '3', '2', '2'],
  'units1': [None, '2', '1', None, '2', '1', '1', '2'],
  'units2': [None, '2', None, None, '1', None, '1', None]},
 'values': [100.0, 281.0, 242.0, 121.0, 302.0, 202.0, 342.0, 200.0]}

In [2]: index
Out[2]: 6

In [3]: call_dict['args'].items()
Out[3]: 
[('units1', [None, '2', '1', None, '2', '1', '1', '2']),
 ('units0', ['2', '3', '1', '1', '

组3

In [None]:
def f2(x):
    return (float(x) - 12.) ** 2. if x is not None else 0

def try_regress_2(units0=None, units1=None, units2=None, units3=None, units4=None, units5=None):
    L = [units0, units1, units2, units3, units4, units5]
    
    return sum(map(f2, L))

In [None]:
max_units = 16
max_layers = 6

search_space_3 = {
    'units' + str(ilayer): [-max_units, max_units + 1] for ilayer in range(max_layers)
}

In [None]:
def try_regress_3(units0, units1, units2, units3, units4, units5):
    
    # 0及以下按0算
    L = [units0, units1, units2, units3, units4, units5]
    L = [int(round(_)) if int(round(_))>0 else 0 for _ in L]
    
    # 不允许中间空层
    sgn_L = np.sign(L)
    if (np.diff(sgn_L) > 0).any():
        return -10
    
    # 原值
    f = lambda x: (x - 12.)**2. if x!=0 else 0
    return sum(map(f, L))
        

In [None]:
result = optunity.maximize_structured(try_regress_3, search_space=search_space_3, num_evals=50)

我觉得可以i)跑particle swarm optimization，看趋势确定可以执行ii)greedy algorithm。