In [1]:
# Numpy
import numpy as np
import pandas as pd
%matplotlib nbagg
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.patches as patches
import scipy.stats
import scipy.interpolate

# Machine learning
import tensorflow as tf
import keras
import sklearn
import sklearn.preprocessing, sklearn.base, sklearn.utils, sklearn.model_selection, sklearn.gaussian_process, sklearn.linear_model
import optunity
import statsmodels.nonparametric.smoothers_lowess

# Various Python tricks and libraries
import requests
import time
import functools
import operator
import collections
from tqdm import tqdm, tqdm_notebook, tnrange
import dill as pickle
import IPython
import gc

# Parallel
import joblib
import multiprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# edison 20,000 reproducibility? or to say, is it consistently good?

In [3]:
store = pd.HDFStore('ml.queue.edison.180501.h5')
normalize_analyses = store['normalize_analyses']

In [4]:
len(normalize_analyses)

153555

In [22]:
r2_trains, r2_tests1, r2_tests2 = [], [], []

XRANGE = range(1, 130000, 500)
for cut in tqdm(XRANGE):

    train = normalize_analyses.iloc[:cut]
    test1 = normalize_analyses.iloc[cut:130000]
    test2 = normalize_analyses.iloc[cut:150000]
    X_train = train.drop('Waited', axis=1)
    y_train = train.Waited
    X_test1 = test1.drop('Waited', axis=1)
    y_test1 = test1.Waited
    X_test2 = test2.drop('Waited', axis=1)
    y_test2 = test2.Waited

    reg = sklearn.linear_model.LinearRegression()
    reg.fit(X_train, y_train)
    yhat_train = reg.predict(X_train)
    yhat_test1 = reg.predict(X_test1)
    yhat_test2 = reg.predict(X_test2)
    r2_trains.append(sklearn.metrics.r2_score(y_train, yhat_train))
    r2_tests1.append(sklearn.metrics.r2_score(y_test1, yhat_test1))
    r2_tests2.append(sklearn.metrics.r2_score(y_test2, yhat_test2))

fig, ax = plt.subplots()
ax.plot(XRANGE, r2_trains, '--', label=r'$r^2$ of train set: 0 - sample #', color='grey', )
ax.plot(XRANGE, r2_tests1, label=r'$r^2$ of test set 1: sample # - 130000', color='red')
ax.plot(XRANGE, r2_tests2, label=r'$r^2$ of test set 2: sample # - 150000', color='purple')
ax.legend()
ax.set_xlim(-5000, 153000)
ax.set_ylim(-2, 1.5)
ax.set_xlabel('sample # (job uid)')
ax.set_ylabel('$r^2$')
rect = patches.Rectangle((60000, 0.14), 50000, 0.20, edgecolor='green', facecolor='none')
ax.add_patch(rect)

100%|██████████| 260/260 [00:09<00:00, 27.36it/s]


<IPython.core.display.Javascript object>

<matplotlib.patches.Rectangle at 0x145a06e5f790>

# $\hat{wait}$与$\text{nodes}_N, \sum_i\text{nodes}_i, \text{time}_N, \sum_i\text{time}_i$线性相关

In [23]:
def read_log():
    jobs = pd.read_csv(filepath_or_buffer='ml_queue.nanaimo.20170101-20180704.log', sep='\s+', header='infer', skiprows=[1], na_values=['UNLIMITED','Unknown']).dropna()

    jobs.loc[:, 'Submit'] = pd.to_datetime(jobs.loc[:, 'Submit'].copy(), errors='coerce')
    jobs.loc[:, 'Start'] = pd.to_datetime(jobs.loc[:, 'Start'].copy(), errors='coerce')
    jobs.loc[:, 'End'] = pd.to_datetime(jobs.loc[:, 'End'].copy(), errors='coerce')

    jobs.loc[:, 'NNodes'] = pd.to_numeric(jobs.loc[:, 'NNodes'].copy(), errors='coerce', downcast='integer')

    jobs.loc[:, 'Timelimit'] = jobs.loc[:, 'Timelimit'].copy().str.replace('-','day ')
    jobs.loc[:, 'Timelimit'] = pd.to_timedelta(jobs.loc[:, 'Timelimit'].copy(), errors='coerce')
    
    jobs.loc[:, 'Waited'] = (jobs.Start - jobs.Submit).values / pd.Timedelta('1h')

    # jobs = jobs[(np.abs(scipy.stats.zscore(jobs.loc[:,'Wait'].values / pd.Timedelta('1h'))) <3)]
    jobs = jobs.dropna()
    
    # jobs = jobs.sort_values(by = 'Submit')
    # 几乎总是sorted
    
    return jobs

In [24]:
jobs = read_log()

In [30]:
def sample(tuple_):
    
    jobs, index, row = tuple_
    
    tj = thisjob = row
    now = tj.Submit
    
    rj = relatedjobs = jobs.loc[np.logical_and.reduce([jobs.index != index, jobs.Submit <= now, jobs.End > now])] # excludes thisjob, includes jobs submitted simultaneously but ranked earlier
    
    # 无关人等滚开
    if now < jobs.End.min() or index < 100:
        return None
        
    # 零、有空位，就不用等。
    # if rj.NNodes.sum() + tj.NNodes <= 36:
    #     # 几乎总是tj.Waited < 0.005
    #     return None
        
    # 一、各种descriptor
    analysis = pd.DataFrame([], index=[index])
    
    analysis.loc[:, 'NNodes'] = tj.NNodes
    analysis.loc[:, 'ETA']    = tj.Timelimit
    
    analysis.loc[:, 'Total_squeue_nnodes'] = rj.NNodes.sum()
    
    analysis.loc[:, 'Total_squeue_eta']    = rj.Timelimit.sum()
    
    analysis.loc[:, 'Self_job_in_squeue_percentage'] = 0 if len(rj)==0 else float(np.sum(rj.User == tj.User)) / len(rj)
    
    analysis.loc[:, 'Waited'] = tj.Waited
    
    return analysis
    
    
L = multiprocessing.Pool(processes=20).map(sample, [(jobs, index, row.copy()) for index, row in tqdm(jobs.iterrows(), total=len(jobs))])
analyses = pd.concat(L) # None is automatically ignored

100%|██████████| 24290/24290 [00:04<00:00, 5252.31it/s]
Process PoolWorker-88:
Process PoolWorker-94:
Process PoolWorker-82:
Process PoolWorker-83:
Traceback (most recent call last):
Process PoolWorker-84:
Process PoolWorker-81:
Process PoolWorker-93:
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Process PoolWorker-87:
Process PoolWorker-91:
Process PoolWorker-100:
Process PoolWorker-89:
Process PoolWorker-96:
Process PoolWorker-97:
Process PoolWorker-98:
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process PoolWorker-85:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Process PoolWorker-92:
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2.7/multiproce

  File "/usr/lib/python2.7/multiprocessing/queues.py", line 378, in get
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 102, in worker
    return recv()
    racquire()
    task = get()
    racquire()
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    racquire()
KeyboardInterrupt
    racquire()
    self._target(*self._args, **self._kwargs)
    racquire()
    task = get()
    racquire()
    task = get()
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 102, in worker
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    task = get()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
  

In [31]:
# 去尾
analyses = analyses[analyses.Waited < analyses.Waited.quantile(q=0.97)].copy()

In [32]:
normalize_analyses = pd.DataFrame(
    sklearn.preprocessing.scale(
        analyses.apply(pd.to_numeric).values
    ), 
    index=analyses.index, 
    columns=analyses.columns
)




In [35]:
train = normalize_analyses.iloc[:10000]
test = normalize_analyses.iloc[10000:20000]
X_train = train.drop('Waited', axis=1)
y_train = train.Waited
X_test = test.drop('Waited', axis=1)
y_test = test.Waited

reg = sklearn.linear_model.LinearRegression()
reg.fit(X_train, y_train)
yhat_train = reg.predict(X_train)
yhat_test = reg.predict(X_test)
r2_train = sklearn.metrics.r2_score(y_train, yhat_train)
r2_test = sklearn.metrics.r2_score(y_test, yhat_test)
print 'r2_train %s, r2_test %s' %(r2_train, r2_test)

fig, ax = plt.subplots()
ax.scatter(y_train, yhat_train, alpha=0.5, label='train set', s=0.5, color='green')
ax.scatter(y_test,  yhat_test,  alpha=0.5, label='test set', s=0.5, color='red')
ax.legend()
ax.set_xlabel('Given by DFT (actual)')
ax.set_ylabel('Given by NN (fitted/predicted)')

r2_train 0.2254580216166545, r2_test 0.23007244381469727


<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x145a0135c110>

# $\text{E}\,\hat{\text{wait}}|_{\text{nodes}_N, \sum_i\text{nodes}_i, \text{time}_N, \sum_i\text{time}_i}$，MLP

In [41]:
def read_log():
    jobs = pd.read_csv(filepath_or_buffer='ml_queue.nanaimo.20170101-20180704.log', sep='\s+', header='infer', skiprows=[1], na_values=['UNLIMITED','Unknown']).dropna()

    jobs.loc[:, 'Submit'] = pd.to_datetime(jobs.loc[:, 'Submit'].copy(), errors='coerce')
    jobs.loc[:, 'Start'] = pd.to_datetime(jobs.loc[:, 'Start'].copy(), errors='coerce')
    jobs.loc[:, 'End'] = pd.to_datetime(jobs.loc[:, 'End'].copy(), errors='coerce')

    jobs.loc[:, 'NNodes'] = pd.to_numeric(jobs.loc[:, 'NNodes'].copy(), errors='coerce', downcast='integer')

    jobs.loc[:, 'Timelimit'] = jobs.loc[:, 'Timelimit'].copy().str.replace('-','day ')
    jobs.loc[:, 'Timelimit'] = pd.to_timedelta(jobs.loc[:, 'Timelimit'].copy(), errors='coerce')
    
    jobs.loc[:, 'Waited'] = (jobs.Start - jobs.Submit).values / pd.Timedelta('1h')

    # jobs = jobs[(np.abs(scipy.stats.zscore(jobs.loc[:,'Wait'].values / pd.Timedelta('1h'))) <3)]
    jobs = jobs.dropna()
    
    # jobs = jobs.sort_values(by = 'Submit')
    # 几乎总是sorted
    
    return jobs

In [42]:
jobs = read_log()

In [43]:
def sample(tuple_):
    
    jobs, index, row = tuple_
    
    tj = thisjob = row
    now = tj.Submit
    
    rj = relatedjobs = jobs.loc[np.logical_and.reduce([jobs.index != index, jobs.Submit <= now, jobs.End > now])] # excludes thisjob, includes jobs submitted simultaneously but ranked earlier
    
    # 无关人等滚开
    if now < jobs.End.min() or index < 100:
        return None
        
    # 零、有空位，就不用等。
    # if rj.NNodes.sum() + tj.NNodes <= 36:
    #     # 几乎总是tj.Waited < 0.005
    #     return None
        
    # 一、各种descriptor
    analysis = pd.DataFrame([], index=[index])
    
    analysis.loc[:, 'NNodes'] = tj.NNodes
    analysis.loc[:, 'ETA']    = tj.Timelimit
    
    analysis.loc[:, 'Total_squeue_nnodes'] = rj.NNodes.sum()
    
    analysis.loc[:, 'Total_squeue_eta']    = rj.Timelimit.sum()
    
    analysis.loc[:, 'Self_job_in_squeue_percentage'] = 0 if len(rj)==0 else float(np.sum(rj.User == tj.User)) / len(rj)
    
    analysis.loc[:, 'Waited'] = tj.Waited
    
    return analysis
    
    
L = multiprocessing.Pool(processes=20).map(sample, [(jobs, index, row.copy()) for index, row in tqdm(jobs.iterrows(), total=len(jobs))])
analyses = pd.concat(L) # None is automatically ignored

100%|██████████| 24290/24290 [00:05<00:00, 4756.01it/s]
Process PoolWorker-101:
Process PoolWorker-111:
Process PoolWorker-112:
Process PoolWorker-120:
Process PoolWorker-108:
Process PoolWorker-117:
Traceback (most recent call last):
Traceback (most recent call last):
Process PoolWorker-103:
Process PoolWorker-104:
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Process PoolWorker-114:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
    self.run()
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self.run()
Process PoolWorker-109:
  File "/usr/lib/python2.7/multiprocessing/process.py", 

  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
KeyboardInterrupt
    task = get()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 102, in worker
    racquire()
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    task = get()
    task = get()
    self._target(*self._args, **self._kwargs)
    racquire()
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    task = get()
    racquire()
KeyboardInterrupt
    racquire()
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 102, in worker
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
KeyboardInterrupt
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 102, in worker
    racquire()
    task = get()
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 376, in get
    task = get()
  File "/usr/lib/python2.7/multiproc

In [44]:
# 去尾
analyses = analyses[analyses.Waited < analyses.Waited.quantile(q=0.97)].copy()

In [45]:
normalize_analyses = pd.DataFrame(
    sklearn.preprocessing.scale(
        analyses.apply(pd.to_numeric).values
    ), 
    index=analyses.index, 
    columns=analyses.columns
)




In [46]:
class BetterYhatLive(object):
    
    def __init__(self, smoothen):
        self.fig, (self.ax1, self.ax2) = plt.subplots(1, 2, figsize=(14, 4.2))
        self.ax2.set_aspect('equal', adjustable='datalim')
        self.line11, self.line12 = [], [] # note: line11 means ax1, line1
        self.smoothen = smoothen
        
    def update(self, y_train, yhat_train, y_test, yhat_test):
        self.line11.append(sklearn.metrics.r2_score(y_train, yhat_train))
        self.line12.append(sklearn.metrics.r2_score(y_test, yhat_test))
        # smoothen
        if self.smoothen:
            x = range(len(self.line11))
            smoothline11 = statsmodels.nonparametric.smoothers_lowess.lowess(self.line11, x, is_sorted=True, frac=0.25, it=1, return_sorted=False)
            smoothline12 = statsmodels.nonparametric.smoothers_lowess.lowess(self.line12, x, is_sorted=True, frac=0.25, it=1, return_sorted=False)
        else:
            smoothline11 = self.line11
            smoothline12 = self.line12
        #
        self.ax1.clear()
        self.ax1.plot(smoothline11, label='R2(train)=%.2f'%self.line11[-1])
        self.ax1.plot(smoothline12, label='R2(test)=%.2f'%self.line12[-1])
        self.ax1.legend(loc='best')
        self.ax2.clear()
        self.ax2.set_aspect('equal', adjustable='datalim')
        self.ax2.scatter(y_train, yhat_train, color='green', s=1, alpha=0.2, label='train')
        self.ax2.scatter(y_test, yhat_test, color='red', s=1, alpha=0.2, label='test')
        self.ax2.legend(loc='best')
        self.fig.canvas.draw()

In [50]:
class Minibatch(object):
    '''
    Makes batches from dataframes.
    Executes n_epochs before raising StopIteration.
    Allows manually setting aside a test set, or automatically randomly selecting one.
    Progress bar.
    Note: "train set" is preferred to "training set", e.g. train_index, X_train
    '''
    
    
    def __init__(self, df, minibatch_size, n_epochs, train_index=None, test_index=None, train_split=None, test_split=None, tqdm=True):
        self.minibatch_size = minibatch_size
        N = len(df)
        all_index = range(N)
        
        # determine train_index and test_index
        # given index
        if train_index and test_index:
            pass
        elif train_index and not test_index:
            test_index = list(set(all_index) - set(train_index))
        elif test_index and not train_index:
            train_index = list(set(all_index) - set(test_index))
        # given split percentage
        elif train_split and test_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            remaining_index = list(set(all_index) - set(train_index))
            test_index = np.random.choice(remaining_index, int(N * test_split), replace=False)
        elif train_split and not test_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            test_index = list(set(all_index) - set(train_index))
        elif test_split and not train_split:
            test_index = np.random.choice(all_index, int(N * test_split), replace=False)   
            train_index = list(set(all_index) - set(test_index))
        else:
            raise Exception("Either specify index, or specify split.")
                            
        # generate train_df
        self.train_df = df.iloc[train_index]
        self.test_df = df.iloc[test_index]
        self.df = df

        # minibatch counter
        self.i = 0
        self.iMAX = n_epochs * len(self.train_df) / minibatch_size
        
        if tqdm:
            self.tqdm = tqdm_notebook(total=self.iMAX, leave=False)
        
    def minibatch(self):
        if self.i > self.iMAX:
            self.i = 0
            raise StopIteration
        else:
            self.i += 1
            
        if getattr(self, 'tqdm', None):
            self.tqdm.update(1)
        
        index = np.random.choice(range(len(self.train_df)), self.minibatch_size, replace=False)
        return self.train_df.iloc[index, :-1].values, self.train_df.iloc[index, -1].values.reshape(-1, 1)
    
    def train_set(self):
        return self.train_df.iloc[:, :-1].values, self.train_df.iloc[:, -1].values.reshape(-1, 1)
    
    def test_set(self):
        return self.test_df.iloc[:, :-1].values, self.test_df.iloc[:, -1].values.reshape(-1, 1)

In [65]:
minibatch_size = 64
n_epochs = 64
ns_units = [5, 5]
lr = 1E-3

# Graph
tf.reset_default_graph()

X = tf.placeholder(name="X", dtype=tf.float32, shape=[None, 5])

h = X
for n_units in ns_units:
    h = tf.layers.dense(h, units=n_units, activation=tf.nn.elu)
h = tf.layers.dense(h, units=1, activation=None)
yhat = h

y = tf.placeholder(name="y", dtype=tf.float32, shape=[None, 1])

loss = tf.reduce_mean(tf.square(yhat - y), keepdims=False)
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Session
sess = tf.Session()

sess.run(tf.global_variables_initializer())

m = Minibatch(normalize_analyses, minibatch_size, n_epochs, train_index=range(10000), test_index=range(10000, 20000))
b = BetterYhatLive(smoothen=False)
while True:
    try:
        _X, _y = m.minibatch()
        sess.run(train_op, feed_dict = {X: _X, y: _y})
        
        if m.i % 100 == 0:
            X_train, y_train = m.train_set()
            yhat_train = sess.run(yhat, feed_dict = {X: X_train})
            X_test, y_test = m.test_set()
            yhat_test = sess.run(yhat, feed_dict = {X: X_test}) 
            b.update(y_train, yhat_train, y_test, yhat_test)
    except StopIteration:
        break

sess.close()

<IPython.core.display.Javascript object>

KeyboardInterrupt: 

# 典型的$P(E_1)-P(E_2)$关系是什么？

In [4]:
def read_log():
    jobs = pd.read_csv(filepath_or_buffer='ml_queue.nanaimo.20170101-20180704.log', sep='\s+', header='infer', skiprows=[1], na_values=['UNLIMITED','Unknown']).dropna()

    jobs.loc[:, 'Submit'] = pd.to_datetime(jobs.loc[:, 'Submit'].copy(), errors='coerce')
    jobs.loc[:, 'Start'] = pd.to_datetime(jobs.loc[:, 'Start'].copy(), errors='coerce')
    jobs.loc[:, 'End'] = pd.to_datetime(jobs.loc[:, 'End'].copy(), errors='coerce')

    jobs.loc[:, 'NNodes'] = pd.to_numeric(jobs.loc[:, 'NNodes'].copy(), errors='coerce', downcast='integer')

    jobs.loc[:, 'Timelimit'] = jobs.loc[:, 'Timelimit'].copy().str.replace('-','day ')
    jobs.loc[:, 'Timelimit'] = pd.to_timedelta(jobs.loc[:, 'Timelimit'].copy(), errors='coerce')
    
    jobs.loc[:, 'Waited'] = (jobs.Start - jobs.Submit).values / pd.Timedelta('1h')

    # jobs = jobs[(np.abs(scipy.stats.zscore(jobs.loc[:,'Wait'].values / pd.Timedelta('1h'))) <3)]
    jobs = jobs.dropna()
    
    # jobs = jobs.sort_values(by = 'Submit')
    # 几乎总是sorted
    
    return jobs

In [5]:
jobs = read_log()

In [None]:
def sample(tuple_):
    
    jobs, index, row = tuple_
    
    tj = thisjob = row
    now = tj.Submit
    
    rj = relatedjobs = jobs.loc[np.logical_and.reduce([jobs.index != index, jobs.Submit <= now, jobs.End > now])] # excludes thisjob, includes jobs submitted simultaneously but ranked earlier
    
    # 无关人等滚开
    if now < jobs.End.min() or index < 100:
        return None
        
    # 零、有空位，就不用等。
    # if rj.NNodes.sum() + tj.NNodes <= 36:
    #     # 几乎总是tj.Waited < 0.005
    #     return None
        
    # 一、各种descriptor
    analysis = pd.DataFrame([], index=[index])
    
    analysis.loc[:, 'NNodes'] = tj.NNodes
    analysis.loc[:, 'ETA']    = tj.Timelimit
    
    analysis.loc[:, 'Total_squeue_nnodes'] = rj.NNodes.sum()
    
    analysis.loc[:, 'Total_squeue_eta']    = rj.Timelimit.sum()
    
    analysis.loc[:, 'Self_job_in_squeue_percentage'] = 0 if len(rj)==0 else float(np.sum(rj.User == tj.User)) / len(rj)
    
    analysis.loc[:, 'Waited'] = tj.Waited
    
    return analysis
    
    
L = multiprocessing.Pool(processes=20).map(sample, [(jobs, index, row.copy()) for index, row in tqdm(jobs.iterrows(), total=len(jobs))])
analyses = pd.concat(L) # None is automatically ignored

In [7]:
# 去尾
analyses = analyses[analyses.Waited < analyses.Waited.quantile(q=0.97)].copy()

In [8]:
normalize_analyses = pd.DataFrame(
    sklearn.preprocessing.scale(
        analyses.apply(pd.to_numeric).values
    ), 
    index=analyses.index, 
    columns=analyses.columns
)




In [9]:
class BetterYhatLive(object):
    '''
    Fed a half-trained nn's outputs, it plots the P(E1)/P(E2) progress.
    Optionally, records record_label, P(E1), P(E2) to a global record_filename.
    '''
    
    def __init__(self, smoothen, record_filename=None):
        self.fig, (self.ax1, self.ax2) = plt.subplots(1, 2, figsize=(14, 4.2))
        self.ax2.set_aspect('equal', adjustable='datalim')
        self.line11, self.line12 = [], [] # note: line11 means ax1, line1
        self.smoothen = smoothen
        self.record_filename = record_filename
        
    def update(self, y_train, yhat_train, y_test, yhat_test, record_label=None):
        self.line11.append(sklearn.metrics.r2_score(y_train, yhat_train))
        self.line12.append(sklearn.metrics.r2_score(y_test, yhat_test))
        # smoothen
        if self.smoothen:
            x = range(len(self.line11))
            smoothline11 = statsmodels.nonparametric.smoothers_lowess.lowess(self.line11, x, is_sorted=True, frac=0.25, it=1, return_sorted=False)
            smoothline12 = statsmodels.nonparametric.smoothers_lowess.lowess(self.line12, x, is_sorted=True, frac=0.25, it=1, return_sorted=False)
        else:
            smoothline11 = self.line11
            smoothline12 = self.line12
        #
        self.ax1.clear()
        self.ax1.plot(smoothline11, label='R2(train)=%.2f'%self.line11[-1])
        self.ax1.plot(smoothline12, label='R2(test)=%.2f'%self.line12[-1])
        self.ax1.legend(loc='best')
        self.ax2.clear()
        self.ax2.set_aspect('equal', adjustable='datalim')
        self.ax2.scatter(y_train, yhat_train, color='green', s=1, alpha=0.2, label='train')
        self.ax2.scatter(y_test, yhat_test, color='red', s=1, alpha=0.2, label='test')
        self.ax2.legend(loc='best')
        self.fig.canvas.draw()
        
        if self.record_filename and record_label:
            with open(self.record_filename, "a") as f:
                f.write("%s, %s, %s\n" %(record_label, self.line11[-1], self.line12[-1]))

In [10]:
class Minibatch(object):
    '''
    Makes batches from dataframes.
    Executes n_epochs before raising StopIteration.
    Allows manually setting aside a test set, or automatically randomly selecting one.
    Progress bar.
    Note: "train set" is preferred to "training set", e.g. train_index, X_train
    '''
    
    
    def __init__(self, df, minibatch_size, n_epochs, train_index=None, test_index=None, train_split=None, test_split=None, tqdm=True):
        self.minibatch_size = minibatch_size
        N = len(df)
        all_index = range(N)
        
        # determine train_index and test_index
        # given index
        if train_index and test_index:
            pass
        elif train_index and not test_index:
            test_index = list(set(all_index) - set(train_index))
        elif test_index and not train_index:
            train_index = list(set(all_index) - set(test_index))
        # given split percentage
        elif train_split and test_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            remaining_index = list(set(all_index) - set(train_index))
            test_index = np.random.choice(remaining_index, int(N * test_split), replace=False)
        elif train_split and not test_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            test_index = list(set(all_index) - set(train_index))
        elif test_split and not train_split:
            test_index = np.random.choice(all_index, int(N * test_split), replace=False)   
            train_index = list(set(all_index) - set(test_index))
        else:
            raise Exception("Either specify index, or specify split.")
                            
        # generate train_df
        self.train_df = df.iloc[train_index]
        self.test_df = df.iloc[test_index]
        self.df = df

        # minibatch counter
        self.i = 0
        self.iMAX = n_epochs * len(self.train_df) / minibatch_size
        
        if tqdm:
            self.tqdm = tqdm_notebook(total=self.iMAX, leave=False)
        
    def minibatch(self):
        if self.i > self.iMAX:
            self.i = 0
            raise StopIteration
        else:
            self.i += 1
            
        if getattr(self, 'tqdm', None):
            self.tqdm.update(1)
        
        index = np.random.choice(range(len(self.train_df)), self.minibatch_size, replace=False)
        return self.train_df.iloc[index, :-1].values, self.train_df.iloc[index, -1].values.reshape(-1, 1)
    
    def train_set(self):
        return self.train_df.iloc[:, :-1].values, self.train_df.iloc[:, -1].values.reshape(-1, 1)
    
    def test_set(self):
        return self.test_df.iloc[:, :-1].values, self.test_df.iloc[:, -1].values.reshape(-1, 1)

In [80]:
minibatch_size = 1024
n_epochs = 6400
ns_units = [128, 64, 32]
lr = 1E-3

# Graph
tf.reset_default_graph()

X = tf.placeholder(name="X", dtype=tf.float32, shape=[None, 5])

h = X
for n_units in ns_units:
    h = tf.layers.dense(h, units=n_units, activation=tf.nn.elu)
h = tf.layers.dense(h, units=1, activation=None)
yhat = h

y = tf.placeholder(name="y", dtype=tf.float32, shape=[None, 1])

loss = tf.reduce_mean(tf.square(yhat - y), keepdims=False)
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

# Session
sess = tf.Session()

sess.run(tf.global_variables_initializer())

m = Minibatch(normalize_analyses, minibatch_size, n_epochs, train_index=range(10000), test_index=range(10000, 20000))
b = BetterYhatLive(smoothen=False, record_filename="record_file.txt")
while True:
    try:
        _X, _y = m.minibatch()
        sess.run(train_op, feed_dict = {X: _X, y: _y})
        
        if m.i % 100 == 0:
            X_train, y_train = m.train_set()
            yhat_train = sess.run(yhat, feed_dict = {X: X_train})
            X_test, y_test = m.test_set()
            yhat_test = sess.run(yhat, feed_dict = {X: X_test}) 
            b.update(y_train, yhat_train, y_test, yhat_test, record_label="elu, None, None, %s, %s, %s, AdamOptimizer, %s" %(minibatch_size, n_epochs * m.i / m.iMAX, lr, '-'.join(map(str, ns_units))))
    except StopIteration:
        break

sess.close()

<IPython.core.display.Javascript object>

KeyboardInterrupt: 

In [81]:
results = pd.read_csv(filepath_or_buffer='record_file.txt', sep='\s*,\s*')

  """Entry point for launching an IPython kernel.


In [82]:
fig, ax = plt.subplots()
ax.scatter(results.train_r2, results.test_r2, s=0.5, alpha=0.5)
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)

<IPython.core.display.Javascript object>

(-1, 1)