This notebook curates a list of helper functions. When a project is archived, reusable functions go here.

In [None]:
# Numpy
import numpy as np
import pandas as pd
import scipy.stats, scipy.interpolate, scipy.spatial

# matplotlib
%matplotlib nbagg
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.patches as patches

# plotly
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)

# Machine learning
import tensorflow as tf
import keras
import sklearn
import sklearn.preprocessing, sklearn.base, sklearn.utils, sklearn.model_selection, sklearn.gaussian_process, sklearn.linear_model
import optunity
import statsmodels.nonparametric.smoothers_lowess

# Various Python tricks and libraries
import re
import requests
import time
import functools
import operator
import collections
from tqdm import tqdm, tqdm_notebook, tnrange
import dill as pickle
import IPython
import gc
import json
from collections import OrderedDict

# Parallel
# import joblib
# import multiprocessing
import pathos

# sklearn plus

In [None]:
class MultiPipeline():
    '''
    A chain of estimator/transformer steps applied one after another.

    Differs from sklearn's Pipeline in that:
    - any number of positional values may flow through the chain
      (e.g. 3 in, 3 out), as long as what one step's transform() returns
      can be star-unpacked into the next step's fit() / transform();
    - a step only needs fit() and transform();
    - the output of every step can be retrieved (the *transforms methods).

    Positional arguments only; keyword arguments are not supported.
    '''

    def __init__(self, ETs):
        # ETs: ordered list of Estimators / Transformers.
        self.ETs = ETs

    def _run(self, args, fit):
        '''Yield each step's output in order; fit the step first when `fit` is true.'''
        for step in self.ETs:
            if fit:
                # transform() is called on whatever fit() returned.
                step = step.fit(*args)
            args = step.transform(*args)
            yield args

    def fit(self, *args):
        '''Fit every step on the output of the previous one. Returns self.'''
        for _ in self._run(args, fit=True):
            pass
        return self

    def transform(self, *args):
        '''Return the last step's output (the input tuple if there are no steps).'''
        result = args
        for result in self._run(args, fit=False):
            pass
        return result

    def transforms(self, *args):
        '''Return a list with the output of every step, in order.'''
        return list(self._run(args, fit=False))

    def fit_transform(self, *args):
        '''Fit and transform step by step; return the last step's output.'''
        result = args
        for result in self._run(args, fit=True):
            pass
        return result

    def fit_transforms(self, *args):
        '''Fit and transform step by step; return every step's output as a list.'''
        return list(self._run(args, fit=True))

In [None]:
class RnnStandardScaler():
    '''
    StandardScaler, but for RNN-style input, i.e. None * Irregular * Nfeatures.

    Unlike LabelEncoder, sometimes StandardScaler'ing the original DataFrame is not possible.

    Supports neither *args nor **kwargs.

    Parameters:
    processes -- number of worker processes used by transform(); a falsy
                 value (0 / None) scales the sequences serially in-process.
    '''

    def __init__(self, processes=20):
        self.processes = processes
        self.scaler = sklearn.preprocessing.StandardScaler()

    @staticmethod
    def _check_float(X):
        # Spot-check of the very first value only. NumPy scalars such as
        # np.float32 are not subclasses of `float`, so np.floating is
        # accepted explicitly.
        assert isinstance(X[0][0][0], (float, np.floating))

    def fit(self, X):
        '''Fit the scaler on all sequences stacked along axis 0. Returns self.'''
        self._check_float(X)

        stacked = np.concatenate(X, axis=0)
        assert not np.isnan(stacked).any()
        self.scaler.fit(stacked)

        return self

    def transform(self, X):
        '''Scale every sequence of X; returns a list with one list of rows per sequence.'''
        self._check_float(X)

        def scale_one(sequence):
            return list(self.scaler.transform(sequence))

        if self.processes:
            return pathos.multiprocessing.ProcessPool(nodes=self.processes).map(scale_one, X)
        # Build the list eagerly so both branches return the same type
        # (a bare map() is a lazy iterator on Python 3).
        return [scale_one(sequence) for sequence in X]

    def fit_transform(self, X):
        '''Equivalent to fit(X).transform(X).'''
        return self.fit(X).transform(X)
        

# visualizing ML

In [None]:
def Iterator(df, *args, **kwargs):
    '''
    Build an RnnIterator from a DataFrame.

    Every column except the last is used as features; the last column is
    used as the label, reshaped to (-1, 1). All remaining positional and
    keyword arguments are forwarded to RnnIterator unchanged.
    '''
    features = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values.reshape(-1, 1)
    return RnnIterator([features, labels], *args, **kwargs)

In [19]:
class RnnIterator(object):
    '''
    Takes [(None, ...), (None, ...), (None, ...)] format data.
    Train-test split by train_index, test_index, train_split and/or test_split.
    Make batches.
    Serves n_minibatches minibatches (n_epochs worth of train data), then raises StopIteration.
    Progress bar.

    Note:
    "Train data" is preferred to "training data".

    Parameters:
    data           -- list of arrays sharing the same first dimension; each is
                      converted with np.float32. The last one (y) must have
                      ndim >= 2, e.g. (None, 1).
    minibatch_size -- number of train rows drawn (without replacement) per minibatch.
    n_epochs       -- number of passes over the train data before StopIteration.
    train_index, test_index -- explicit row positions (list or numpy array).
                      If only one is given, the other is its complement.
    train_split, test_split -- fractions of the rows, used only when no index
                      is given. If only one is given, the other set is the complement.
    tqdm           -- whether to show a tqdm_notebook progress bar.

    Variables:
    train_data, test_data
    n_minibatches, i_minibatch
    n_epochs, @property i_epoch

    Raises:
    Exception if neither an index nor a split is specified.
    '''

    def __init__(self, data, minibatch_size, n_epochs, train_index=None, test_index=None, train_split=None, test_split=None, tqdm=True):
        self.minibatch_size = minibatch_size
        self.n_epochs = n_epochs
        N = len(data[0])

        # data shape consistency check
        data = [np.float32(_) for _ in data]
        assert len(set(len(_) for _ in data)) == 1
        assert data[-1].ndim >= 2 # y is (None, 1), not (None)

        train_index, test_index = self._resolve_indices(N, train_index, test_index, train_split, test_split)

        # generate train_data and test_data
        self.train_data = [_[train_index, ...] for _ in data]
        self.test_data = [_[test_index, ...] for _ in data]

        # minibatch counter; floor division keeps the count integral on Python 3 as well
        self.i_minibatch = 0
        self.n_minibatches = n_epochs * len(self.train_data[0]) // minibatch_size

        self.tqdm = tqdm_notebook(total=self.n_minibatches, leave=False) if tqdm else None

    @staticmethod
    def _resolve_indices(N, train_index, test_index, train_split, test_split):
        '''Return (train_index, test_index) from explicit indices or split fractions.'''

        def given(index):
            # Explicit length test: plain truthiness raises ValueError for
            # numpy arrays with more than one element.
            return index is not None and len(index) > 0

        def complement(index):
            return sorted(set(range(N)) - set(index))

        all_index = range(N)
        has_train, has_test = given(train_index), given(test_index)

        # given index
        if has_train and has_test:
            pass
        elif has_train:
            test_index = complement(train_index)
        elif has_test:
            train_index = complement(test_index)
        # given split percentage
        elif train_split and test_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            test_index = np.random.choice(complement(train_index), int(N * test_split), replace=False)
        elif train_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            test_index = complement(train_index)
        elif test_split:
            test_index = np.random.choice(all_index, int(N * test_split), replace=False)
            train_index = complement(test_index)
        else:
            raise Exception("Either specify index, or specify split.")

        return train_index, test_index

    def minibatch(self):
        '''Return one random minibatch (one array per data entry); raise StopIteration when exhausted.'''
        # `>=` so that exactly n_minibatches are served; the counter is reset
        # so the iterator can be reused afterwards.
        if self.i_minibatch >= self.n_minibatches:
            self.i_minibatch = 0
            raise StopIteration
        self.i_minibatch += 1

        if getattr(self, 'tqdm', None) is not None:
            self.tqdm.update(1)

        index = np.random.choice(len(self.train_data[0]), self.minibatch_size, replace=False)
        return [_[index, ...] for _ in self.train_data]

    @property
    def i_epoch(self):
        # fractional number of epochs served so far
        return float(self.i_minibatch) * self.n_epochs / self.n_minibatches

In [None]:
class BetterYhatLive(object):
    '''
    Live r2 monitor, redrawn on every update() call.

    Left axes:  r2 (train and test) against i_epoch.
    Right axes: scatter of yhat against y (train and test).
    '''

    def __init__(self, smoothen):
        # smoothen: when truthy, the r2 curves are LOWESS-smoothed before plotting.
        fig, axes = plt.subplots(1, 2, figsize=(14, 4.2))
        self.fig = fig
        self.ax_decay, self.ax_corr = axes
        self.ax_corr.set_aspect('equal', adjustable='datalim')
        self.i_epochs = []
        self.line_decay_train = []
        self.line_decay_test = []
        self.smoothen = smoothen

    def update(self, i_epoch, y_train, yhat_train, y_test, yhat_test):
        '''Record the r2 scores at i_epoch and redraw both plots.'''
        score = sklearn.metrics.r2_score
        self.i_epochs.append(i_epoch)
        self.line_decay_train.append(score(y_train, yhat_train))
        self.line_decay_test.append(score(y_test, yhat_test))

        # curves to draw: raw histories, or their LOWESS-smoothed versions
        curves = [self.line_decay_train, self.line_decay_test]
        if self.smoothen:
            lowess = statsmodels.nonparametric.smoothers_lowess.lowess
            steps = range(len(self.line_decay_train))
            curves = [
                lowess(curve, steps, is_sorted=True, frac=0.25, it=1, return_sorted=False)
                for curve in curves
            ]

        # legend entries always show the latest unsmoothed score
        labels = [
            '$r^2_{train}$=%.2f' % self.line_decay_train[-1],
            '$r^2_{test}$=%.2f' % self.line_decay_test[-1],
        ]

        # r2 history
        self.ax_decay.clear()
        for curve, label in zip(curves, labels):
            self.ax_decay.plot(self.i_epochs, curve, label=label)
        self.ax_decay.legend(loc='best')

        # y versus yhat; the aspect is set again after clear()
        self.ax_corr.clear()
        self.ax_corr.set_aspect('equal', adjustable='datalim')
        for actual, predicted, color, name in (
            (y_train, yhat_train, 'green', 'train'),
            (y_test, yhat_test, 'red', 'test'),
        ):
            self.ax_corr.scatter(actual, predicted, color=color, s=1, alpha=0.2, label=name)
        self.ax_corr.legend(loc='best')

        self.fig.canvas.draw()

In [None]:
class LambdaGraph(object):
    '''
    Collects the r2 histories of several runs, keyed by label.

    self.bs maps label -> float32 array with one row per recorded update and
    the columns (i_epoch, r2_train, r2_test), in insertion order.
    '''

    def __init__(self):
        self.bs = OrderedDict()

    def addv(self, b, label): # b is a BetterYhatLive instance
        '''
        Store the history of `b` under `label`.

        Raises ValueError (an Exception subclass) if the label is already used.
        '''
        if label in self.bs:
            raise ValueError("label %s already in self.bs" %label)
        # zip() is a lazy iterator on Python 3; materialise it before handing it to numpy.
        rows = list(zip(b.i_epochs, b.line_decay_train, b.line_decay_test))
        self.bs[label] = np.array(rows, dtype=np.float32)
        
#     def draw(self):
#         traces = [
#             go.Scatter(x = b[:, 1], y = b[:, 2], mode = 'markers', name = label)
#             for label, b in l.bs.iteritems()
#         ]
#         py.iplot(traces, filename='threshold_errors')

# misc

In [None]:
class KerasBetterYhatLive(keras.callbacks.Callback):
    '''
    The Keras version of BetterYhatLive. Known as TqdmProgBar.

    features:
    1. tqdm ETA bar
    2. logs[field] plotted for each field in fields

    Parameters:
    n_epochs -- total number of epochs, used as the progress bar total.
    fields   -- keys of the Keras `logs` dict to record and plot.
    '''

    def __init__(self, n_epochs, fields):
        # Let the Keras base class set up its own state first.
        super(KerasBetterYhatLive, self).__init__()
        self.n_epochs = n_epochs
        self.fields = fields

        self.fields_history = dict((field, []) for field in fields)
        self.fig, self.ax = plt.subplots(1, 1)

    def on_train_begin(self, logs=None):
        self.pbar = tqdm_notebook(total=self.n_epochs, leave=False)

    def on_train_end(self, logs=None):
        self.pbar.close()

    def on_epoch_end(self, epoch, logs=None, log_interval = 40):
        '''Advance the bar every epoch; record and redraw every `log_interval` epochs.'''
        # One tick per finished epoch keeps the bar in step with training
        # and never pushes it past `total`.
        self.pbar.update(1)
        if epoch % log_interval != 0:
            return

        logs = logs or {}
        for field in self.fields:
            # A field missing from logs raises KeyError.
            self.fields_history[field].append(logs[field])

        self.ax.clear()
        for field in self.fields:
            history = self.fields_history[field]
            self.ax.plot(history, label="%s=%.2f" %(field, history[-1]))
        self.ax.legend(loc='best')
        self.fig.canvas.draw()
        

In [None]:
def r2_score(ytrue, ypred): # sklearn.metrics.r2_score in tensorflow. 1 output only.
    '''
    Coefficient of determination as a tensorflow op:
    1 - mean((ytrue - ypred)^2) / mean((ytrue - mean(ytrue))^2).

    The mean of ytrue is taken over all elements, hence single-output only.
    '''
    ytrue_mean = tf.reduce_mean(ytrue, name="ytrue_mean")
    residual_ms = tf.reduce_mean((ytrue - ypred) ** 2)
    total_ms = tf.reduce_mean((ytrue - ytrue_mean) ** 2)
    return tf.subtract(1., tf.truediv(residual_ms, total_ms), name="r2_score")


# deprecated

In [19]:
class Iterator(object):
    '''
    Replaced with a wrapper around RnnIterator.

    Allows manually setting aside a test set, or automatically randomly selecting one.
    Makes batches from a dataframe, where first N-1 columns are features and last column is label.
    Serves n_minibatches minibatches (n_epochs worth of train data), then raises StopIteration.
    Progress bar.

    Note:
    "train data" is preferred to "training data".
    "index" refer to "iloc", not "loc" in dataframe.

    Parameters:
    df             -- DataFrame; the last column is the label.
    minibatch_size -- number of train rows drawn (without replacement) per minibatch.
    n_epochs       -- number of passes over the train data before StopIteration.
    train_index, test_index -- explicit iloc positions (list or numpy array).
                      If only one is given, the other is its complement.
    train_split, test_split -- fractions of the rows, used only when no index
                      is given. If only one is given, the other set is the complement.
    tqdm           -- whether to show a tqdm_notebook progress bar.

    Variables:
    train_data, test_data
    n_minibatches, i_minibatch
    n_epochs, i_epoch

    Raises:
    Exception if neither an index nor a split is specified.
    '''

    def __init__(self, df, minibatch_size, n_epochs, train_index=None, test_index=None, train_split=None, test_split=None, tqdm=True):
        self.minibatch_size = minibatch_size
        self.n_epochs = n_epochs
        N = len(df)

        train_index, test_index = self._resolve_indices(N, train_index, test_index, train_split, test_split)

        # generate train_df and test_df
        self.train_df = df.iloc[train_index]
        self.test_df = df.iloc[test_index]
        self.df = df

        # minibatch counter; floor division keeps the count integral on Python 3 as well
        self.i_minibatch = 0
        self.n_minibatches = n_epochs * len(self.train_df) // minibatch_size

        self.tqdm = tqdm_notebook(total=self.n_minibatches, leave=False) if tqdm else None

    @staticmethod
    def _resolve_indices(N, train_index, test_index, train_split, test_split):
        '''Return (train_index, test_index) from explicit indices or split fractions.'''

        def given(index):
            # Explicit length test: plain truthiness raises ValueError for
            # numpy arrays with more than one element.
            return index is not None and len(index) > 0

        def complement(index):
            return sorted(set(range(N)) - set(index))

        all_index = range(N)
        has_train, has_test = given(train_index), given(test_index)

        # given index
        if has_train and has_test:
            pass
        elif has_train:
            test_index = complement(train_index)
        elif has_test:
            train_index = complement(test_index)
        # given split percentage
        elif train_split and test_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            test_index = np.random.choice(complement(train_index), int(N * test_split), replace=False)
        elif train_split:
            train_index = np.random.choice(all_index, int(N * train_split), replace=False)
            test_index = complement(train_index)
        elif test_split:
            test_index = np.random.choice(all_index, int(N * test_split), replace=False)
            train_index = complement(test_index)
        else:
            raise Exception("Either specify index, or specify split.")

        return train_index, test_index

    @staticmethod
    def _split_xy(frame):
        '''Return (features, label) arrays; the label is reshaped to (-1, 1).'''
        return frame.iloc[:, :-1].values, frame.iloc[:, -1].values.reshape(-1, 1)

    def minibatch(self):
        '''Return a random (features, label) minibatch; raise StopIteration when exhausted.'''
        # `>=` so that exactly n_minibatches are served; the counter is reset
        # so the iterator can be reused afterwards.
        if self.i_minibatch >= self.n_minibatches:
            self.i_minibatch = 0
            raise StopIteration
        self.i_minibatch += 1

        if getattr(self, 'tqdm', None) is not None:
            self.tqdm.update(1)

        index = np.random.choice(len(self.train_df), self.minibatch_size, replace=False)
        return self._split_xy(self.train_df.iloc[index])

    @property
    def train_data(self):
        return self._split_xy(self.train_df)

    @property
    def test_data(self):
        return self._split_xy(self.test_df)

    @property
    def i_epoch(self):
        # fractional number of epochs served so far
        return float(self.i_minibatch) * self.n_epochs / self.n_minibatches

In [None]:
# iterable to list (low performance, please avoid!)

def is_iterable(L):
    '''
    True if L should be descended into by flatten / recursive_map.

    Strings and bytes count as scalars: on Python 3 they expose __iter__,
    and iterating a one-character string yields itself, which would make
    the recursive helpers below recurse forever.
    '''
    return hasattr(L, '__iter__') and not isinstance(L, (str, bytes))

def to_iterable(L):
    '''
    Convert L to (nested) plain Python lists.

    numpy arrays / scalars use .tolist(), DataFrames use .values.tolist(),
    any other iterable is copied level by level.

    Raises ValueError if L is not iterable.
    '''
    if isinstance(L, (np.ndarray, np.generic)):
        return L.tolist()
    if isinstance(L, pd.DataFrame):
        return L.values.tolist()
    if is_iterable(L):
        return recursive_map(lambda x:x, L)
    raise ValueError("not iterable: %s" % type(L).__name__)

def flatten(L):
    '''Return a flat list of all non-iterable leaves of L, depth-first ([] for empty input).'''
    flat = []
    for item in L:
        if is_iterable(item):
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat

def recursive_map(func, L):
    '''Apply func to every leaf of L; returns nested lists with the same nesting as L.'''
    # A list comprehension rather than map(): map() is lazy on Python 3 and
    # would return nested iterators instead of lists.
    return [recursive_map(func, item) if is_iterable(item) else func(item) for item in L]

def get_index(item, L, index_unexpected=-1, random_unexpected=0.):
    '''
    Position of the first occurrence of item in L, or index_unexpected.

    index_unexpected is returned when item is not in L, and also at random
    whenever a uniform draw from [0, 1) falls below random_unexpected.
    '''
    # The random draw is made on every call, before the membership test.
    forced_unexpected = np.random.rand() < random_unexpected
    if forced_unexpected or item not in L:
        return index_unexpected
    # argmax over the boolean mask gives the first matching position.
    matches = np.array(L) == item
    return np.argmax(matches)

def get_value(index, L, value_unexpected_index=None, index_unexpected=-1):
    '''
    Return L[index], unless index equals the index_unexpected sentinel,
    in which case value_unexpected_index is returned instead.
    '''
    if index == index_unexpected:
        return value_unexpected_index
    return L[index]


In [None]:
# low performance, please avoid!

class LabelEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    '''
    sklearn.preprocessing.LabelEncoder with irregular 2D array, unexpected index or value, and random -1 return.

    Relies on the module-level helpers flatten, recursive_map, get_index and get_value.

    Parameters:
    index_unexpected       -- code produced by transform() for a label not seen in fit().
    random_unexpected      -- threshold passed to get_index: a label is encoded as
                              index_unexpected whenever a uniform [0, 1) draw falls below it.
    value_unexpected_index -- label produced by inverse_transform() for the code index_unexpected.
    '''

    def __init__(self, index_unexpected=-1, random_unexpected=0., value_unexpected_index=None):
        self.index_unexpected = index_unexpected
        self.random_unexpected = random_unexpected
        self.value_unexpected_index = value_unexpected_index

    def fit(self, y):
        '''Learn the sorted unique labels of the (possibly nested) y. Returns self.'''
        self.classes_ = np.unique(flatten(y))
        return self

    def transform(self, y):
        '''Replace every label in y by its position in classes_, keeping the nesting.'''
        encode = functools.partial(
            get_index,
            L=self.classes_,
            index_unexpected=self.index_unexpected,
            random_unexpected=self.random_unexpected)
        return recursive_map(encode, y)

    def inverse_transform(self, y):
        '''Replace every code in y by the label it stands for, keeping the nesting.'''
        # get_value takes no random_unexpected argument; forward the
        # configured placeholder for the unexpected code instead.
        decode = functools.partial(
            get_value,
            L=self.classes_,
            value_unexpected_index=self.value_unexpected_index,
            index_unexpected=self.index_unexpected)
        return recursive_map(decode, y)
    