In [65]:
import numpy as np
import pandas as pd
import csv
from math import factorial
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import time
from sklearn.utils import shuffle

In [66]:
def pos(i,prod,L):
    '''
    Compute positive function and gradient information
    
    input:
        i - index of function
        t - iteration
        prod - wt*xt
        
    output:
        fpt - positive function value
        gfpt - positive function gradient
    '''
    fpt = 0.0 
    gfpt = 0.0 
    fpt = (L/2+prod)**i 
    gfpt = i*(L/2+prod)**(i-1) # no xt yet!
    hfpt = i*(i-1)*(L/2+prod)**(i-2)
    return fpt,gfpt,hfpt               

In [67]:
def comb(n, k):
    '''
    Compute combination
    
    input:
        n - total number
        k - number of chosen
    
    output:
        c - number of combination
    '''
    return factorial(n) / factorial(k) / factorial(n - k)

In [68]:
def neg(loss,i,prod,L):
    '''
    Compute negative function and gradient information
    
    input:
        loss - loss function
        i - index of function
        t - iteration
        prod - wt*xt
        
    output:
        fnt - negative function value
        gfnt - negative function gradient
    '''
    fnt = 0.0 # n stands for negative
    gfnt = 0.0
    hfnt = 0.0
    for k in range(i,N+1):
        # compute forward difference
        delta = 0.0
        for j in range(k+1):
            delta += (-1)**(k-j)*comb(k,j)*loss(j/N)
        # compute coefficient
        beta = comb(N,k)*comb(k,i)*(N+1)*delta/(2*L)**k
        # compute function value
        fnt += beta*(L/2-prod)**(k-i)
        # compute gradient
        gfnt += beta*(k-i)*(L/2-prod)**(k-i-1)  # no xt yet!
        # compute hessian
        hfnt += beta*(k-i)*(k-i-1)*(L/2-prod)**(k-i-2)
    return fnt,gfnt,hfnt

In [69]:
def w_grad(gfpt,gfnt,yt,at,bt,alphat):
    '''
    Gradient with respect to w
    
    input:
        fpt - positive function at t
        gfpt - positive function gradient at t
        fnt - negative function at t
        gfnt - negative function gradient at t
        yt - sample label at t
        pt - p at t
        at - a at t
        bt - b at t
        alphat - alpha at t
    output:
        gradwt - gradient w.r.t. w at t
    '''
    gradwt = 0.0
    if yt == 1:
        gradwt = 2*(alphat - at)*gfpt
    else:
        gradwt = 2*(alphat - bt)*gfnt
    return gradwt

In [70]:
def w_hess(hfpt,hfnt,yt,at,bt,alphat):
    hesswt = 0.0
    if yt == 1:
        hesswt = 2*(alphat - at)*hfpt
    else:
        hesswt = 2*(alphat - bt)*hfnt
    return hesswt

In [71]:
def proj(wt,R):
    '''
    Projection
    
    input:
        wt - w at t
        R - radius
        
    output:
        proj - projected wt
    '''
    norm = np.linalg.norm(wt)
    if norm > R:
        wt = wt/norm*R
    return wt

In [72]:
def a_grad(fpt,yt,at):
    '''
    Gradient with respect to a
    
    input:
        fpt - positive function at t
        yt - sample label at t
        pt - p at t
        at - a at t
    
    output:
        gradat - gradient w.r.t a at t
    '''
    gradat = 0.0 
    if yt == 1:
        gradat = 2*(at - fpt)
    else:
        gradat = 2*at
    return gradat

In [73]:
def b_grad(fnt,yt,bt):
    '''
    Gradient with respect to b
    
    input:
        fnt - negative function at t
        yt - sample label at t
        pt - p at t
        bt - b at t
    
    output:
        gradbt - gradient w.r.t b at t
    '''
    gradbt = 0.0 
    if yt == 1:
        gradbt = 2*bt
    else:
        gradbt = 2*(bt - fnt)
    return gradbt

In [74]:
def alpha_grad(fpt,fnt,yt,alphat):
    '''
    Gradient with respect to alpha
    '''
    gradalphat = 0.0
    if yt == 1:
        gradalphat = -2*(alphat - fpt)
    else:
        gradalphat = -2*(alphat - fnt)
    return gradalphat

In [75]:
def loader(filename):
    '''
    Data file loader
    
    input:
        filename - filename
    
    output:
        x - sample features
        y - sample labels
    '''
    # raw data
    L = []
    with open(filename,'r') as file:
        for line in csv.reader(file, delimiter = ' '):
            line[0] = '0:'+line[0]
            line.remove('')
            L.append(dict(i.split(':') for i in line))
    df = pd.DataFrame(L,dtype=float).fillna(0)
    X = df.iloc[:,1:].values
    Y = df.iloc[:,0].values
    # centralize
    mean = np.mean(X,axis=1)
    #X = (X.transpose() - mean).transpose()
    # normalize
    norm = np.linalg.norm(X,axis=1)
    X = X/norm[:,None]
    # convert to binary class
    r = np.ptp(Y)
    index = np.argwhere(Y<r//2)
    INDEX = np.argwhere(Y>=r//2)
    Y[index] = -1
    Y[INDEX] = 1
    Y = Y.astype(int)
    return X,Y

In [76]:
def SOLAM(t,loss,batch,X,Y,L,lam,M,wt,at,bt,alphat):
    '''
    Stochastic Online AUC Maximization step
    
    input:
        T - total number of iteration
        F - objective function value
        loss - loss function
        pt - p at t
        wt - w at t
        at - a at t
        bt - b at t
        alphat - alpha at t
    output:
        W - record of each wt
        A - record of each at
        B - record of each bt
        ALPHA - record of each alphat
    '''
    # Loop in the batch
    peta = 1/t/M
    deta = np.sqrt(np.log(T)/T)
    for k in range(batch):
        
        # Update wt,at,bt
        prod = np.dot(wt,X[k])
        fpt = np.zeros(N+1)
        gfpt = np.zeros(N+1)
        fnt = np.zeros(N+1)
        gfnt = np.zeros(N+1)
        gradwt = 0.0
        gradat = 0.0
        gradbt = 0.0
        gradalphat = 0.0
        
        for i in range(N+1): # add up info of each i
            fpt[i],gfpt[i] = pos(i,prod,L) # partial info
            fnt[i],gfnt[i] = neg(loss,i,prod,L)
            gradwt += w_grad(gfpt[i],gfnt[i],Y[k],at[i],bt[i],alphat[i])
            gradat = a_grad(fpt[i],Y[k],at[i])
            gradbt = b_grad(fnt[i],Y[k],bt[i])
            gradalphat = alpha_grad(fpt[i],fnt[i],Y[k],alphat[i])
            at[i] -= deta*gradat/(N+1)/batch
            bt[i] -= deta*gradbt/(N+1)/batch
            alphat[i] += deta*gradalphat/(N+1)/batch
        
        wt = wt - peta*(gradwt*Y[k]*X[k]/(N+1)/batch + lam*wt) # step size as 1/t gradient descent
        
    wt = proj(wt,L/2)    
        
    return wt,at,bt,alphat

In [90]:
def prox(eta,loss,x,y,L,gamma,lam,wj,aj,bj,alphaj,bwt,bat,bbt,balphat):
    '''
    perform proximal guided gradient descent when receive an sample
    '''
    prod = np.inner(wj,x)
    fpt = np.zeros(N+1)
    gfpt = np.zeros(N+1)
    hfpt = np.zeros(N+1)
    fnt = np.zeros(N+1)
    gfnt = np.zeros(N+1)
    hfnt = np.zeros(N+1)
    gradwt = 0.0
    gradat = 0.0
    gradbt = 0.0
    gradalphat = 0.0
    hesswt = 0.0
    for i in range(N+1):
        fpt[i],gfpt[i],hfpt[i] = pos(i,prod,L)
        fnt[i],gfnt[i],hfnt[i] = neg(loss,i,prod,L)
        gradwt += w_grad(gfpt[i],gfnt[i],y,aj[i],bj[i],alphaj[i])# accumulate i
        hesswt += w_hess(hfpt[i],hfnt[i],y,aj[i],bj[i],alphaj[i])
        gradat = a_grad(fpt[i],y,aj[i])
        gradbt = b_grad(fnt[i],y,bj[i])
        gradalphat = alpha_grad(fpt[i],fnt[i],y,alphaj[i])
        aj[i] = aj[i] - eta*(gradat/(N+1)+gamma*(aj[i]-bat[i]))
        bj[i] = bj[i] - eta*(gradbt/(N+1)+gamma*(bj[i]-bbt[i]))
        alphaj[i] = alphaj[i] + eta*gradalphat/(N+1)
    hessian = hesswt*np.outer(x,x)
    # eigen,_ = np.linalg.eig(hessian)
    
    # print('minimum eigenvalue: %f' %(np.min(eigen)))
    wj = wj - eta*(gradwt*x*y/(N+1) + lam*wj + gamma*(wj - bwt))
    wj = proj(wj,L/2)
    #aJ = proj(aJ,1)
    #bJ = proj(bJ,1)
    #alphaJ = proj(alphaJ,1)
    
    return wj,aj,bj,alphaj

In [91]:
def PGSPD(t,loss,X,Y,L,gamma,lam,M,bwt,bat,bbt,balphat):
    '''
    Proximally Guided Stochastic Primal Dual Algorithm
    '''
    
    # initialize inner loop variables
    Wt = bwt+0.0
    At = bat+0.0
    Bt = bbt+0.0
    ALPHAt = balphat+0.0
    
    BWt = Wt+0.0
    BAt = At+0.0
    BBt = Bt+0.0
    BALPHAt = ALPHAt+0.0
    
    ETAt = 1/t/M # M is the bound for gradient
    # inner loop update at j
    for j in range(t): 
        # update inner loop variables
        Wt,At,Bt,ALPHAt = prox(ETAt,loss,X[j],Y[j],L,gamma,lam,Wt,At,Bt,ALPHAt,bwt,bat,bbt,balphat)
        BWt += Wt
        BAt += At
        BBt += Bt
        BALPHAt += ALPHAt
        
    # update outer loop variables
    bwt = BWt/t
    bat = BAt/t
    bbt = BBt/t
    balphat = BALPHAt/t
    
    return bwt,bat,bbt,balphat

In [92]:
def split(FEATURES,LABELS,folder,folders):
    
    if folder > folders:
        print('Exceed maximum folders!')
        return
    # load and split data
    #FEATURES,LABELS = loader(dataset)
    n,d = FEATURES.shape
    # regular portion of each folder
    portion = round(n/folders)
    start = portion*folder
    stop = portion*(folder+1)
    if np.abs(stop - n) < portion: # remainder occurs
        X_train = FEATURES[:start,:]
        Y_train = LABELS[:start]
        X_test = FEATURES[start:,:]
        Y_test = LABELS[start:]
    else:
        mask = np.ones(n, bool)
        mask[start:stop] = False
        X_train = FEATURES[mask,:]
        Y_train = LABELS[mask]
        X_test = FEATURES[start:stop]
        Y_test = LABELS[start:stop]
    # get dimensions of the data
    n,_ = X_train.shape
    # number of epoch
    epoch = T//n+1
    # augment by epoch
    #X_train_augmented = np.tile(X_train,(epoch,1)) # might have memory burden
    #Y_train_augmented = np.tile(Y_train,epoch)
    
    #return X_train_augmented,X_test,Y_train_augmented,Y_test
    return X_train,X_test,Y_train,Y_test

In [93]:
def demo(X_train_augmented,X_test,Y_train_augmented,Y_test,loss,alg,L = 2.0,gamma=0.0,lam=0.0,M=1.0,WT=0,AT=0,BT=0,ALPHAT=0):
    '''
    Run it to get results
    '''
    # define loss function
    if loss == 'hinge':
        loss = lambda x:max(0,1+L-2*L*x)
    elif loss == 'logistic':
        loss = lambda x:np.log(1+np.exp(L-2*L*x))
    else:
        print('Wrong loss function!')
        return
    
    # get dimensions of the data
    num,d = X_train_augmented.shape
    
    # initialize outer loop variables
    if type(WT) == int:
        WT = np.random.rand(d) # d is the dimension of the features
        AT = np.random.rand(N+1)
        BT = np.random.rand(N+1)
        ALPHAT = np.random.rand(N+1)

    # record auc
    roc_auc = np.zeros(T)
    # record time elapsed
    start_time = time.time()
    for t in range(1,T+1):
        
        if alg == 'PGSPD':
            if t<num:
                begin = (t*(t-1)//2)%num
                end = (t*(t+1)//2)%num
                if begin < end:
                    x_train = X_train_augmented[begin:end]
                    y_train = Y_train_augmented[begin:end]
                else: # need to think better
                    #s2 = time.time()
                    x_train = np.append(X_train_augmented[begin:],X_train_augmented[:end],axis=0)
                    y_train = np.append(Y_train_augmented[begin:],Y_train_augmented[:end],axis=0)
                    #e2 = time.time() - s2
                    #print('append time : %f' %(e1))
                x_train, y_train = shuffle(x_train,y_train)
                # update outer loop variables
                WT,AT,BT,ALPHAT = PGSPD(t,loss,x_train,y_train,L,gamma,lam,M,WT,AT,BT,ALPHAT)
            else:
                x_train, y_train = shuffle(X_train_augmented,Y_train_augmented)
                WT,AT,BT,ALPHAT = PGSPD(num,loss,x_train,y_train,L,gamma,lam,M,WT,AT,BT,ALPHAT)
            '''
            # sample a point
            begin = ((t-1)*num//batch)%num
            end = (t*num//batch)%num
            if begin < end:
                x_train = X_train_augmented[begin:end]
                y_train = Y_train_augmented[begin:end]
            else: # need to think better
                x_train = np.append(X_train_augmented[begin:],X_train_augmented[:end],axis=0)
                y_train = np.append(Y_train_augmented[begin:],Y_train_augmented[:end],axis=0)
            # update outer loop variables
            WT,AT,BT,ALPHAT = PGSPD(num//batch,loss,x_train,y_train,L,gamma,lam,M,WT,AT,BT,ALPHAT)
            '''
        elif alg == 'SOLAM':
            # sample a point
            begin = (t-1)*batch%num
            end = t*batch%num
            if begin < end:
                x_train = X_train_augmented[begin:end]
                y_train = Y_train_augmented[begin:end]
            else: # need to think better
                x_train = np.append(X_train_augmented[begin:],X_train_augmented[:end],axis=0)
                y_train = np.append(Y_train_augmented[begin:],Y_train_augmented[:end],axis=0)
            WT,AT,BT,ALPHAT = SOLAM(t,loss,batch,x_train,y_train,L,lam,M,WT,AT,BT,ALPHAT)
            
        fpr, tpr, _ = roc_curve(Y_test, np.dot(X_test,WT))
        roc_auc[t-1] = auc(fpr, tpr)
        if t%100 == 0:
            elapsed_time = time.time() - start_time
            print('iteration: %d AUC: %f time eplapsed: %f' %(t,roc_auc[t-1],elapsed_time))
            start_time = time.time()
    
    return WT,AT,BT,ALPHAT,roc_auc

In [94]:
def cv(dataset,loss,alg,folders,L = 2.0,gamma=0.0,lam=0.0,M=1.0,WT=0,AT=0,BT=0,ALPHAT=0):
    '''
    Cross validation
    '''
    # Load data set
    FEATURES,LABELS = loader(dataset) 
    
    # record auc
    AUC_ROC = np.zeros(folders)
    
    # cross validation
    for folder in range(folders):
        print('folder = %d' %(folder))
        X_train_augmented,X_test,Y_train_augmented,Y_test = split(FEATURES,LABELS,folder,folders)
        
        _,_,_,_,roc_auc = demo(X_train_augmented,X_test,Y_train_augmented,Y_test,loss,alg,L=L,gamma=gamma,lam=lam,M=M,WT=WT,AT=AT,BT=BT,ALPHAT=ALPHAT)
        AUC_ROC[folder] = np.max(roc_auc)
    print('auc score: %f +/- %f' %(np.mean(AUC_ROC),np.std(AUC_ROC)))
    return AUC_ROC

In [101]:
def single_run(para):
    folder,l,gamma,lam,m,paras = para
    X_train_augmented,X_test,Y_train_augmented, Y_test,loss,alg = paras
    _,_,_,_,roc_auc = demo(X_train_augmented,
                           X_test,
                           Y_train_augmented,
                           Y_test,
                           loss,
                           alg,
                           L=L[l],gamma=GAMMA[gamma],lam=LAM[lam],M=M[m])
    return folder,l,gamma,lam,m, np.max(roc_auc)

def removed():
    
    # cross validation
    for folder in range(folders):
        print('folder = %d' %(folder))
        X_train_augmented,X_test,Y_train_augmented,Y_test = split(FEATURES,LABELS,folder,folders)
        
        # bloody grid search!
        for l in range(len(L)):
            for gamma in range(len(GAMMA)):
                for lam in range(len(LAM)):
                    for m in range(len(M)):
                        print('current parameters: L = %.2f, GAMMA = %.2f, LAM = %.2f, M = %.2f' %(L[l],GAMMA[gamma],LAM[lam],M[m]))
    
def gs(dataset,loss,alg,folders,L=[2.0],GAMMA=[0.0],LAM=[0.0],M=[1.0],WT=0,AT=0,BT=0,ALPHAT=0):
    '''
    Grid search! Wuss up fellas?!
    '''
    # Load data set
    from itertools import product
    import multiprocessing
    FEATURES,LABELS = loader(dataset) 
    # record auc
    num_cpus = 2
    AUC_ROC = np.zeros((folders,len(L),len(GAMMA),len(LAM),len(M)))
    input_paras = []
    for folder in range(folders):
        X_train_augmented,X_test,Y_train_augmented,Y_test = split(FEATURES,LABELS,folder,folders)
        paras = X_train_augmented,X_test,Y_train_augmented, Y_test,loss,alg
        for l,gamma,lam,m in product(range(len(L)),range(len(GAMMA)),range(len(LAM)),range(len(M))):
            input_paras.append((folder,l,gamma,lam,m,paras))
    print('how many paras: %d' % len(input_paras))
    pool = multiprocessing.Pool(processes=num_cpus)
    results_pool = pool.map(single_run,input_paras)
    pool.close()
    pool.join()
    for folder,l,gamma,lam,m, auc in results_pool:
        AUC_ROC[folder][l][gamma][lam][m] = auc
                        
    return AUC_ROC

In [102]:
N=10
T=200
batch=2

In [103]:
L=[2.0,20.0]
GAMMA = [0.1,1.0]
LAM = [0.1,1.0]
M = [0.1,1.0]

In [104]:
T = 100
diabetes_hinge = gs('diabetes','hinge','PGSPD',5,L,GAMMA,LAM,M)

how many paras: 80
iteration: 100 AUC: 0.752037 time eplapsed: 14.868645
iteration: 100 AUC: 0.771296 time eplapsed: 14.957772
iteration: 100 AUC: 0.746852 time eplapsed: 15.330565
iteration: 100 AUC: 0.749074 time eplapsed: 15.406876
iteration: 100 AUC: 0.751111 time eplapsed: 15.682979
iteration: 100 AUC: 0.770926 time eplapsed: 15.830920
iteration: 100 AUC: 0.655185 time eplapsed: 14.858499
iteration: 100 AUC: 0.781852 time eplapsed: 14.759761
iteration: 100 AUC: 0.740556 time eplapsed: 15.968044
iteration: 100 AUC: 0.741111 time eplapsed: 16.001140
iteration: 100 AUC: 0.746111 time eplapsed: 15.494668
iteration: 100 AUC: 0.719815 time eplapsed: 15.516332
iteration: 100 AUC: 0.681771 time eplapsed: 16.279061
iteration: 100 AUC: 0.751852 time eplapsed: 16.297339
iteration: 100 AUC: 0.748958 time eplapsed: 15.454711
iteration: 100 AUC: 0.713519 time eplapsed: 15.437275
iteration: 100 AUC: 0.715104 time eplapsed: 15.585638
iteration: 100 AUC: 0.760185 time eplapsed: 15.561741
iteration

In [105]:
diabetes_hinge

array([[[[[0.81      , 0.78074074],
          [0.80759259, 0.81148148]],

         [[0.79592593, 0.785     ],
          [0.78944444, 0.77148148]]],


        [[[0.78944444, 0.7787037 ],
          [0.79555556, 0.79925926]],

         [[0.76722222, 0.78666667],
          [0.78962963, 0.78388889]]]],



       [[[[0.78003472, 0.78402778],
          [0.75902778, 0.74878472]],

         [[0.74826389, 0.77048611],
          [0.76510417, 0.73680556]]],


        [[[0.75225694, 0.74305556],
          [0.76440972, 0.75190972]],

         [[0.73350694, 0.75034722],
          [0.74809028, 0.73402778]]]],



       [[[[0.82962561, 0.85530837],
          [0.82691264, 0.84861639]],

         [[0.79291011, 0.74064026],
          [0.77192982, 0.68204015]]],


        [[[0.79761259, 0.77482366],
          [0.82166757, 0.83034907]],

         [[0.76867426, 0.76505697],
          [0.71260626, 0.75438596]]]],



       [[[[0.82871422, 0.85294563],
          [0.82886567, 0.832147  ]],

         [[0.7722247