In [1]:
import numpy as np
import pandas as pd
import torch
from math import factorial

In [2]:
# Read data from file and separate x and y
def loader(filename):
    '''
    Data file loader

    Every line of the file is split on runs of whitespace or on ':'.
    Column 0 is taken as the label; every second column starting at
    column 2 is taken as a feature, and the columns in between are dropped.
    NOTE(review): this presumably targets the LIBSVM "label index:value ..."
    layout, in which case the dropped columns are the feature indices and
    every row must list the same features in the same order - confirm.
    
    input:
        filename - filename
    
    output:
        x - sample features (one row per sample)
        y - sample labels (one entry per sample)
    '''
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # newer Python versions warn about. The pattern itself is unchanged.
    raw_df = pd.read_csv(filename,header=None,sep = r'\s+|:',engine='python')
    y = torch.tensor(raw_df[0].values)
    x = torch.tensor(raw_df[raw_df.columns[2::2]].values)
    return x,y

In [3]:
# Load the data set stored under the relative path 'diabetes'.
# x (features) and y (labels) are module-level names that SOLAM reads directly.
x,y = loader('diabetes')

In [4]:
L = 1.0 # range parameter: pos/neg take powers of (L/2 + prod) and (L/2 - prod), neg also scales by (2*L)**k
N = 10 # maximum degree: the positive/negative functions are indexed i = 0..N

In [5]:
def pos(i,t,prod):
    '''
    Compute positive function and gradient information

    Evaluates (L/2 + prod)**i and its derivative with respect to prod,
    where L is the module-level range constant.
    
    input:
        i - index of function (the exponent)
        t - iteration (currently unused)
        prod - wt*xt
        
    output:
        fpt - positive function value
        gfpt - positive function gradient w.r.t. prod (caller multiplies by xt)
    '''
    base = L/2+prod
    fpt = base**i
    if i == 0:
        # The function is the constant 1, so its derivative is exactly 0.
        # Evaluating 0*base**(-1) instead would divide by zero when base == 0.
        # Scaling fpt (all ones) by 0 keeps the type/shape of prod.
        gfpt = 0.0*fpt
    else:
        gfpt = i*base**(i-1) # no xt yet!
    return fpt,gfpt

In [6]:
# Can be rewritten as a lambda function
def hinge(x):
    '''
    hinge loss function

    Returns x when it is strictly positive and 0 otherwise.
    
    input:
        x - x value
    
    output:
        phi - hinge loss value
    '''
    if x > 0:
        return x
    return 0

In [7]:
def comb(n, k):
    '''
    Compute combination

    The quotient is formed with exact integer arithmetic and only the final
    result is converted to float. Dividing the factorials with '/' step by
    step rounds intermediate values and overflows as soon as factorial(n)
    exceeds the float range, even when the result itself is small.
    
    input:
        n - total number
        k - number of chosen
    
    output:
        c - number of combination (as a float)
    '''
    return float(factorial(n) // (factorial(k) * factorial(n - k)))

In [9]:
def neg(loss,i,t,prod):
    '''
    Compute negative function and gradient information

    Sums, for k = i..N, the terms beta_k*(L/2 - prod)**(k-i), where beta_k is
    built from binomial coefficients and the k-th forward difference of the
    loss on the grid 0, 1/N, ..., k/N. N and L are module-level constants and
    comb is the module-level combination helper.
    
    input:
        loss - loss function
        i - index of function
        t - iteration (currently unused)
        prod - wt*xt
        
    output:
        fnt - negative function value
        gfnt - negative function gradient (caller multiplies by xt)
    '''
    base = L/2-prod # loop invariant
    fnt = 0.0 # n stands for negative
    gfnt = 0.0
    for k in range(i,N+1):
        # compute forward difference
        # The loss has to be sampled at the running grid point j/N. Sampling
        # it at the fixed point i/N makes the alternating binomial sum cancel
        # to zero for every k >= 1.
        delta = 0.0
        for j in range(k+1):
            delta += (-1)**(k-j)*comb(k,j)*loss(j/N)
        # compute coefficient
        beta = comb(N,k)*comb(k,i)*(N+1)*delta/(2*L)**k
        # compute function value
        term = beta*base**(k-i)
        fnt += term
        # compute gradient
        # NOTE(review): d/dprod of (L/2-prod)**(k-i) carries a factor -1 that
        # is not applied here - confirm the caller accounts for the sign.
        if k > i:
            gfnt += beta*(k-i)*base**(k-i-1)  # no xt yet!
        else:
            # Constant term: derivative is 0. Avoids base**(-1), which divides
            # by zero when base == 0, while keeping the shape of prod.
            gfnt += 0.0*term
    return fnt,gfnt

In [10]:
def p_hat(t,yt,ptm1):
    '''
    Approximate probability

    Running average over t samples of the quantity (yt+1)/2, which is 1 for
    yt = 1 and 0 for yt = -1.
    
    input:
        t - iteration
        yt - label at t
        ptm1 - p at t-1 (m stands for minus)
    
    output:
        pt - p at t
    '''
    carried = (t-1)*ptm1   # previous average weighted by the t-1 earlier samples
    hit = (yt+1)/2         # contribution of the current label
    return (carried + hit)/t

In [11]:
def a_hat(t,fpt,yt,ptm1,atm1):
    '''
    Approximate primal a

    Combines the current positive function value, weighted by (yt+1)/2,
    with the previous estimate weighted by (t-1)*ptm1, then divides by t.
    
    input:
        t - iteration
        fpt - positive function at t
        yt - sample label at t
        ptm1 - p at t-1 (pass the value from before the p update)
        atm1 - a at t-1
    
    output:
        at - a at t
    '''
    current = fpt*((yt+1)/2)
    history = (t-1)*ptm1*atm1
    return (current + history)/t

In [12]:
def b_hat(t,fmt,yt,ptm1,btm1):
    '''
    Approximate primal b

    Combines the current negative function value, weighted by (-yt+1)/2
    (1 for yt = -1, 0 for yt = 1), with the previous estimate weighted by
    (t-1)*(1-ptm1), then divides by t.
    
    input:
        t - iteration
        fmt - negative function at t
        yt - sample label at t
        ptm1 - p at t-1
        btm1 - b at t-1
    
    output:
        bt - b at t
    '''
    current = fmt*((-yt+1)/2)
    history = (t-1)*(1-ptm1)*btm1
    return (current + history)/t

In [13]:
def alpha_step(t,at,bt):
    '''
    Compute dual alpha

    Returns the sum of the two primal estimates; t is accepted but not used.
    
    input:
        t - iteration
        at - a at t
        bt - b at t
        
    output:
        alphat - alpha at t
    '''
    return at + bt

In [16]:
def w_grad(fpt,gfpt,fnt,gfnt,yt,pt,at,bt,alphat):
    '''
    Gradient with respect to w

    Uses the positive function terms weighted by (1-pt) when yt == 1 and
    the negative function terms weighted by pt otherwise.
    
    input:
        fpt - positive function at t
        gfpt - positive function gradient at t
        fnt - negative function at t
        gfnt - negative function gradient at t
        yt - sample label at t
        pt - p at t
        at - a at t
        bt - b at t
        alphat - alpha at t
    output:
        gradwt - gradient w.r.t. w at t
    '''
    if yt == 1:
        dual_part = 2*alphat*(1-pt)*gfpt
        centred_part = 2*(1-pt)*(fpt-at)*gfpt
        plain_part = 2*(1-pt)*fpt*gfpt
    else:
        dual_part = 2*alphat*pt*gfnt
        centred_part = 2*pt*(fnt-bt)*gfnt
        plain_part = 2*pt*fnt*gfnt
    return dual_part + centred_part - plain_part

In [17]:
def a_grad(fpt,yt,pt,at):
    '''
    Gradient with respect to a

    Non-zero only for samples with yt == 1; every other label yields 0.0.
    
    input:
        fpt - positive function at t
        yt - sample label at t
        pt - p at t
        at - a at t
    
    output:
        gradat - gradient w.r.t a at t
    '''
    return -2*(1-pt)*(fpt-at) if yt == 1 else 0.0

In [18]:
def b_grad(fnt,yt,pt,bt):
    '''
    Gradient with respect to b

    Zero for samples with yt == 1; every other label contributes
    -2*pt*(fnt-bt).
    
    input:
        fnt - negative function at t
        yt - sample label at t
        pt - p at t
        bt - b at t
    
    output:
        gradbt - gradient w.r.t b at t
    '''
    if yt == 1:
        return 0.0
    return -2*pt*(fnt-bt)

In [19]:
def SOLAM(T,loss,pt = 0.0,wt = 0.0,at = 0.0,bt = 0.0,alphat = 0.0):
    '''
    Stochastic Online AUC Maximization

    Processes the first T samples of the module-level tensors x (features)
    and y (labels) in order, one sample per iteration. Also relies on the
    module-level constant N and on the helpers p_hat, pos, neg, w_grad,
    a_grad and b_grad.
    
    input:
        T - total number of iteration (must not exceed the number of samples)
        loss - loss function
        pt - initial p
        wt - initial w; a scalar (used for every feature) or a vector with
             one entry per feature
        at - initial a
        bt - initial b
        alphat - initial alpha
    output:
        W - record of each wt, one row per iteration and one column per feature
        A - record of each at
        B - record of each bt
        ALPHA - record of each alphat
    '''
    # View the features as (samples, features) and pick a floating dtype so
    # the dot product below is well defined.
    feats = torch.as_tensor(x)
    feats = feats.reshape(feats.shape[0],-1)
    dtype = feats.dtype if feats.is_floating_point() else torch.get_default_dtype()
    d = feats.shape[1]

    # w is a vector with one weight per feature. Adding to a fresh zero
    # vector broadcasts a scalar start value and avoids aliasing the
    # caller's tensor.
    wt = torch.zeros(d,dtype=dtype) + torch.as_tensor(wt,dtype=dtype)

    # Initialize records
    W = torch.zeros(T,d,dtype=dtype)
    A = torch.zeros(T)
    B = torch.zeros(T)
    ALPHA = torch.zeros(T)
    
    # Loop
    for t in range(1,T+1):
        # Iteration t consumes sample t-1: t is 1-based (it is used as a
        # divisor), the data is 0-based.
        xt = feats[t-1].to(dtype)
        yt = float(y[t-1])
        
        # Update pt
        pt = p_hat(t,yt,pt)
        
        # Update wt,at,bt
        prod = torch.dot(wt,xt) # scalar w.x, as expected by pos/neg
        gradwt = torch.zeros(d,dtype=dtype)
        gradat = 0.0
        gradbt = 0.0
        
        for i in range(N+1): # add up info of each i
            fpt,gfpt = pos(i,t,prod) # partial info
            fnt,gfnt = neg(loss,i,t,prod)
            # pos/neg return derivatives w.r.t. prod; the chain rule factor
            # xt turns them into gradients w.r.t. w.
            gradwt = gradwt + w_grad(fpt,gfpt*xt,fnt,gfnt*xt,yt,pt,at,bt,alphat)
            gradat = gradat + a_grad(fpt,yt,pt,at)
            gradbt = gradbt + b_grad(fnt,yt,pt,bt)
            
        wt = wt - 1/t*gradwt/(N+1) # step size as 1/t gradient descent
        at = at - 1/t*gradat/(N+1)
        bt = bt - 1/t*gradbt/(N+1)
        alphat = at+bt

        W[t-1] = wt
        A[t-1] = at
        B[t-1] = bt
        ALPHA[t-1] = alphat
    return W,A,B,ALPHA