# Brick algorithm

# Mathis

In [1]:
# Line search
import os
os.chdir('/Users/Linger/Desktop/ENSAE_MS/S1/Données_massives/Distributed_Coordinate_Descent_for_Logistic_Lasso')

import pandas as pd
data = pd.read_csv('Iris.csv')

data['Species'] = data['Species'].map({'Iris-virginica': 1, 'Iris-versicolor': 1, 'Iris-setosa':-1})
data = data.drop(['Id'], axis = 1)

In [2]:
# These are the data I use to compute the functional we want to minimize (i.e f(Beta))
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,-1
1,4.9,3.0,1.4,0.2,-1
2,4.7,3.2,1.3,0.2,-1
3,4.6,3.1,1.5,0.2,-1
4,5.0,3.6,1.4,0.2,-1


In [3]:
# Import modules
import numpy as np

In [4]:
# Here I create the data matrix X, vector of response Y and vector of coefficients beta
Y = data.Species.values
X = data[data.columns.difference(['Species'])].as_matrix()
beta = np.random.normal(size=4) # Here I simulate random coefficients from a normal 0,1 

$f(\beta) = L(\beta) + \lambda ||\beta||_1$ where $L(\beta) = \sum\limits_{i=1}^nlog(1+e^{-y_i\beta^Tx_i})$

In [5]:
def loss_function(beta):
    y_beta_x = np.multiply(Y,X.dot(beta))
    return sum(np.log(1+np.exp(-y_beta_x)))

In [6]:
def penalty(regularization):
    return regularization*np.linalg.norm(beta, ord=1)

In [7]:
# This is f(beta)
def functional(beta,regularization):
    return loss_function(beta)+penalty(regularization)

__Algorithm 3:__ Line search procedure:

1. If $\alpha = 1$ yields sufficient relative decrease in the objective, return $\alpha=1$

2. Find $\alpha_{init} = argmin_{\delta<\alpha<1} \text{  } f(\beta+\alpha\Delta\beta)$, $\delta >0$

3. Armijo rule: Let $\alpha$ be the largest element of the sequence $\{\alpha_{init}b^j\}_{j=0,1,...}$ satisfying

<h1><center>
$f(\beta+\alpha\Delta\beta) \leq f(\beta)+\alpha\sigma D$
</center></h1>

where $0<b<1$, $0<\sigma<1$, $0\leq\gamma<1$, and

<h1><center>
$D = \nabla L(\beta)^T \Delta\beta+\gamma\Delta\beta^T\tilde{H}\Delta\beta+\lambda(||\beta-\Delta\beta||_1-||\beta||_1)$
</center></h1>

return $\alpha$

What is difficult here is to compute D. In what follows, I explain how is computed $\nabla L(\beta)$ and $\tilde{H}$:

- $\nabla L(\beta) = \frac{dL(\beta)}{d\beta} = (\frac{dL(\beta)}{d\beta_1}, \frac{dL(\beta)}{d\beta_2},...) \text{  avec  } L(\beta) = \sum\limits_{i=1}^nlog(1+e^{-y_i\beta^Tx_i}) = \sum\limits_{i=1}^nlog(1+e^{-y_i(\beta_{1i}x_{1i}+\beta_{2i}x_{2i}+...)})$

So it comes that $\nabla L(\beta) = \bigg(\sum\limits_{i=1}^n\frac{-y_ix_{1i}e^{-y_i(\beta_{1i}x_{1i}+\beta_{2i}x_{2i}+...)}}{1+e^{-y_i(\beta_{1i}x_{1i}+\beta_{2i}x_{2i}+...)}}, \sum\limits_{i=1}^n\frac{-y_ix_{2i}e^{-y_i(\beta_{1i}x_{1i}+\beta_{2i}x_{2i}+...)}}{1+e^{-y_i(\beta_{1i}x_{1i}+\beta_{2i}x_{2i}+...)}},...\bigg) = \bigg(\sum\limits_{i=1}^n\frac{-y_ix_{1i}e^{-y_i\beta^Tx_i}}{1+e^{-y_i\beta^Tx_i}}, \sum\limits_{i=1}^n\frac{-y_ix_{2i}e^{-y_i\beta^Tx_i}}{1+e^{-y_i\beta^Tx_i}},...\bigg)$

In [8]:
def gradient_loss(beta):
    gradient = []
    for j in range(len(beta)):
        y_beta_x = Y*X.dot(beta)
        y_xj = Y*X[:,j]
        gradient_betaj = sum((-y_xj*np.exp(-y_beta_x))/(1+np.exp(-y_beta_x)))
        gradient.append(gradient_betaj)
    return np.array(gradient)

- $\tilde{H}$ is such that $(\tilde{H})_{jl} =
    \begin{cases}
    (\nabla^2L(\beta))_{jl},& \text{if } \exists m:j,l \in S_m\\
    0,              & \text{otherwise}
    \end{cases}$

To compute the $\tilde{H}$ matrix, we will suppose a partitioning over 2 machines. The partitionning is supposed to be given by a dictionnary.

In [26]:
partition = {1:['SepalLengthCm', 'SepalWidthCm'], 2:['PetalLengthCm', 'PetalWidthCm']}

To do this computation, we need first to define the second derivative of the loss between two variables $j$ and $l$. We will differenciate two cases: if $j \neq l$ and $j = l$.

    -  Case where j is different from l:
$(\nabla^2L(\beta))_{jl} = \sum\limits_{i=1}^n\frac{y_i^2x_{ji}x_{li}e^{-y_i\beta^Tx_i}(1+e^{-y_i\beta^Tx_i}) + y_ix_{li}e^{-y_i\beta^Tx_i}(-y_ix_{ji}e^{-y_i\beta^Tx_i})}{(1+e^{-y_i\beta^Tx_i})^2}$

In [None]:
def second_gradient_loss(beta,j,l):
    y_beta_x = Y*X.dot(beta)
    ysq_xj_xl = Y**2*X[:,j]*X[:,l])
    gradient_betajl = sum((ysq_xj_xl*np.exp(-y_beta_x)*(1+np.exp(-y_beta_x)) + )/(1+np.exp(-y_beta_x)))
    return gradient_betajl

In [38]:
Y

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [39]:
X[:,2]

array([ 5.1,  4.9,  4.7,  4.6,  5. ,  5.4,  4.6,  5. ,  4.4,  4.9,  5.4,
        4.8,  4.8,  4.3,  5.8,  5.7,  5.4,  5.1,  5.7,  5.1,  5.4,  5.1,
        4.6,  5.1,  4.8,  5. ,  5. ,  5.2,  5.2,  4.7,  4.8,  5.4,  5.2,
        5.5,  4.9,  5. ,  5.5,  4.9,  4.4,  5.1,  5. ,  4.5,  4.4,  5. ,
        5.1,  4.8,  5.1,  4.6,  5.3,  5. ,  7. ,  6.4,  6.9,  5.5,  6.5,
        5.7,  6.3,  4.9,  6.6,  5.2,  5. ,  5.9,  6. ,  6.1,  5.6,  6.7,
        5.6,  5.8,  6.2,  5.6,  5.9,  6.1,  6.3,  6.1,  6.4,  6.6,  6.8,
        6.7,  6. ,  5.7,  5.5,  5.5,  5.8,  6. ,  5.4,  6. ,  6.7,  6.3,
        5.6,  5.5,  5.5,  6.1,  5.8,  5. ,  5.6,  5.7,  5.7,  6.2,  5.1,
        5.7,  6.3,  5.8,  7.1,  6.3,  6.5,  7.6,  4.9,  7.3,  6.7,  7.2,
        6.5,  6.4,  6.8,  5.7,  5.8,  6.4,  6.5,  7.7,  7.7,  6. ,  6.9,
        5.6,  7.7,  6.3,  6.7,  7.2,  6.2,  6.1,  6.4,  7.2,  7.4,  7.9,
        6.4,  6.3,  6.1,  7.7,  6.3,  6.4,  6. ,  6.9,  6.7,  6.9,  5.8,
        6.8,  6.7,  6.7,  6.3,  6.5,  6.2,  5.9])

In [40]:
Y*X[:,2]

array([-5.1, -4.9, -4.7, -4.6, -5. , -5.4, -4.6, -5. , -4.4, -4.9, -5.4,
       -4.8, -4.8, -4.3, -5.8, -5.7, -5.4, -5.1, -5.7, -5.1, -5.4, -5.1,
       -4.6, -5.1, -4.8, -5. , -5. , -5.2, -5.2, -4.7, -4.8, -5.4, -5.2,
       -5.5, -4.9, -5. , -5.5, -4.9, -4.4, -5.1, -5. , -4.5, -4.4, -5. ,
       -5.1, -4.8, -5.1, -4.6, -5.3, -5. ,  7. ,  6.4,  6.9,  5.5,  6.5,
        5.7,  6.3,  4.9,  6.6,  5.2,  5. ,  5.9,  6. ,  6.1,  5.6,  6.7,
        5.6,  5.8,  6.2,  5.6,  5.9,  6.1,  6.3,  6.1,  6.4,  6.6,  6.8,
        6.7,  6. ,  5.7,  5.5,  5.5,  5.8,  6. ,  5.4,  6. ,  6.7,  6.3,
        5.6,  5.5,  5.5,  6.1,  5.8,  5. ,  5.6,  5.7,  5.7,  6.2,  5.1,
        5.7,  6.3,  5.8,  7.1,  6.3,  6.5,  7.6,  4.9,  7.3,  6.7,  7.2,
        6.5,  6.4,  6.8,  5.7,  5.8,  6.4,  6.5,  7.7,  7.7,  6. ,  6.9,
        5.6,  7.7,  6.3,  6.7,  7.2,  6.2,  6.1,  6.4,  7.2,  7.4,  7.9,
        6.4,  6.3,  6.1,  7.7,  6.3,  6.4,  6. ,  6.9,  6.7,  6.9,  5.8,
        6.8,  6.7,  6.7,  6.3,  6.5,  6.2,  5.9])

In [41]:
np.multiply(Y,X[:,2])

array([-5.1, -4.9, -4.7, -4.6, -5. , -5.4, -4.6, -5. , -4.4, -4.9, -5.4,
       -4.8, -4.8, -4.3, -5.8, -5.7, -5.4, -5.1, -5.7, -5.1, -5.4, -5.1,
       -4.6, -5.1, -4.8, -5. , -5. , -5.2, -5.2, -4.7, -4.8, -5.4, -5.2,
       -5.5, -4.9, -5. , -5.5, -4.9, -4.4, -5.1, -5. , -4.5, -4.4, -5. ,
       -5.1, -4.8, -5.1, -4.6, -5.3, -5. ,  7. ,  6.4,  6.9,  5.5,  6.5,
        5.7,  6.3,  4.9,  6.6,  5.2,  5. ,  5.9,  6. ,  6.1,  5.6,  6.7,
        5.6,  5.8,  6.2,  5.6,  5.9,  6.1,  6.3,  6.1,  6.4,  6.6,  6.8,
        6.7,  6. ,  5.7,  5.5,  5.5,  5.8,  6. ,  5.4,  6. ,  6.7,  6.3,
        5.6,  5.5,  5.5,  6.1,  5.8,  5. ,  5.6,  5.7,  5.7,  6.2,  5.1,
        5.7,  6.3,  5.8,  7.1,  6.3,  6.5,  7.6,  4.9,  7.3,  6.7,  7.2,
        6.5,  6.4,  6.8,  5.7,  5.8,  6.4,  6.5,  7.7,  7.7,  6. ,  6.9,
        5.6,  7.7,  6.3,  6.7,  7.2,  6.2,  6.1,  6.4,  7.2,  7.4,  7.9,
        6.4,  6.3,  6.1,  7.7,  6.3,  6.4,  6. ,  6.9,  6.7,  6.9,  5.8,
        6.8,  6.7,  6.7,  6.3,  6.5,  6.2,  5.9])

In [37]:
np.multiply(Y**2,X[:,2])

array([ 5.1,  4.9,  4.7,  4.6,  5. ,  5.4,  4.6,  5. ,  4.4,  4.9,  5.4,
        4.8,  4.8,  4.3,  5.8,  5.7,  5.4,  5.1,  5.7,  5.1,  5.4,  5.1,
        4.6,  5.1,  4.8,  5. ,  5. ,  5.2,  5.2,  4.7,  4.8,  5.4,  5.2,
        5.5,  4.9,  5. ,  5.5,  4.9,  4.4,  5.1,  5. ,  4.5,  4.4,  5. ,
        5.1,  4.8,  5.1,  4.6,  5.3,  5. ,  7. ,  6.4,  6.9,  5.5,  6.5,
        5.7,  6.3,  4.9,  6.6,  5.2,  5. ,  5.9,  6. ,  6.1,  5.6,  6.7,
        5.6,  5.8,  6.2,  5.6,  5.9,  6.1,  6.3,  6.1,  6.4,  6.6,  6.8,
        6.7,  6. ,  5.7,  5.5,  5.5,  5.8,  6. ,  5.4,  6. ,  6.7,  6.3,
        5.6,  5.5,  5.5,  6.1,  5.8,  5. ,  5.6,  5.7,  5.7,  6.2,  5.1,
        5.7,  6.3,  5.8,  7.1,  6.3,  6.5,  7.6,  4.9,  7.3,  6.7,  7.2,
        6.5,  6.4,  6.8,  5.7,  5.8,  6.4,  6.5,  7.7,  7.7,  6. ,  6.9,
        5.6,  7.7,  6.3,  6.7,  7.2,  6.2,  6.1,  6.4,  7.2,  7.4,  7.9,
        6.4,  6.3,  6.1,  7.7,  6.3,  6.4,  6. ,  6.9,  6.7,  6.9,  5.8,
        6.8,  6.7,  6.7,  6.3,  6.5,  6.2,  5.9])

In [36]:
np.multiply(Y**2,X[:,2])

array([ 5.1,  4.9,  4.7,  4.6,  5. ,  5.4,  4.6,  5. ,  4.4,  4.9,  5.4,
        4.8,  4.8,  4.3,  5.8,  5.7,  5.4,  5.1,  5.7,  5.1,  5.4,  5.1,
        4.6,  5.1,  4.8,  5. ,  5. ,  5.2,  5.2,  4.7,  4.8,  5.4,  5.2,
        5.5,  4.9,  5. ,  5.5,  4.9,  4.4,  5.1,  5. ,  4.5,  4.4,  5. ,
        5.1,  4.8,  5.1,  4.6,  5.3,  5. ,  7. ,  6.4,  6.9,  5.5,  6.5,
        5.7,  6.3,  4.9,  6.6,  5.2,  5. ,  5.9,  6. ,  6.1,  5.6,  6.7,
        5.6,  5.8,  6.2,  5.6,  5.9,  6.1,  6.3,  6.1,  6.4,  6.6,  6.8,
        6.7,  6. ,  5.7,  5.5,  5.5,  5.8,  6. ,  5.4,  6. ,  6.7,  6.3,
        5.6,  5.5,  5.5,  6.1,  5.8,  5. ,  5.6,  5.7,  5.7,  6.2,  5.1,
        5.7,  6.3,  5.8,  7.1,  6.3,  6.5,  7.6,  4.9,  7.3,  6.7,  7.2,
        6.5,  6.4,  6.8,  5.7,  5.8,  6.4,  6.5,  7.7,  7.7,  6. ,  6.9,
        5.6,  7.7,  6.3,  6.7,  7.2,  6.2,  6.1,  6.4,  7.2,  7.4,  7.9,
        6.4,  6.3,  6.1,  7.7,  6.3,  6.4,  6. ,  6.9,  6.7,  6.9,  5.8,
        6.8,  6.7,  6.7,  6.3,  6.5,  6.2,  5.9])

In [None]:
def H_tilde():

In [29]:
H_tilde = np.empty((len(beta),len(beta),))
H_tilde[:] = np.nan

In [30]:
H_tilde

array([[ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan],
       [ nan,  nan,  nan,  nan]])

In [18]:
i_upper = np.triu_indices(5, 1)
i_lower = np.tril_indices(5, -1)
matrix[i_upper] = matrix[i_lower]

In [9]:
# Suppose the following H_tilde matrix:
H_tilde = np.array([[2,3,0,0],[3,4,0,0],[0,0,1,4],[0,0,4,17]])
H_tilde

array([[ 2,  3,  0,  0],
       [ 3,  4,  0,  0],
       [ 0,  0,  1,  4],
       [ 0,  0,  4, 17]])

Therefore, one might compute D using:

In [10]:
def D(regularization):
    return gradient_loss(beta).dot(delta_B) + gamma*delta_B.dot(H_tilde).dot(delta_B) + regularization*(np.linalg.norm(beta-delta_B, ord=1)-np.linalg.norm(beta, ord=1))

Now, here is out line search function:

In [32]:
from scipy import optimize

def line_search(regularization, delta_B, sufficient_decrease, delta, b, sigma, gamma):
    if functional(beta,regularization) - functional(beta+delta_B,regularization) > sufficient_decrease:
        return 1
    else:
        result = optimize.minimize_scalar(lambda x: functional(beta+x*delta_B,regularization), bounds=(delta,1), method='Bounded')
        alpha_in = result.x
        alpha = alpha_in
        count = 0
        while functional(beta+alpha*delta_B,regularization) > functional(beta,regularization) + alpha*sigma*D(regularization):
            count += 1
            alpha *= b**count
    return alpha

In [33]:
# Set
regularization = 3
delta_B = np.array([.9,1,3,4])
sufficient_decrease = 1000
delta = 0.2
b = .5
sigma = .01
gamma = 0

In [34]:
line_search(regularization, delta_B, sufficient_decrease, delta, b, sigma, gamma)

5.5512366814618351e-18