_Last Updated: 06/21/2021_

### MNIST General Formulation

__Parameters:__

$ x_{n,d}: \textrm{binary vector inputs of size N} \times \textrm{D, where N is the number of data points and D is the number of dimensions/features} $ 

$ y_{n,j}: \textrm{binary vector labeled outputs of size N} \times \textrm{J, where N is the number of data points and J is the dimension of the output (for MNIST, J = 4)} $

__Decision Variables:__

$ \alpha_{d,k,0}: \textrm{Weight for feature d in unit k in the zero-th hidden layer,} \; \forall\;d\in D,\;k \in K $

$ \alpha_{k\prime,k,l}: \textrm{Weight from the } k\prime^{th} \textrm{ unit in layer l-1 to unit k in layer l,} \; \forall\;k\prime,\;k \in K,\;l \in \{1,2,3,...,L-1\} $

$ \alpha_{k\prime,j,L}: \textrm{Weight from the } k\prime^{th} \textrm{ unit in hidden layer L-1 to unit j in output layer L,} \; \forall\;\;k\prime \in K,\; j \in J $

$ \beta_{k,l}: \textrm{Bias for unit k in layer l,} \; \forall\;k \in K,\;l\in \{0,1,...,L-1\} $

$ \beta_{j,L}: \textrm{Bias for unit j in the final layer, }\; \forall \; j \in J $

$ h_{n,k,l}: \textrm{Binary output of unit k in layer l,} \; \forall\;n \in N, \;k \in K,\;l \in \{0,1,...,L-1\} $

$ g_{n,k\prime,k,l}: \textrm{Artificial Variable for decomposition,} \; \forall\;n \in N,\; k, k\prime \in K,\; l \in \{1,...,L-1\} $

$ g_{n,k\prime,j,L}: \textrm{Artificial Variable for decomposition of the output layer,} \; \forall\;n \in N,\; k\prime \in K, \; j \in J $

$ z_{n,k\prime,k,l}: \textrm{Auxiliary variable that represents } \alpha_{k\prime,k,l}g_{n,k\prime,k,l}, \; \forall\;n \in N, \;k\prime,\;k \in K,\;l \in \{1,2,...,L-1\} $

$ z_{n,k\prime,j,L}: \textrm{Auxiliary variable that represents } \alpha_{k\prime,j,L}g_{n,k\prime,j,L}, \; \forall\;n \in N, \;k\prime \in K,\; j \in J $

$ \hat{y}_{n,j}: \textrm{Output of final layer,} \; \forall\;n \in N,\; j \in J $

$ \ell\prime_{n,j}: \textrm{Absolute difference of paired-elements in the output vector}, \; \forall\;n \in N,\; j \in J $

$ \ell_{n}: \textrm{Misclassification of data point n,} \; \forall\;n \in N $

__Objective:__

$\displaystyle \min_{\alpha,\beta,h,g,z,\hat{y},\ell\prime,\ell} \; \displaystyle  \sum_{n=0}^{N} \ell_{n} $

__Constraints:__

$ \textrm{subject to} \quad \displaystyle \sum_{k\prime=0}^{K} (z_{n,k\prime,j,L}) + \beta_{j,L} \le (M+\epsilon)\hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{k\prime=0}^{K} (z_{n,k\prime,j,L}) + \beta_{j,L} \ge \epsilon + (m-\epsilon)(1-\hat{y}_{n,j}), \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \ell\prime_{n,j} \ge y_{n,j} - \hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \ell\prime_{n,j} \ge -y_{n,j} + \hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{j=0}^{J} \ell\prime_{n,j} \le 1 - \epsilon + (M + \epsilon)\ell_{n}, \; \forall \; n $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{d=0}^{D} (\alpha_{d,k,0}x_{n,d}) + \beta_{k,0} \le (M+\epsilon)h_{n,k,0}, \; \forall \;n,k $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{d=0}^{D} (\alpha_{d,k,0}x_{n,d}) + \beta_{k,0} \ge \epsilon + (m-\epsilon)(1-h_{n,k,0}), \; \forall \; n,k $

$ \quad\quad\quad\quad\ \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,l}) + \beta_{k,l} \le (M+\epsilon)h_{n,k,l}, \; \forall \;n,k,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\ \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,l}) + \beta_{k,l}  \ge \epsilon + (m-\epsilon)(1-h_{n,k,l}), \; \forall \; n,k,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \le \alpha_{k\prime,k,l} + M(1-g_{n,k\prime,k,l}), \; \forall\;n,k\prime,k,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \ge \alpha_{k\prime,k,l} +  m(1-g_{n,k\prime,k,l}), \; \forall\;n,k,k\prime,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \le Mg_{n,k\prime,k,l}, \; \forall\;n,k,k\prime,l\in\;\{1,2,...,L-1\}$

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \ge mg_{n,k\prime,k,l}, \; \forall\;n,k,k\prime,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; g_{n,k\prime,k,l} = h_{n,k\prime,(l-1)}, \; \forall \; n,k\prime,k,l \in \{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \le \alpha_{k\prime,j,L} + M(1-g_{n,k\prime,j,L}), \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \ge \alpha_{k\prime,j,L} + m(1-g_{n,k\prime,j,L}), \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \le Mg_{n,k\prime,j,L}, \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \ge mg_{n,k\prime,j,L}, \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; g_{n,k\prime,j,L} = h_{n,k\prime,(L-1)}, \; \forall \; n,k\prime,j $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \alpha_{d,k,0},\;\alpha_{k\prime,k,l},\;\alpha_{k\prime,j,L} \le Upper\;Bound, \; \forall \; d,k,k\prime,j,l\in\;\{1,2,...,L-1\} $ 

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \;z_{n,k\prime,k,l},\;z_{n,k\prime,j,L} \le Upper\;Bound, \; \forall \; n,k,k\prime,j,l\in\;\{1,2,...,L-1\} $ 

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \beta_{k,l},\; \beta_{j,L} \le Upper\;Bound, \; \forall \; k,j,l\in\;\{0,1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; 0 \le \ell\prime_{n,j} \le 1, \; \forall \; n \in N, \; j \in J $

$ \quad\quad\quad\quad\;\; g_{n,k\prime,k,l},g_{n,k\prime,j,L} \in \{0,1\}, \; \forall \; n,k\prime,k,j,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; \hat{y}_{n,j},h_{n,k,l}, \ell_{n} \in \{0,1\}, \; \forall \; n,k,j,l\in\;\{0,1,...,L-1\} $


### MNIST Lagrangian Relaxation and Dual

$ \zeta_{LR}(\lambda) = \displaystyle \min_{\alpha,\beta,h,g,z,\hat{y},\ell\prime,\ell} \;
\sum_{n=0}^{N} ( \ell_{n} + \sum_{k\prime=0}^{K} ( \sum_{k=0}^{K} ( \sum_{l=1}^{L-1} (
    \lambda_{n,k\prime,k,l}(g_{n,k\prime,k,l} - h_{n,k\prime,(l-1)}))) + \sum_{j=0}^{J}( \lambda_{n,k\prime,j,L}(g_{n,k\prime,j,L} - h_{n,k\prime,(L-1)})))) $

$ \textrm{subject to} \quad \displaystyle \sum_{k\prime=0}^{K} (z_{n,k\prime,j,L}) + \beta_{j,L} \le (M+\epsilon)\hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{k\prime=0}^{K} (z_{n,k\prime,j,L}) + \beta_{j,L} \ge \epsilon + (m-\epsilon)(1-\hat{y}_{n,j}), \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \ell\prime_{n,j} \ge y_{n,j} - \hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \ell\prime_{n,j} \ge -y_{n,j} + \hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{j=0}^{J} \ell\prime_{n,j} \le 1 - \epsilon + (M + \epsilon)\ell_{n}, \; \forall \; n $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{d=0}^{D} (\alpha_{d,k,0}x_{n,d}) + \beta_{k,0} \le (M+\epsilon)h_{n,k,0}, \; \forall \;n,k $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{d=0}^{D} (\alpha_{d,k,0}x_{n,d}) + \beta_{k,0} \ge \epsilon + (m-\epsilon)(1-h_{n,k,0}), \; \forall \; n,k $

$ \quad\quad\quad\quad\ \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,l}) + \beta_{k,l} \le (M+\epsilon)h_{n,k,l}, \; \forall \;n,k,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\ \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,l}) + \beta_{k,l}  \ge \epsilon + (m-\epsilon)(1-h_{n,k,l}), \; \forall \; n,k,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \le \alpha_{k\prime,k,l} + M(1-g_{n,k\prime,k,l}), \; \forall\;n,k\prime,k,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \ge \alpha_{k\prime,k,l} +  m(1-g_{n,k\prime,k,l}), \; \forall\;n,k\prime,k,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \le Mg_{n,k\prime,k,l}, \; \forall\;n,k,k\prime,l\in\;\{1,2,...,L-1\}$

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \ge mg_{n,k\prime,k,l}, \; \forall\;n,k,k\prime,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \le \alpha_{k\prime,j,L} + M(1-g_{n,k\prime,j,L}), \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \ge \alpha_{k\prime,j,L} + m(1-g_{n,k\prime,j,L}), \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \le Mg_{n,k\prime,j,L}, \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \ge mg_{n,k\prime,j,L}, \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \alpha_{d,k,0},\;\alpha_{k\prime,k,l},\;\alpha_{k\prime,j,L} \le Upper\;Bound, \; \forall \; d,k,k\prime,j,l\in\;\{1,2,...,L-1\} $ 

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \;z_{n,k\prime,k,l},\;z_{n,k\prime,j,L} \le Upper\;Bound, \; \forall \; n,k,k\prime,j,l\in\;\{1,2,...,L-1\} $ 

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \beta_{k,l},\; \beta_{j,L} \le Upper\;Bound, \; \forall \; k,j,l\in\;\{0,1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; 0 \le \ell\prime_{n,j} \le 1, \; \forall \; n \in N, \; j \in J $

$ \quad\quad\quad\quad\;\; g_{n,k\prime,k,l},g_{n,k\prime,j,L} \in \{0,1\}, \; \forall \; n,k\prime,k,j,l\in\;\{1,2,...,L-1\} $

$ \quad\quad\quad\quad\;\; \hat{y}_{n,j},h_{n,k,l}, \ell_{n} \in \{0,1\}, \; \forall \; n,k,j,l\in\;\{0,1,...,L-1\} $


### Subproblems

$ \zeta_{0}(\lambda) = \displaystyle \min_{\alpha,\beta,h} \; 
\sum_{n=0}^{N} (\sum_{k\prime=0}^{K} (\sum_{k=0}^{K} (-\lambda_{n,k\prime,k,1}h_{n,k\prime,0}))) $
                  
$ \textrm{subject to} \quad \displaystyle \sum_{d=0}^{D} (\alpha_{d,k,0}x_{n,d}) + \beta_{k,0} \le (M+\epsilon)h_{n,k,0}, \; \forall \;n,k $ 

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{d=0}^{D} (\alpha_{d,k,0}x_{n,d}) + \beta_{k,0} \ge \epsilon + (m-\epsilon)(1-h_{n,k,0}), \; \forall \; n,k $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \alpha_{d,k,0} \le Upper\;Bound, \; \forall \; d,k $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \beta_{k,0} \le Upper\;Bound, \; \forall \; k $

$ \quad\quad\quad\quad\;\; h_{n,k,0}  \in \{0,1\}, \; \forall \; n,k $


$ \zeta_{l}(\lambda) = \displaystyle \min_{\alpha,\beta,z,h,g} \; 
\sum_{n=0}^{N} (\sum_{k\prime=0}^{K} (\sum_{k=0}^{K} ( \lambda_{n,k\prime,k,l}g_{n,k\prime,k,l} - \lambda_{n,k\prime,k,l+1}h_{n,k\prime,l}))) $
                  
$ \textrm{subject to} \quad \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,l}) + \beta_{k,l} \le (M+\epsilon)h_{n,k,l}, \; \forall \;n,k $

$ \quad\quad\quad\quad\ \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,l}) + \beta_{k,l}  \ge \epsilon + (m-\epsilon)(1-h_{n,k,l}), \; \forall \; n,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \le \alpha_{k\prime,k,l} + M(1-g_{n,k\prime,k,l}), \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \ge \alpha_{k\prime,k,l} +  m(1-g_{n,k\prime,k,l}), \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \le Mg_{n,k\prime,k,l}, \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,l} \ge mg_{n,k\prime,k,l}, \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \alpha_{k\prime,k,l} \le Upper\;Bound, \; \forall \; k\prime, k $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le z_{n,k\prime,k,l} \le Upper\;Bound, \; \forall \; n,k,k\prime $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \beta_{k,l} \le Upper\;Bound, \; \forall \; k $

$ \quad\quad\quad\quad\;\; h_{n,k,l},g_{n,k\prime,k,l} \in \{0,1\}, \; \forall \; n,k,k\prime $


$ \zeta_{L-1}(\lambda) = \displaystyle \min_{\alpha,\beta,z,h,g} \; 
\sum_{n=0}^{N} (\sum_{k\prime=0}^{K} (\sum_{k=0}^{K} ( \lambda_{n,k\prime,k,L-1}g_{n,k\prime,k,L-1}) - \sum_{j=0}^{J}( \lambda_{n,k\prime,j,L}h_{n,k\prime,L-1}))) $
                  
$ \textrm{subject to} \quad \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,L-1}) + \beta_{k,L-1} \le (M+\epsilon)h_{n,k,L-1}, \; \forall \;n,k $

$ \quad\quad\quad\quad\ \displaystyle  \sum_{k\prime=0}^{K} (z_{n,k\prime,k,L-1}) + \beta_{k,L-1}  \ge \epsilon + (m-\epsilon)(1-h_{n,k,L-1}), \; \forall \; n,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,L-1} \le \alpha_{k\prime,k,L-1} + M(1-g_{n,k\prime,k,L-1}), \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,L-1} \ge \alpha_{k\prime,k,L-1} +  m(1-g_{n,k\prime,k,L-1}), \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,L-1} \le Mg_{n,k\prime,k,L-1}, \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,k,L-1} \ge mg_{n,k\prime,k,L-1}, \; \forall\;n,k\prime,k $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \alpha_{k\prime,k,L-1} \le Upper\;Bound, \; \forall \; k\prime, k $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le z_{n,k\prime,k,L-1} \le Upper\;Bound, \; \forall \; n,k,k\prime $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \beta_{k,L-1} \le Upper\;Bound, \; \forall \; k $

$ \quad\quad\quad\quad\;\; h_{n,k,L-1}, g_{n,k\prime,k,L-1} \in \{0,1\}, \; \forall \; n,k,k\prime $


$ \zeta_{L}(\lambda) = \displaystyle \min_{\alpha,\beta,z,g,\hat{y},\ell\prime,\ell} \; 
\sum_{n=0}^{N} (\ell_{n} + \sum_{k\prime=0}^{K} ( \sum_{j=0}^{J} (\lambda_{n,k\prime,j,L}g_{n,k\prime,j,L}))) $
                  
$ \textrm{subject to} \quad \displaystyle \sum_{k\prime=0}^{K} (z_{n,k\prime,j,L}) + \beta_{j,L} \le (M+\epsilon)\hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{k\prime=0}^{K} (z_{n,k\prime,j,L}) + \beta_{j,L} \ge \epsilon + (m-\epsilon)(1-\hat{y}_{n,j}), \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \ell\prime_{n,j} \ge y_{n,j} - \hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \ell\prime_{n,j} \ge -y_{n,j} + \hat{y}_{n,j}, \; \forall \; n,j $

$ \quad\quad\quad\quad\;\; \displaystyle \sum_{j=0}^{J} \ell\prime_{n,j} \le 1 - \epsilon + (M + \epsilon)\ell_{n}, \; \forall \; n $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \le \alpha_{k\prime,j,L} + M(1-g_{n,k\prime,j,L}), \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \ge \alpha_{k\prime,j,L} + m(1-g_{n,k\prime,j,L}), \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \le Mg_{n,k\prime,j,L}, \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; z_{n,k\prime,j,L} \ge mg_{n,k\prime,j,L}, \; \forall\;n,k\prime,j $

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \alpha_{k\prime,j,L} \le Upper\;Bound, \; \forall \; k\prime,j $ 

$ \quad\quad\quad\quad\;\; Lower\;Bound \le z_{n,k\prime,j,L} \le Upper\;Bound, \; \forall \; n,k\prime,j $ 

$ \quad\quad\quad\quad\;\; Lower\;Bound \le \beta_{j,L} \le Upper\;Bound,\; \forall \; j \in J $

$ \quad\quad\quad\quad\;\; 0 \le \ell\prime_{n,j} \le 1, \; \forall \; n \in N, \; j \in J $

$ \quad\quad\quad\quad\;\; \hat{y}_{n,j},g_{n,k\prime,j,L},\ell_{n} \in \{0,1\}, \; \forall \; n,k\prime,j $

$ z_{\mathcal{L}} = \displaystyle \max_{\lambda \in \mathbb{R}^{m_{1}}} 
\left[ \zeta_{0}(\lambda) + \sum_{l=1}^{L-2}\zeta_{l}(\lambda) + \zeta_{L-1}(\lambda) + \zeta_{L}(\lambda) \right] $


***