# Linear Programming to solve for best weights

## To use our linear programing implementation, use:

In [2]:
from sparsecca._multicca_pmd import lp_pmd

In [3]:
import pandas as pd
import numpy as np
import pyomo.environ as pyo
from scipy.linalg import svd
from collections import defaultdict


In [4]:
# example input
mcca1 = pd.read_csv("../data/multicca1.csv", sep=",")
mcca2 = pd.read_csv("../data/multicca2.csv", sep=",")

In [5]:
mcca1

Unnamed: 0.1,Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5
0,AAACCGTGCTTCCG,-0.334611,-0.973064,-0.075584,-0.15361,-0.851994
1,AAACGCTGTTTCTG,5.558206,-3.939912,-6.719958,5.287234,-0.592665
2,AAAGAGACGCGAGA,-1.413016,1.226474,2.840675,0.558084,4.779749
3,AAAGCAGATATCGG,-6.217906,-0.855824,4.287036,-3.52227,2.55883
4,AAAGTTTGTAGCGT,5.138953,1.219816,-0.190048,-0.195437,1.369196
5,AAATCAACCCTATT,-8.643385,-1.846813,-0.738421,5.892154,-7.573796
6,AAATCATGACCACA,-9.683405,-6.973077,-6.825142,5.18311,0.245244
7,AAATGTTGAACGAA,1.918918,1.183611,2.569566,-2.589635,1.729954
8,AAATGTTGTGGCAT,-2.764088,-0.959742,1.838465,-2.350117,2.576929
9,AAATTCGAATCACG,6.243531,1.419469,1.213018,1.077085,-1.268777


In [6]:
# get values only from datsets
datasets = [mcca1.iloc[:,1:7].values, mcca2.iloc[:,1:6].values]

In [7]:
weights,_ = lp_pmd(datasets=datasets, penalties=[1.5, 1.5], K=3, standardize=True, mimic_R=True)

Model unknown

  Variables:
    w_i_f : Size=10, Index=w_i_f_index
        Key    : Lower : Value                : Upper : Fixed : Stale : Domain
        (0, 0) :  None :  0.07040724965597965 :  None : False : False :  Reals
        (0, 1) :  None :  -0.6977999020370866 :  None : False : False :  Reals
        (0, 2) :  None :   -0.592616752530863 :  None : False : False :  Reals
        (0, 3) :  None :  0.39099294023035963 :  None : False : False :  Reals
        (0, 4) :  None :   0.0636241383741811 :  None : False : False :  Reals
        (1, 0) :  None : 0.003951963934108804 :  None : False : False :  Reals
        (1, 1) :  None :  -0.5278884744303985 :  None : False : False :  Reals
        (1, 2) :  None :  -0.3084078577776999 :  None : False : False :  Reals
        (1, 3) :  None :  0.15567065669290509 :  None : False : False :  Reals
        (1, 4) :  None :   0.7758668637153872 :  None : False : False :  Reals

  Objectives:
    Obj : Size=1, Index=None, Active=True
       

In [8]:
weights

array([[[ 0.07040725, -0.02355747, -0.31516197],
        [-0.6977999 , -0.73198297, -0.26364387],
        [-0.59261675, -0.56641844, -0.35330535],
        [ 0.39099294,  0.37425835, -0.71362539],
        [ 0.06362414,  0.05241022, -0.44393579]],

       [[ 0.00395196,  0.23770445, -0.20905306],
        [-0.52788847, -0.47329805, -0.74249448],
        [-0.30840786, -0.10075966, -0.3441507 ],
        [ 0.15567066,  0.2685864 , -0.33016619],
        [ 0.77586686,  0.79824458, -0.42136606]]])

# This Notebook is examplatory for the linear programing approach with K=1.

## Parameters

In [9]:
# Parameters 
standardize = True
mimic_R = True
# penalties need to have the same length as datasets
penalties = [1, 1]
K = 1

## Prepare data

In [10]:
from sparsecca._utils_pmd import scale

In [11]:
# preprocess data
datasets = datasets.copy()
# at least 2 features are needed
for data in datasets:
    if len(data[0]) < 2:
        raise Exception('Need at least 2 features in each dataset')

# standardize if set TRUE
if standardize:
    for idx in range(len(datasets)):
        if mimic_R:
            datasets[idx] = scale(datasets[idx], center=True, scale=True)
        else:
            datasets[idx] = scale(datasets[idx], center=True, scale=False)
        datasets[idx] = datasets[idx].tolist()


## Create Model

In [12]:
model = pyo.ConcreteModel()

## Sets

In [13]:
model.N = pyo.Set(initialize=range(len(datasets)))
model.S = pyo.Set(initialize=range(len(datasets[0])))
model.F = pyo.Set(initialize=range(len(datasets[0][0])))
model.X = pyo.Set(initialize=datasets) 

## Parameters

In [14]:
# params: ci i in [1:K]
model.c = pyo.Param(model.N, initialize=penalties)

## Variables

In [15]:
# get initial weigths with svd
ws_init = []
for idx in range(len(datasets)):
    ws_init.append(svd(datasets[idx])[2][0:K].T)


In [16]:
ws_init

[array([[-0.09944126],
        [-0.51134763],
        [-0.54861213],
        [ 0.55350196],
        [-0.34827481]]),
 array([[ 0.54545036],
        [-0.55050983],
        [ 0.50197444],
        [ 0.01276177],
        [ 0.38377289]])]

In [17]:
model.w_i_f = pyo.Var(model.N, model.F,initialize=0.5)
for n in range(len(ws_init)):
    for f in range(len(ws_init[0])):
        model.w_i_f[n,f].value = ws_init[n][f][0]

## Objective

\begin{array}{lll}
    \max_u,_v u^TX^TYv\\
\end{array}

In [18]:
def ObjRule(model):
    """Objective Function (4.3 in witten 2009)"""
    features = len(model.F.data())
    samples = len(model.S.data())
    return sum(
                (np.asarray([model.w_i_f[idx, f] for f in model.F.data()])[np.newaxis]
               @ np.asarray(xi).reshape(samples,features).T 
               @ np.asarray(xj).reshape(samples,features)
               @ np.asarray([model.w_i_f[jdx, f] for f in model.F.data()])[np.newaxis].T)[0,0] 
               for idx, xi in enumerate(model.X) for jdx, xj in enumerate(model.X) if idx<jdx )
        

In [19]:
# Objective
model.Obj = pyo.Objective(rule=ObjRule, sense=pyo.maximize)

## Constraints

\begin{align}
\sum_{i=1}^n |u_{i}| \leq c_1\\
\end{align}




In [20]:
"""Constraint Lasso (2.3 in witten 2009)"""
# P1(u) = sum_{i=1}^n |u_{i}| <= c_1
model.constraint_lasso = pyo.ConstraintList()
for i in model.N:
    model.constraint_lasso.add(sum(model.w_i_f[i,f] for f in model.F.data())<= model.c[i])

\begin{align}
||v||^2_2=1
\end{align}

In [21]:
"""Constraint 2norm (2.3 in witten 2009)"""
# ||v||^2 2 = 1
model.constraint_norm2 = pyo.ConstraintList()
for i in model.N:
    model.constraint_norm2.add(sum(model.w_i_f[i,f] * model.w_i_f[i,f] for f in model.F.data()) <= 1)


## Solve with ipopt

In [22]:
nonLinearOpt =pyo.SolverFactory('ipopt')
instance_non_linear = model.create_instance()
res = nonLinearOpt.solve(instance_non_linear)
model.solutions.load_from(res)

In [23]:
instance_non_linear.display()

Model unknown

  Variables:
    w_i_f : Size=10, Index=w_i_f_index
        Key    : Lower : Value                : Upper : Fixed : Stale : Domain
        (0, 0) :  None :  0.07040724964484998 :  None : False : False :  Reals
        (0, 1) :  None :  -0.6977999020332945 :  None : False : False :  Reals
        (0, 2) :  None :  -0.5926167525490866 :  None : False : False :  Reals
        (0, 3) :  None :  0.39099294021568715 :  None : False : False :  Reals
        (0, 4) :  None :  0.06362413834854513 :  None : False : False :  Reals
        (1, 0) :  None : 0.003951963883473098 :  None : False : False :  Reals
        (1, 1) :  None :  -0.5278884744772177 :  None : False : False :  Reals
        (1, 2) :  None : -0.30840785783375524 :  None : False : False :  Reals
        (1, 3) :  None :  0.15567065664938798 :  None : False : False :  Reals
        (1, 4) :  None :   0.7758668636702594 :  None : False : False :  Reals

  Objectives:
    Obj : Size=1, Index=None, Active=True
       

In [24]:
instance_non_linear.display()

Model unknown

  Variables:
    w_i_f : Size=10, Index=w_i_f_index
        Key    : Lower : Value                : Upper : Fixed : Stale : Domain
        (0, 0) :  None :  0.07040724964484998 :  None : False : False :  Reals
        (0, 1) :  None :  -0.6977999020332945 :  None : False : False :  Reals
        (0, 2) :  None :  -0.5926167525490866 :  None : False : False :  Reals
        (0, 3) :  None :  0.39099294021568715 :  None : False : False :  Reals
        (0, 4) :  None :  0.06362413834854513 :  None : False : False :  Reals
        (1, 0) :  None : 0.003951963883473098 :  None : False : False :  Reals
        (1, 1) :  None :  -0.5278884744772177 :  None : False : False :  Reals
        (1, 2) :  None : -0.30840785783375524 :  None : False : False :  Reals
        (1, 3) :  None :  0.15567065664938798 :  None : False : False :  Reals
        (1, 4) :  None :   0.7758668636702594 :  None : False : False :  Reals

  Objectives:
    Obj : Size=1, Index=None, Active=True
       

In [25]:
w = defaultdict(list)
for i in model.N:
    for f in model.F.data():
            w[i].append(instance_non_linear.w_i_f[i,f].value) 
weigths_k1 = []
for i in range(len(w)):
    weigths_k1.append(np.asarray(w[i]).reshape(len(model.F.data()),1))# 1 cause K is one in this example


## Results

In [26]:
weigths_k1

[array([[ 0.07040725],
        [-0.6977999 ],
        [-0.59261675],
        [ 0.39099294],
        [ 0.06362414]]),
 array([[ 0.00395196],
        [-0.52788847],
        [-0.30840786],
        [ 0.15567066],
        [ 0.77586686]])]