# Linear Programming to solve for best weights

To use our linear programming implementation, use:

In [2]:
from sparsecca.sparsecca._multicca_pmd import lp_pmd

ImportError: cannot import name 'multicca_LA' from 'sparsecca._multicca_pmd' (/workspaces/sparsecca/tests/sparsecca/sparsecca/_multicca_pmd.py)

In [None]:
# Example call: `datasets`, `penalties`, `K`, `standardize` and `mimic_R` are the
# inputs and parameters defined in the cells further down in this notebook.
weights = lp_pmd(datasets, penalties, K, standardize, mimic_R)

### This notebook walks through the linear programming approach step by step for K=1.

In [2]:
import pandas as pd
import numpy as np
import pyomo.environ as pyo
from scipy.linalg import svd
from collections import defaultdict


In [3]:
# Example input: the two CSV tables shipped with the test data, read in order.
csv_paths = ("../tests/data/multicca1.csv", "../tests/data/multicca2.csv")
mcca1, mcca2 = (pd.read_csv(path, sep=",") for path in csv_paths)

In [39]:
# Display the first input table for a visual sanity check.
mcca1

Unnamed: 0.1,Unnamed: 0,PC_1,PC_2,PC_3,PC_4,PC_5
0,AAACCGTGCTTCCG,-0.334611,-0.973064,-0.075584,-0.15361,-0.851994
1,AAACGCTGTTTCTG,5.558206,-3.939912,-6.719958,5.287234,-0.592665
2,AAAGAGACGCGAGA,-1.413016,1.226474,2.840675,0.558084,4.779749
3,AAAGCAGATATCGG,-6.217906,-0.855824,4.287036,-3.52227,2.55883
4,AAAGTTTGTAGCGT,5.138953,1.219816,-0.190048,-0.195437,1.369196
5,AAATCAACCCTATT,-8.643385,-1.846813,-0.738421,5.892154,-7.573796
6,AAATCATGACCACA,-9.683405,-6.973077,-6.825142,5.18311,0.245244
7,AAATGTTGAACGAA,1.918918,1.183611,2.569566,-2.589635,1.729954
8,AAATGTTGTGGCAT,-2.764088,-0.959742,1.838465,-2.350117,2.576929
9,AAATTCGAATCACG,6.243531,1.419469,1.213018,1.077085,-1.268777


In [7]:
# Keep only the values of each table, skipping its first column.
# NOTE(review): the two slices stop at different columns (7 vs 6) - confirm this is intended.
datasets = [
    frame.iloc[:, 1:stop].values
    for frame, stop in ((mcca1, 7), (mcca2, 6))
]

## Parameters

In [4]:
# Run configuration
K = 1                # kept at 1 throughout this notebook
penalties = [1, 1]   # must have the same length as `datasets` (one penalty per dataset)
mimic_R = True       # picks scale=True (vs. scale=False) in the scaling step below
standardize = True   # turns the scaling step below on or off

## Prepare data

In [5]:
from sparsecca._utils_pmd import scale

In [8]:
# Preprocess the data: validate, optionally standardize, convert to nested lists.
# Work on a shallow copy of the list so that replacing its elements below does
# not touch the list object created in the loading cell.
datasets = datasets.copy()

# At least 2 features are needed; the first row of each dataset is used to count them.
for data in datasets:
    if len(data[0]) < 2:
        raise ValueError('Need at least 2 features in each dataset')

for idx in range(len(datasets)):
    if standardize:
        # Always center; scale=True is requested only when mimic_R is set.
        datasets[idx] = scale(datasets[idx], center=True, scale=bool(mimic_R))
    # Convert to plain nested lists whether or not standardization ran, so the
    # model-building cells always receive the same container type (previously
    # the conversion was skipped when `standardize` was False, leaving numpy arrays).
    datasets[idx] = np.asarray(datasets[idx]).tolist()


## Create Model

In [9]:
# Empty concrete Pyomo model; the sets, parameters, variables, objective and
# constraints are attached to it in the cells below.
model = pyo.ConcreteModel()

## Sets

In [10]:
# Index sets of the model, all derived from the preprocessed `datasets` list.
# Sample and feature counts are taken from the first dataset only.
first_dataset = datasets[0]
model.Idx = pyo.Set(initialize=range(len(datasets)))            # one index per dataset
model.samples = pyo.Set(initialize=range(len(first_dataset)))   # one index per row
model.PC = pyo.Set(initialize=range(len(first_dataset[0])))     # one index per column
# The data itself is stored as a set as well; the objective rule iterates over it.
model.X = pyo.Set(initialize=datasets)

## Parameters

In [11]:
# Penalty parameter c[i]: indexed by dataset (model.Idx) and filled from `penalties`.
# It is used as the right-hand side of the lasso constraint defined further down.
model.c = pyo.Param(model.Idx, initialize=penalties)

## Variables

In [12]:
# One weight variable per (dataset, feature) pair, started at 0.5.
# The (0, 1) bounds keep every weight non-negative; with non-negative weights the
# plain sum used in the lasso constraint equals the sum of absolute values.
# NOTE(review): negative weights are excluded by these bounds - confirm this is intended.
model.w_i_f = pyo.Var(model.Idx, model.PC, bounds=(0, 1), initialize=0.5)

## Objective

$$
\max_{u,v} \; u^T X^T Y v
$$

In [13]:
def ObjRule(model):
    """Objective function (4.3 in Witten 2009).

    Sums w_i X_i^T X_j w_j^T over every pair of datasets (i, j) with i < j,
    where w_i is the row of weight variables of dataset i and X_i is the
    i-th member of model.X reshaped to (samples, features).
    """
    n_features = len(model.PC.data())
    n_samples = len(model.samples.data())
    # TODO: array from w_i_k (for all pcs)

    def weight_row(dataset_idx):
        """Weight variables of one dataset as a (1, n_features) row."""
        return np.asarray(
            [model.w_i_f[dataset_idx, f] for f in model.PC.data()]
        )[np.newaxis]

    def data_matrix(values):
        """One member of model.X reshaped to (n_samples, n_features)."""
        return np.asarray(values).reshape(n_samples, n_features)

    members = list(model.X)
    return sum(
        (
            weight_row(i)
            @ data_matrix(members[i]).T
            @ data_matrix(members[j])
            @ weight_row(j).T
        )[0, 0]
        for i in range(len(members))
        for j in range(i + 1, len(members))
    )
        

In [14]:
# Objective
# Register ObjRule as the model objective; the solver is asked to maximize it.
model.Obj = pyo.Objective(rule=ObjRule, sense=pyo.maximize)

## Constraints

\begin{align}
\sum_{i=1}^n |u_{i}| \leq c_1\\
\end{align}




In [15]:
# Lasso constraint (2.3 in Witten 2009): for every dataset, the weights must sum
# to at most that dataset's penalty c[i].
# P1(u) = sum_{i=1}^n |u_{i}| <= c_1
model.constraint_lasso = pyo.ConstraintList()
for dataset_idx in model.Idx:
    weight_total = sum(model.w_i_f[dataset_idx, f] for f in model.PC.data())
    model.constraint_lasso.add(weight_total <= model.c[dataset_idx])

\begin{align}
\|v\|_2^2 \leq 1
\end{align}

In [16]:
# L2-norm constraint (2.3 in Witten 2009): for every dataset, the squared weights
# must sum to at most 1.
# ||v||_2^2 <= 1
model.constraint_norm2 = pyo.ConstraintList()
for dataset_idx in model.Idx:
    weights = [model.w_i_f[dataset_idx, f] for f in model.PC.data()]
    squared_norm = sum(weight * weight for weight in weights)
    model.constraint_norm2.add(squared_norm <= 1)


## Solve with ipopt

In [17]:
# Solve the model with the Ipopt solver.
nonLinearOpt =pyo.SolverFactory('ipopt')
# The object returned by create_instance() is what gets handed to the solver and
# what the weights are read from in the cells below.
instance_non_linear = model.create_instance()
res = nonLinearOpt.solve(instance_non_linear)
# NOTE(review): the results are loaded into `model`, while the later cells read the
# solution from `instance_non_linear` - confirm this call is still needed.
# NOTE(review): the solver status in `res` is not checked before the values are used.
model.solutions.load_from(res)

In [18]:
# Print the state of the solved instance, including the value of every weight variable.
instance_non_linear.display()

Model unknown

  Variables:
    w_i_f : Size=10, Index=w_i_f_index
        Key    : Lower : Value                  : Upper : Fixed : Stale : Domain
        (0, 0) :     0 : -9.729863030215721e-09 :     1 : False : False :  Reals
        (0, 1) :     0 :     1.0000000010348558 :     1 : False : False :  Reals
        (0, 2) :     0 : 3.7802751064230926e-08 :     1 : False : False :  Reals
        (0, 3) :     0 : -9.762962402631453e-09 :     1 : False : False :  Reals
        (0, 4) :     0 : -9.725500983668512e-09 :     1 : False : False :  Reals
        (1, 0) :     0 : -9.349768803722753e-09 :     1 : False : False :  Reals
        (1, 1) :     0 :     1.0000000044297266 :     1 : False : False :  Reals
        (1, 2) :     0 : 3.3399276598717075e-08 :     1 : False : False :  Reals
        (1, 3) :     0 : -9.564260026516956e-09 :     1 : False : False :  Reals
        (1, 4) :     0 : -9.830330306028888e-09 :     1 : False : False :  Reals

  Objectives:
    Obj : Size=1, Index=Non

In [21]:
# Gather the solved weight values per dataset, in feature order.
w = defaultdict(list)
index_pairs = ((i, f) for i in model.Idx for f in model.PC.data())
for dataset_idx, feature_idx in index_pairs:
    w[dataset_idx].append(instance_non_linear.w_i_f[dataset_idx, feature_idx].value)

# One column vector of shape (n_features, 1) per dataset; the single column
# reflects K being 1 in this example.
n_features = len(model.PC.data())
weigths_k1 = [np.asarray(w[pos]).reshape(n_features, 1) for pos in range(len(w))]


## Results

In [22]:
# Display the extracted weights: one (n_features, 1) array per dataset.
weigths_k1

[array([[-9.72986303e-09],
        [ 1.00000000e+00],
        [ 3.78027511e-08],
        [-9.76296240e-09],
        [-9.72550098e-09]]),
 array([[-9.34976880e-09],
        [ 1.00000000e+00],
        [ 3.33992766e-08],
        [-9.56426003e-09],
        [-9.83033031e-09]])]