# Estimation of Mixture Proportions SCIPI

## Introduction

This notebook provides a runnable example of SCIPI in Python 3.

We first describe the data set. $L$ is a matrix of size $n\times m$ with $n = 2000$ and $m = 200$.
For convenience, we will call it the "large" data set.

In [1]:
# the data set is from Kim (2018)
# https://arxiv.org/abs/1806.01412
# https://cran.r-project.org/web/packages/mixsqp/index.html

## Example: larger size data

### Load Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

### Load Sourcecode

In [3]:
import sys, os

# Make the project's `src` directory importable.
# The location is resolved against the current working directory (the same base
# the relative "../data/..." path in this notebook relies on) instead of
# sys.argv[1]: under a Jupyter kernel sys.argv[1] is typically a launcher flag
# rather than a file path, and it can be missing altogether (IndexError).
src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(src_dir)

In [4]:
import mixprop

### Set Random Seed

In [5]:
# Seed for NumPy's global random generator.
# The value is pinned to a literal so that a fresh "Restart & Run All" prints and
# uses the same seed every time; deriving it from the current date made the seed
# (and this cell's output) change from one day to the next.
today_num = 20230906  # variable name kept so cells that refer to it keep working
offset = 0  # change to obtain a different, still deterministic, seed
print(f"our seed is {today_num + offset}")
np.random.seed(today_num + offset)

our seed is 20230906


In [None]:
# Re-seed NumPy's global random generator with a literal seed value.
np.random.seed(seed=20230906)

### Set Size

We read the data using a relative path.

In [6]:
# Element type that the numeric arrays in this notebook are cast to.
our_dtype = 'float32'

# Read the CSV and keep every column except the first one.
L = pd.read_csv(
    "../data/manuscript-simdata-n=2000-m=200.csv"
).iloc[:, 1:]

# n = number of rows, m = number of columns of L.
n, m = L.shape
print((n, m))

(2000, 200)


In [7]:
# We resize the scale of V_orig.
# This is not required, but it eases the objective calculation.
# V_orig is our target matrix to be decomposed

### Matrix L

In [8]:
# Convert L into a plain NumPy array with element type `our_dtype`.
# np.array accepts a DataFrame as well as an ndarray, so this cell can be re-run
# safely; `L.values` exists only on the DataFrame and would raise AttributeError
# on a second run, once L has already become an ndarray.
L = np.array(L, dtype=our_dtype)
L[:4, :4]  # preview the top-left 4x4 corner

array([[1.        , 0.9954726 , 0.9952412 , 0.9949981 ],
       [1.        , 0.99575776, 0.9955408 , 0.99531287],
       [1.        , 0.9967981 , 0.99663395, 0.99646145],
       [1.        , 0.99503773, 0.9947843 , 0.994518  ]], dtype=float32)

### Weight w

In [9]:
# Uniform weight vector: n entries, each equal to 1/n, stored as `our_dtype`.
w = (np.ones(n) / n).astype(our_dtype)

### Run Methods

#### MU (Multiplicative Updates)

In [10]:
# Time one run of the MU solver on (L, w).
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; differences of time.time() can be distorted by system
# clock adjustments.
temp_time = time.perf_counter()
res_mu = mixprop.run_subprob_mu(L, w)
time_mu = time.perf_counter() - temp_time  # elapsed seconds
print(f"took {time_mu} seconds")

round 500: obj 0.20479034746264727
round 1000: obj 0.204564600204316
round 1500: obj 0.2044742109840914
round 2000: obj 0.20442259103195382
round 2500: obj 0.20438806590985514
round 3000: obj 0.2043628847609376
round 3500: obj 0.2043434984199647
round 4000: obj 0.20432800647451183
round 4500: obj 0.20431527685911013
round 5000: obj 0.20430458205533414
round 5500: obj 0.20429542780061047
round 6000: obj 0.20428746404325746
round 6500: obj 0.20428043491273273
round 7000: obj 0.20427414879614827
round 7500: obj 0.20426845949801886
round 8000: obj 0.20426325386063887
round 8500: obj 0.2042584433324347
round 9000: obj 0.20425395804806237
round 9500: obj 0.20424974256289186
round 10000: obj 0.2042457527103046
took 22.678959846496582 seconds


In [11]:
# Time one run of the MU-with-normalize solver on (L, w).
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; differences of time.time() can be distorted by system
# clock adjustments.
temp_time = time.perf_counter()
res_mu_with_normalize = mixprop.run_subprob_mu_with_normalize(L, w)
time_mu_with_normalize = time.perf_counter() - temp_time  # elapsed seconds
print(f"took {time_mu_with_normalize} seconds")

round 500: obj 0.20479037793236862
round 1000: obj 0.2045646306028691
round 1500: obj 0.20447424134445685
round 2000: obj 0.20442262137012995
round 2500: obj 0.20438809623322698
round 3000: obj 0.20436291507350357
round 3500: obj 0.20434352872418582
round 4000: obj 0.2043280367720441
round 4500: obj 0.2043153071511427
round 5000: obj 0.20430461234275948
round 5500: obj 0.20429545808412147
round 6000: obj 0.20428749432340412
round 6500: obj 0.20428046518995951
round 7000: obj 0.20427417907081868
round 7500: obj 0.20426848977043133
round 8000: obj 0.20426328413104136
round 8500: obj 0.20425847360103216
round 9000: obj 0.2042539883150254
round 9500: obj 0.20424977282836193
round 10000: obj 0.20424578297439958
took 22.209836959838867 seconds


In [12]:
# run_subprob_mu runs MU without rescaling every round
# run_subprob_mu_with_normalize runs MU with rescaling every round
# their results are visually the same
# however, the one with rescaling is a little slower
# while the one without rescaling is a little less numerically stable
# for this example they are nearly identical

#### SCIPI (Scale Invariant Power Iteration)

In [13]:
# Time one run of the SCIPI solver on (L, w).
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; differences of time.time() can be distorted by system
# clock adjustments.
temp_time = time.perf_counter()
res_scipi = mixprop.run_subprob_scipi(L, w)
time_scipi = time.perf_counter() - temp_time  # elapsed seconds
print(f"took {time_scipi} seconds")

round 500: obj 0.20456443838524307
round 1000: obj 0.2044225552543237
round 1500: obj 0.20436287891757027
round 2000: obj 0.2043280132384721
round 2500: obj 0.20430459549833885
round 3000: obj 0.20428748144359923
round 3500: obj 0.204274168691858
round 4000: obj 0.20426327539307404
round 4500: obj 0.2042539806862937
round 5000: obj 0.20424577611678002
round 5500: obj 0.20423833997779287
round 6000: obj 0.204231467629592
round 6500: obj 0.20422503008628057
round 7000: obj 0.20421894803693275
round 7500: obj 0.20421317483695353
round 8000: obj 0.2042076850042545
round 8500: obj 0.20420246627014466
round 9000: obj 0.20419751403877723
round 9500: obj 0.20419282754874463
round 10000: obj 0.2041884072757886
took 22.190831661224365 seconds


#### PGD (Projected Gradient Descent)

In [14]:
# we have 2 versions of PGD:
#   - fixed step size (learning rate)
#   - Armijo backtracking line search

# Armijo backtracking line search is very expensive in this case:
# it is hard to find a good "sufficient decrease parameter",
# and otherwise it will mostly reject candidate steps
# (see below)
# as discussed in Kim (2018), this problem converges slowly when solved by a first-order approach

In [15]:
# Time one run of PGD with line search on (L, w), using 10000 iterations and an
# initial step size of 0.1.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; differences of time.time() can be distorted by system
# clock adjustments.
temp_time = time.perf_counter()
res_pgd_with_linesearch = mixprop.run_subprob_pgd_with_linesearch(
    L, w, num_iter=10000, init_stepsize=0.1
)
time_pgd_with_linesearch = time.perf_counter() - temp_time  # elapsed seconds
print(f"took {time_pgd_with_linesearch} seconds")

round 500: obj 0.4822700765973368
round 1000: obj 0.48048812239221483
round 1500: obj 0.47871475389526064
round 2000: obj 0.476949898436676
round 2500: obj 0.47519348434967473
round 3000: obj 0.4734454409530697
round 3500: obj 0.47170569853424726
round 4000: obj 0.46997418833253457
round 4500: obj 0.46825084252292803
round 5000: obj 0.46653559420019536
round 5500: obj 0.46482837736332683
round 6000: obj 0.46312912690033337
round 6500: obj 0.461437778573376
round 7000: obj 0.45975426900423744
round 7500: obj 0.45808114899272323
round 8000: obj 0.4564176974201452
round 8500: obj 0.4547618451679148
round 9000: obj 0.4531135327772947
round 9500: obj 0.4514727015698666
round 10000: obj 0.4498392936350206
took 182.5590476989746 seconds


In [16]:
# stepsize 0.01 leads to slow convergence

In [17]:
# Time one run of fixed-step PGD on (L, w) with stepsize 0.01.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; differences of time.time() can be distorted by system
# clock adjustments.
temp_time = time.perf_counter()
res_pgd0 = mixprop.run_subprob_pgd(L, w, stepsize=0.01)
time_pgd0 = time.perf_counter() - temp_time  # elapsed seconds
print(f"took {time_pgd0} seconds")

round 500: obj 0.20470468834986555
round 1000: obj 0.20455065419064833
round 1500: obj 0.20448484947346912
round 2000: obj 0.20444704931654176
round 2500: obj 0.2044230846648181
round 3000: obj 0.20440185458805699
round 3500: obj 0.20438837078456767
round 4000: obj 0.20437852012191657
round 4500: obj 0.20437019684405947
round 5000: obj 0.2043622649173854
round 5500: obj 0.20435504553402317
round 6000: obj 0.20434845881968747
round 6500: obj 0.20434198114811253
round 7000: obj 0.20433628261008469
round 7500: obj 0.20433076047553755
round 8000: obj 0.20432610061254974
round 8500: obj 0.20432195428005484
round 9000: obj 0.20431819550961766
round 9500: obj 0.20431517311410152
round 10000: obj 0.20431221783408965
took 22.58869504928589 seconds


In [18]:
# stepsize 0.012 leads to fast convergence

In [19]:
# Time one run of fixed-step PGD on (L, w) with stepsize 0.012.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; differences of time.time() can be distorted by system
# clock adjustments.
temp_time = time.perf_counter()
res_pgd1 = mixprop.run_subprob_pgd(L, w, stepsize=0.012)
time_pgd1 = time.perf_counter() - temp_time  # elapsed seconds
print(f"took {time_pgd1} seconds")

round 500: obj 0.20462748858999116
round 1000: obj 0.20450261631271488
round 1500: obj 0.20444845235803638
round 2000: obj 0.20441675978871657
round 2500: obj 0.20439491262688586
round 3000: obj 0.20437885083449028
round 3500: obj 0.20436663085630907
round 4000: obj 0.204356764107831
round 4500: obj 0.20434849283083364
round 5000: obj 0.20434146043601628
round 5500: obj 0.20433538737947912
round 6000: obj 0.2043300508097442
round 6500: obj 0.20432527244354642
round 7000: obj 0.20432092167416827
round 7500: obj 0.2043171417716046
round 8000: obj 0.20431366597680506
round 8500: obj 0.20431059380817945
round 9000: obj 0.20430776696038433
round 9500: obj 0.20430513971333403
round 10000: obj 0.20430269717681981
took 22.64771008491516 seconds


In [20]:
# stepsize 0.015 leads to zigzagging

In [21]:
# Time one run of fixed-step PGD on (L, w) with stepsize 0.015.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; differences of time.time() can be distorted by system
# clock adjustments.
temp_time = time.perf_counter()
res_pgd2 = mixprop.run_subprob_pgd(L, w, stepsize=0.015)
time_pgd2 = time.perf_counter() - temp_time  # elapsed seconds
print(f"took {time_pgd2} seconds")

round 500: obj 0.22904123660132858
round 1000: obj 0.22962515270914377
round 1500: obj 0.23063687323483
round 2000: obj 0.22931571877812684
round 2500: obj 0.22972275064474557
round 3000: obj 0.22998382759242314
round 3500: obj 0.23019709003898975
round 4000: obj 0.2303210358773703
round 4500: obj 0.23098582244172117
round 5000: obj 0.2311409029234871
round 5500: obj 0.22997914915781986
round 6000: obj 0.22974607232524108
round 6500: obj 0.2297435971054721
round 7000: obj 0.2298668659741232
round 7500: obj 0.22997012812219675
round 8000: obj 0.2300429865452976
round 8500: obj 0.23013818989517182
round 9000: obj 0.2301898973060279
round 9500: obj 0.23026530016333924
round 10000: obj 0.230312152312721
took 22.542251110076904 seconds
