# Oja's Algorithm

## The following cell imports the .db file when run

In [1]:
!ls ../../Data/K_cluster_analysis.db

../../Data/K_cluster_analysis.db


In [2]:
import pandas as pd
import numpy as np
from sklearn import decomposition
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.cluster import KMeans
import glob
%pylab inline

Populating the interactive namespace from numpy and matplotlib


From [the fast onvergence of incremental PCA](https://arxiv.org/pdf/1501.03796.pdf)

![](figures/OnlinePCAEquations.png)

### Explanation of Oja's update

* $V_n$ - the estimate of the top eigenvector at iteration $n$
* $\gamma_n$ - learning rate.
* $X_n$ The $n$th example
* $X_n X_n^T V_{n-1} = X_n (X_n \cdot V_{n-1})$

In [4]:
%%time
#run this to import
try:
    import dill
except:
    %pip install dill
    import dill
dill.load_session('../../Data/K_cluster_analysis.db')
data=data[:,1:]  # remove prefix '1'
data.shape

CPU times: user 225 ms, sys: 642 ms, total: 867 ms
Wall time: 901 ms


(2664671, 12)

### Estimate mean

## Calculating the regular mean onine

$\mu_n = \frac{\sum_{i=1}^n a_i}{n}$

$\frac{n}{n+1} \mu_n + \frac{a_{n+1}}{n+1}$

Equivalent to setting $\eta=1/(n+1)$

In [5]:
_mean=np.zeros(data.shape[1])
eta=0.001

In [8]:
means=[]
for i in range(data.shape[0]):
    vector=data[i,:]
    _not_nan=~isnan(vector)
    _newmean=copy(_mean)
    eta=1/(i+1)
    _newmean[_not_nan] = (1-eta)*_mean[_not_nan] + eta*vector[_not_nan]
    if i%100000 == 0:
        print(i,norm(_mean-_newmean))
    _mean=_newmean
    means.append(_mean)

0 0.5951372064291786
100000 1.5690481591144585e-06
200000 3.2628223800674106e-07
300000 7.521542888137032e-08
400000 5.9789299840198e-08
500000 2.842112320005564e-08
600000 1.9892053532317562e-08
700000 2.2275906975915026e-08
800000 6.883765363888882e-08
900000 3.81451475294294e-08
1000000 9.455594422086783e-09
1100000 5.4015295425196334e-08
1200000 6.015604081127205e-08
1300000 2.900176538142261e-08
1400000 1.228059522900131e-08
1500000 3.903372062654489e-09
1600000 1.3636588843548614e-08
1700000 3.156327280264699e-08
1800000 3.316731985293332e-08
1900000 2.616893720235214e-07
2000000 6.801978644860196e-07
2100000 6.27873574438885e-07
2200000 1.9012922248931373e-07
2300000 2.2561234614329645e-08
2400000 3.3484416174482445e-08
2500000 6.097180171021407e-07
2600000 4.517066511529012e-08


In [9]:
k=100000
for i in range(0,len(means)-k,k):
    print(i,norm(means[i]-means[i+k]))

0 0.1447404149041009
100000 0.05378261620813265
200000 0.012658681717191447
300000 0.0031600419773093794
400000 0.000749695666280752
500000 0.0017961292526633115
600000 0.001384093437557272
700000 0.0010145837780274702
800000 0.0009613488926847077
900000 0.0032236041391800288
1000000 0.005887319361255275
1100000 0.003930985142360634
1200000 0.002545977841568042
1300000 0.0006030245716472474
1400000 0.0009689868028682781
1500000 0.00035340473480815283
1600000 0.0028256053213221027
1700000 0.004054150005473617
1800000 0.00593420310389188
1900000 0.031098926275841224
2000000 0.025563120599316817
2100000 0.03614930408244452
2200000 0.01959457682007723
2300000 0.002785004480648088
2400000 0.014020684482405553
2500000 0.01962945793467195


In [10]:
norm(means[-1]-means[int(len(means)/2)])

0.13017125416122374

In [12]:
mean_last=means[-1]
norm(mean_last)

1.7898567401736574

In [20]:
eta*vector[_not_nan]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

In [33]:
_not_nan=~isnan(vector)

In [34]:
_not_nan

array([False, False, False, False, False, False,  True,  True,  True,
       False, False, False])

In [35]:
vector

array([   nan,    nan,    nan,    nan,    nan,    nan, -0.012, -0.   ,
        1.   ,    nan,    nan,    nan])

In [37]:
vector[_not_nan]

array([-0.012, -0.   ,  1.   ])