In [122]:
import pandas as pd
import scipy 
import numpy as np

In [131]:
def df(a): return pd.DataFrame(np.round(a, 2))
m, n = 7, 10
X = np.random.randn(m, n)
df(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.94,1.0,1.07,-1.35,-1.15,0.02,3.25,-0.81,0.67,0.12
1,2.01,-0.6,-0.42,0.32,0.43,1.96,0.76,0.56,-1.18,0.62
2,-1.62,-1.74,0.17,-0.01,1.7,-0.5,-0.28,-0.74,0.77,-0.19
3,0.42,-0.03,0.65,-0.77,0.21,-0.4,0.1,0.23,0.09,-1.31
4,0.37,0.05,0.13,-0.8,1.2,-0.97,0.13,0.56,1.54,1.43
5,0.27,-1.95,1.03,0.63,0.94,1.29,-0.93,0.87,-0.65,-1.38
6,1.44,0.65,1.81,-0.73,-0.15,2.23,-2.04,1.08,-0.4,0.74


## Paper implementation
https://github.com/PrincetonXL/SIF/blob/master/src/SIF_embedding.py

In [132]:
from sklearn.decomposition import TruncatedSVD

In [133]:
def compute_pc(X,npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
    """
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_

def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc==1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


In [134]:
df(remove_pc(X))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.06,-0.76,0.78,-0.23,0.32,0.09,0.6,-0.35,0.32,-0.58
1,1.63,-0.93,-0.48,0.54,0.71,1.97,0.26,0.64,-1.24,0.48
2,-0.65,-0.88,0.32,-0.55,0.98,-0.53,1.01,-0.97,0.94,0.14
3,0.33,-0.11,0.64,-0.72,0.28,-0.39,-0.02,0.25,0.07,-1.34
4,0.19,-0.11,0.1,-0.7,1.33,-0.97,-0.11,0.6,1.51,1.37
5,1.23,-1.11,1.17,0.1,0.24,1.26,0.33,0.64,-0.49,-1.04
6,1.45,0.66,1.81,-0.74,-0.16,2.23,-2.03,1.08,-0.4,0.74


### dissect

In [135]:
trun_svd = TruncatedSVD(1)
trun_svd.fit(X)
pc = trun_svd.components_
df(pc)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.47,0.41,0.07,-0.26,-0.34,-0.01,0.62,-0.11,0.08,0.16


In [136]:
df(pc.transpose())

Unnamed: 0,0
0,0.47
1,0.41
2,0.07
3,-0.26
4,-0.34
5,-0.01
6,0.62
7,-0.11
8,0.08
9,0.16


In [137]:
X.shape, pc.shape

((7, 10), (1, 10))

In [139]:
df(X - X @ pc.T @ pc)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.06,-0.76,0.78,-0.23,0.32,0.09,0.6,-0.35,0.32,-0.58
1,1.63,-0.93,-0.48,0.54,0.71,1.97,0.26,0.64,-1.24,0.48
2,-0.65,-0.88,0.32,-0.55,0.98,-0.53,1.01,-0.97,0.94,0.14
3,0.33,-0.11,0.64,-0.72,0.28,-0.39,-0.02,0.25,0.07,-1.34
4,0.19,-0.11,0.1,-0.7,1.33,-0.97,-0.11,0.6,1.51,1.37
5,1.23,-1.11,1.17,0.1,0.24,1.26,0.33,0.64,-0.49,-1.04
6,1.45,0.66,1.81,-0.74,-0.16,2.23,-2.03,1.08,-0.4,0.74


In [148]:
df(pc.T @ pc)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.22,0.19,0.03,-0.12,-0.16,-0.01,0.29,-0.05,0.04,0.08
1,0.19,0.17,0.03,-0.11,-0.14,-0.01,0.25,-0.04,0.03,0.07
2,0.03,0.03,0.0,-0.02,-0.02,-0.0,0.04,-0.01,0.01,0.01
3,-0.12,-0.11,-0.02,0.07,0.09,0.0,-0.16,0.03,-0.02,-0.04
4,-0.16,-0.14,-0.02,0.09,0.12,0.01,-0.21,0.04,-0.03,-0.06
5,-0.01,-0.01,-0.0,0.0,0.01,0.0,-0.01,0.0,-0.0,-0.0
6,0.29,0.25,0.04,-0.16,-0.21,-0.01,0.38,-0.07,0.05,0.1
7,-0.05,-0.04,-0.01,0.03,0.04,0.0,-0.07,0.01,-0.01,-0.02
8,0.04,0.03,0.01,-0.02,-0.03,-0.0,0.05,-0.01,0.01,0.01
9,0.08,0.07,0.01,-0.04,-0.06,-0.0,0.1,-0.02,0.01,0.03


## using pca

In [144]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(X)
pc_ = pca.components_
df(pc_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.37,0.42,0.01,-0.25,-0.34,-0.13,0.65,-0.16,0.13,0.16


In [145]:
df(X - X @ pc_.T @ pc_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.37,-0.76,1.02,-0.29,0.3,0.57,0.48,-0.14,0.13,-0.55
1,1.87,-0.75,-0.43,0.42,0.56,2.01,0.52,0.61,-1.22,0.56
2,-0.94,-0.97,0.19,-0.47,1.07,-0.73,0.92,-1.03,1.0,0.09
3,0.36,-0.09,0.65,-0.73,0.27,-0.38,-0.0,0.26,0.07,-1.34
4,0.18,-0.15,0.12,-0.68,1.37,-0.91,-0.19,0.64,1.48,1.35
5,1.16,-0.95,1.06,0.03,0.13,0.98,0.63,0.48,-0.35,-1.0
6,1.68,0.93,1.81,-0.9,-0.38,2.14,-1.61,0.97,-0.31,0.85


In [147]:
df(pc - pc_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.1,-0.01,0.06,-0.01,-0.0,0.11,-0.04,0.05,-0.05,0.01


In [146]:
df(X @ pc.T @ pc - X @ pc_.T @ pc_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.43,0.0,0.25,-0.06,-0.03,0.48,-0.12,0.21,-0.19,0.03
1,0.24,0.18,0.05,-0.12,-0.15,0.04,0.26,-0.03,0.02,0.07
2,-0.29,-0.09,-0.12,0.08,0.09,-0.2,-0.09,-0.07,0.06,-0.05
3,0.03,0.01,0.01,-0.01,-0.01,0.02,0.02,0.0,-0.0,0.01
4,-0.0,-0.04,0.02,0.02,0.03,0.06,-0.08,0.04,-0.03,-0.01
5,-0.07,0.15,-0.11,-0.07,-0.11,-0.28,0.3,-0.16,0.14,0.04
6,0.24,0.27,0.01,-0.16,-0.22,-0.09,0.43,-0.1,0.08,0.1
