# Feature dimension reduction via SVD

In [1]:
import numpy as np

## This is Example 18.4 from the Stanford IR book's introduction to LSI/LSA (latent semantic indexing, also called latent semantic analysis):
Link: https://nlp.stanford.edu/IR-book/pdf/18lsi.pdf

### `aa` is a term-document matrix (rows are terms/tokens, columns are documents)

In [68]:
# Term-document incidence matrix: each inner list is one term (row),
# each position within it is one document (column).
term_doc_rows = [
    [1, 0, 1, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
    [1, 1, 0, 0, 0, 0],
    [1, 0, 0, 1, 1, 0],
    [0, 0, 0, 1, 0, 1],
]
aa = np.array(term_doc_rows)
aa

array([[1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 0, 1]])

### SVD

In [69]:
# Reduced ("thin") SVD of the term-document matrix: aa = u @ diag(s) @ v.
# NOTE: numpy returns the right factor already transposed, so `v` holds V^T
# (its rows, not its columns, are the right singular vectors).
u, s, v = np.linalg.svd(
    aa,
    full_matrices=False,
)

In [70]:
# Left factor of the SVD: one row per term of aa, one column per singular value.
u

array([[ 4.40347480e-01, -2.96174360e-01, -5.69497581e-01,
         5.77350269e-01, -2.46402144e-01],
       [ 1.29346349e-01, -3.31450692e-01,  5.87021697e-01,
         9.43689571e-16, -7.27197008e-01],
       [ 4.75530263e-01, -5.11115242e-01,  3.67689978e-01,
         4.51851436e-16,  6.14358412e-01],
       [ 7.03020318e-01,  3.50572409e-01, -1.54905878e-01,
        -5.77350269e-01, -1.59788154e-01],
       [ 2.62672838e-01,  6.46746769e-01,  4.14591704e-01,
         5.77350269e-01,  8.66139898e-02]])

In [71]:
# Singular values of aa as a 1-D array (numpy returns them in descending order).
s

array([2.16250096, 1.59438237, 1.27529025, 1.        , 0.39391525])

In [78]:
# Right factor as returned by np.linalg.svd, i.e. V^T: one row per singular
# value, one column per document of aa.
v

array([[ 7.48623048e-01,  2.79711603e-01,  2.03628802e-01,
         4.46563110e-01,  3.25095956e-01,  1.21467154e-01],
       [-2.86453991e-01, -5.28459139e-01, -1.85761186e-01,
         6.25520701e-01,  2.19879758e-01,  4.05640944e-01],
       [-2.79711603e-01,  7.48623048e-01, -4.46563110e-01,
         2.03628802e-01, -1.21467154e-01,  3.25095956e-01],
       [-4.16290741e-16,  1.06695915e-15,  5.77350269e-01,
         2.38203550e-16, -5.77350269e-01,  5.77350269e-01],
       [ 5.28459139e-01, -2.86453991e-01, -6.25520701e-01,
        -1.85761186e-01, -4.05640944e-01,  2.19879758e-01]])

### Keep k = 2 dimensions and create the diagonal Σ matrix

In [72]:
# Number of leading singular values (latent dimensions) to keep.
k = 2

# Truncate a copy rather than `s` itself: zeroing `s` in place would discard
# the original spectrum, so re-running this cell with a larger k (without
# re-running the SVD cell) would silently produce a wrong Σ.
s_truncated = s.copy()
s_truncated[k:] = 0

# Square diagonal Σ_k, shaped so that u @ dia @ v is well-formed.
dia = np.diag(s_truncated)
dia

array([[2.16250096, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 1.59438237, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

### Optimized (rank-k) matrix after removing the last r − k components

In [75]:
# Rebuild the term-document matrix from the SVD factors, using the truncated
# Σ (`dia`) in place of the full diagonal of singular values.
optimized_matrix = u @ dia @ v
optimized_matrix

array([[ 0.8481456 ,  0.51590232,  0.28162515,  0.12986018,  0.20574267,
        -0.07588249],
       [ 0.36077778,  0.35750764,  0.15512454, -0.20565325, -0.02526436,
        -0.18038889],
       [ 1.00327014,  0.71828543,  0.36077778, -0.05052871,  0.15512454,
        -0.20565325],
       [ 0.97800578,  0.12986018,  0.20574267,  1.0285345 ,  0.61713858,
         0.41139591],
       [ 0.12986018, -0.38604214, -0.07588249,  0.89867432,  0.41139591,
         0.4872784 ]])

### Calculate the dimension-reduced features for each document (here 2 features are preserved; the remaining rows are all zeros)

In [77]:
# Scale the rows of `v` by the truncated singular values: each column is one
# document's coordinates in the latent space. Rows whose singular value was
# zeroed in `dia` come out as all zeros, so only the leading rows carry data.
reduced_dimention = dia @ v
reduced_dimention

array([[ 1.61889806,  0.60487661,  0.44034748,  0.96569316,  0.70302032,
         0.26267284],
       [-0.45671719, -0.84256593, -0.29617436,  0.99731918,  0.35057241,
         0.64674677],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])