In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.corpus import inaugural

In [2]:
# Load the first ten inaugural addresses (raw text) into a DataFrame with one
# row per speech and a sequential integer docid.
#
# The rows are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0, re-copied the whole frame on every iteration, and
# upcast docid to float (0.0, 1.0, ...). Column order is kept as (doc, docid).
records = [
    {'docid': docid, 'doc': inaugural.raw(fileid)}
    for docid, fileid in enumerate(inaugural.fileids()[:10])
]
df = pd.DataFrame(records, columns=['doc', 'docid'])

In [3]:
# Preview the first rows to confirm each speech was loaded next to its docid.
df.head()

Unnamed: 0,doc,docid
0,Fellow-Citizens of the Senate and of the House...,0.0
1,"Fellow citizens, I am again called upon by the...",1.0
2,"When it was first perceived, in early times, t...",2.0
3,Friends and Fellow Citizens:\n\nCalled upon to...,3.0
4,"Proceeding, fellow citizens, to that qualifica...",4.0


### scikit-learn provides a very convenient TF-IDF vectorizer, `TfidfVectorizer`

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the speeches and build the TF-IDF matrix in one step.
# X has one row per document and is the matrix factorized by the SVD cells below.
V = TfidfVectorizer()
X = V.fit_transform(df['doc'])

### and also an estimator for computing a truncated SVD, `TruncatedSVD`

In [5]:
from sklearn.decomposition import TruncatedSVD

# Keep the top 5 components; random_state is pinned so repeated runs yield the
# same factors. n_iter is the iteration count handed to the solver.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

In [6]:
# Fit the decomposition on the TF-IDF matrix; this populates components_ and
# singular_values_, which the following cells inspect.
svd.fit(X)

TruncatedSVD(n_components=5, n_iter=7, random_state=42)

**With the SVD computed, we can now extract the factors**

`svd.transform(X)` produces $U \Sigma$ (each document projected onto the retained components)

In [7]:
# Project the documents onto the fitted components (one row per document,
# one column per component).
svd.transform(X)

array([[ 0.93463347, -0.01492144, -0.03610824,  0.31191945,  0.06088167],
       [ 0.76700655,  0.63903337,  0.01255701, -0.03550425,  0.03707685],
       [ 0.94174323, -0.04659484, -0.17967291, -0.03786259, -0.16746802],
       [ 0.94617395, -0.06802826, -0.17601907, -0.10595164,  0.00550266],
       [ 0.93915482, -0.12237826, -0.07266183, -0.13246871,  0.25376191],
       [ 0.94993958, -0.04975374,  0.0081433 ,  0.06040607, -0.01287225],
       [ 0.93172153, -0.08190089,  0.26375008, -0.12132008, -0.06448393],
       [ 0.96293912, -0.07971455,  0.07947081,  0.00791192,  0.01137386],
       [ 0.95911734, -0.07054314,  0.12833825,  0.05638333,  0.02770982],
       [ 0.95928001,  0.01560924, -0.02604711, -0.01045235, -0.14216591]])

`svd.components_` holds $V^T$ (one row per component, one column per vocabulary term)

In [8]:
# Right singular vectors stored on the fitted estimator (rows = components).
svd.components_

array([[ 0.00272336,  0.00054467,  0.00159054, ...,  0.00106196,
         0.00199141,  0.00070456],
       [-0.00384686, -0.00076937, -0.00048768, ..., -0.00160627,
        -0.00219143, -0.00112016],
       [ 0.01935716,  0.00387143, -0.00326409, ...,  0.00591679,
         0.00398704,  0.00308875],
       [ 0.00921755,  0.00184351,  0.03056171, ...,  0.00185049,
         0.00716281,  0.0003333 ],
       [ 0.00554277,  0.00110855,  0.00729878, ...,  0.00144075,
        -0.00051662,  0.00058626]])

`svd.singular_values_` holds the diagonal of $\Sigma$ as a vector

In [9]:
# Singular values of the retained components, largest first in the output below.
svd.singular_values_

array([2.94344053, 0.67165411, 0.40385824, 0.38791748, 0.35069133])

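### Under the hood, there is another way to generate the factors explicitly

With its default randomized solver, `TruncatedSVD` calls the lesser-known function `randomized_svd()` internally; we can call it directly to get $U$, $\Sigma$ and $V^T$ as separate arrays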

In [10]:
from sklearn.utils.extmath import randomized_svd

# Compute the factors directly: U (documents x components), Sigma (vector of
# singular values) and VT (components x terms).
# random_state is pinned (same seed as the TruncatedSVD cell) so that
# Restart & Run All reproduces these factors; random_state=None would draw
# from the global NumPy RNG and is not reproducible across runs.
U, Sigma, VT = randomized_svd(X,
                              n_components=10,
                              n_iter=5,
                              random_state=42)

In [11]:
# Display the three factors returned by randomized_svd as a tuple.
U, Sigma, VT

(array([[ 0.31753095, -0.02221595, -0.08940821,  0.80408714,  0.17360472,
         -0.4090113 , -0.18264982,  0.00332914,  0.06311966,  0.09667276],
        [ 0.26058164,  0.95143224,  0.03109261, -0.09152527,  0.10572502,
         -0.01850966,  0.04258036,  0.01670277, -0.06237652,  0.00519603],
        [ 0.31994641, -0.06937326, -0.44489104, -0.09760475, -0.47753681,
         -0.29494653,  0.53096516,  0.2851214 , -0.08390036, -0.00863578],
        [ 0.3214517 , -0.10128467, -0.4358437 , -0.27312933,  0.01569088,
         -0.06215974, -0.4975021 , -0.40225353, -0.43287484, -0.14506535],
        [ 0.31906703, -0.18220429, -0.17991915, -0.34148684,  0.72360473,
         -0.0636217 ,  0.13631377,  0.19898271,  0.3594392 ,  0.03146489],
        [ 0.32273102, -0.07407643,  0.02016376,  0.15571887, -0.03670535,
          0.59755067, -0.23425276,  0.62546142, -0.23775825, -0.06843249],
        [ 0.31654165, -0.12193908,  0.65307588, -0.31274715, -0.18387661,
         -0.4649036 , -0.2563676

In [12]:
# Recover the same three factors from the TruncatedSVD estimator so they can be
# compared with the randomized_svd output:
#   U     -> fit_transform(X) gives U * Sigma, so divide the columns by Sigma
#   Sigma -> svd.singular_values_
#   V^T   -> svd.components_
# Note: fit_transform refits `svd` on X before the attributes are read.
(
    svd.fit_transform(X) / svd.singular_values_,
    svd.singular_values_,
    svd.components_,
)


(array([[ 0.31753095, -0.02221595, -0.08940821,  0.80408714,  0.17360472],
        [ 0.26058164,  0.95143224,  0.03109261, -0.09152527,  0.10572502],
        [ 0.31994641, -0.06937326, -0.44489104, -0.09760475, -0.47753681],
        [ 0.3214517 , -0.10128467, -0.4358437 , -0.27312933,  0.01569088],
        [ 0.31906703, -0.18220429, -0.17991915, -0.34148684,  0.72360473],
        [ 0.32273102, -0.07407643,  0.02016376,  0.15571887, -0.03670535],
        [ 0.31654165, -0.12193908,  0.65307588, -0.31274715, -0.18387661],
        [ 0.32714747, -0.11868392,  0.19677898,  0.02039589,  0.03243267],
        [ 0.32584906, -0.10502896,  0.31778043,  0.14534878,  0.07901485],
        [ 0.32590433,  0.02324   , -0.06449569, -0.02694479, -0.40538758]]),
 array([2.94344053, 0.67165411, 0.40385824, 0.38791748, 0.35069133]),
 array([[ 0.00272336,  0.00054467,  0.00159054, ...,  0.00106196,
          0.00199141,  0.00070456],
        [-0.00384686, -0.00076937, -0.00048768, ..., -0.00160627,
         -