In [2]:
import numpy as np
import pandas as pd
import numpy.linalg as la
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

def df(x):
    """Wrap ``x`` in a pandas DataFrame with every value rounded to 3 decimals.

    Used throughout the notebook purely as a compact display helper.
    """
    frame = pd.DataFrame(x)
    return frame.round(decimals=3)

# small number example

## SVD

`X` is n_user by n_movie

In [3]:
# Toy user-by-movie matrix: one row per user, one column per movie.
X = np.array(
    [
        [4, 4, 0, 0],
        [3, 3, 0, 0],
        [5, 5, 0, 0],
        [0, 0, 3, 3],
        [0, 0, 2, 2],
        [0, 0, 5, 5],
    ]
)
m, n = X.shape  # m = number of rows, n = number of columns
df(X)

Unnamed: 0,0,1,2,3
0,4,4,0,0
1,3,3,0,0
2,5,5,0,0
3,0,0,3,3
4,0,0,2,2
5,0,0,5,5


In [4]:
# Thin SVD of X: s holds the singular values, Sigma is the same values on a diagonal.
U, s, Vh = la.svd(X, full_matrices=False)
Sigma = np.diag(s)

# Shapes of all factors, displayed as one tuple.
tuple(factor.shape for factor in (U, s, Sigma, Vh))

((6, 4), (4,), (4, 4), (4, 4))

In [5]:
# Sanity check: the thin SVD factors multiply back to X.
# `U * s` scales column j of U by s[j], i.e. it equals U @ Sigma.
np.allclose(X, (U * s) @ Vh)

True

In [6]:
# Number of leading singular components kept in the truncated factors below.
k = 2

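Compare the 2 below: the full `U * s` and its first `k` columns. The dropped columns are all zero (up to rounding), so nothing is lost.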

In [7]:
# Scale each column of U by its singular value (s broadcasts across rows); equals U @ Sigma.
df(U * s)

Unnamed: 0,0,1,2,3
0,-5.657,0.0,0.0,-0.0
1,-4.243,0.0,-0.0,-0.0
2,-7.071,0.0,-0.0,0.0
3,0.0,-4.243,0.0,0.0
4,0.0,-2.828,0.0,0.0
5,0.0,-7.071,0.0,-0.0


In [8]:
# Keep only the first k columns of U and the first k singular values.
df(U[:,:k] * s[:k])

Unnamed: 0,0,1
0,-5.657,0.0
1,-4.243,0.0
2,-7.071,0.0
3,0.0,-4.243
4,0.0,-2.828
5,0.0,-7.071


Compare the 2 below: the full `Vh.T * s` and its first `k` columns.

In [9]:
# Rows of Vh.T correspond to the columns of X (movies); each column is scaled by its singular value.
df(Vh.T * s)

Unnamed: 0,0,1,2,3
0,-7.071,-0.0,-0.0,0.0
1,-7.071,-0.0,0.0,0.0
2,-0.0,-6.164,0.0,-0.0
3,-0.0,-6.164,0.0,0.0


In [10]:
# Keep only the first k right singular vectors (as columns) and the first k singular values.
df(Vh[:k].T * s[:k])

Unnamed: 0,0,1
0,-7.071,-0.0
1,-7.071,-0.0
2,-0.0,-6.164
3,-0.0,-6.164


embeddings of users

In [11]:
# User embeddings: the first k left singular vectors, each scaled by its singular value.
eb_u = s[:k] * U[:, :k]
df(eb_u)

Unnamed: 0,0,1
0,-5.657,0.0
1,-4.243,0.0
2,-7.071,0.0
3,0.0,-4.243
4,0.0,-2.828
5,0.0,-7.071


embeddings of movies

In [13]:
# Movie embeddings: the first k right singular vectors, transposed so each row is one movie.
eb_m = Vh[:k, :].T
df(eb_m)

Unnamed: 0,0,1
0,-0.707,-0.0
1,-0.707,-0.0
2,-0.0,-0.707
3,-0.0,-0.707


Compare the 2 below: X = user_embedding @ movie_embedding.T

In [15]:
# Rank-k reconstruction of X from the user and movie embeddings.
df(eb_u @ eb_m.T)

Unnamed: 0,0,1,2,3
0,4.0,4.0,0.0,0.0
1,3.0,3.0,0.0,0.0
2,5.0,5.0,0.0,0.0
3,0.0,0.0,3.0,3.0
4,0.0,0.0,2.0,2.0
5,0.0,0.0,5.0,5.0


In [16]:
# Original matrix, shown for comparison with the rank-k reconstruction.
df(X)

Unnamed: 0,0,1,2,3
0,4,4,0,0
1,3,3,0,0
2,5,5,0,0
3,0,0,3,3
4,0,0,2,2
5,0,0,5,5


Compare the two below: user_embedding = X @ movie_embedding

In [17]:
# User embeddings (first k columns of U scaled by s), shown for comparison with X @ eb_m.
df(eb_u)

Unnamed: 0,0,1
0,-5.657,0.0
1,-4.243,0.0
2,-7.071,0.0
3,0.0,-4.243
4,0.0,-2.828
5,0.0,-7.071


In [19]:
# Project X onto the first k right singular vectors; the result matches eb_u.
df(X @ eb_m)

Unnamed: 0,0,1
0,-5.657,0.0
1,-4.243,0.0
2,-7.071,0.0
3,0.0,-4.243
4,0.0,-2.828
5,0.0,-7.071


## PCA equivalency

In [20]:
# Center every column of X at zero by subtracting the per-column mean.
X_mean = X.mean(0)
X_nrm = X - X_mean

# PCA model keeping k components; it is fitted in a later cell.
pca = PCA(n_components=k)

df(X_nrm)

Unnamed: 0,0,1,2,3
0,2.0,2.0,-1.667,-1.667
1,1.0,1.0,-1.667,-1.667
2,3.0,3.0,-1.667,-1.667
3,-2.0,-2.0,1.333,1.333
4,-2.0,-2.0,0.333,0.333
5,-2.0,-2.0,3.333,3.333


In [21]:
# Thin SVD of the column-centered matrix; the trailing underscore marks the centered variants.
U_, s_, Vh_ = la.svd(X_nrm, full_matrices=False)

Compare the 2 below: both are the user embeddings (PCA scores vs. truncated SVD of the centered matrix)

In [22]:
# Fit the PCA model on the centered data and show the k-dimensional scores.
# This call also fits `pca`, which a later cell reads via pca.components_.
df(pca.fit_transform(X_nrm))

Unnamed: 0,0,1
0,-3.68,0.12
1,-2.623,-0.82
2,-4.736,1.06
3,3.366,-0.472
4,2.426,-1.528
5,5.247,1.641


In [23]:
# User embeddings from the centered SVD: first k left singular vectors scaled by their singular values.
eb_u_ = s_[:k] * U_[:, :k]
df(eb_u_)

Unnamed: 0,0,1
0,-3.68,0.12
1,-2.623,-0.82
2,-4.736,1.06
3,3.366,-0.472
4,2.426,-1.528
5,5.247,1.641


Compare the two below: they are the embeddings of movies (`pca.components_` is the transpose of `eb_m_`)

In [31]:
# Movie embeddings from the centered SVD: first k right singular vectors, one row per movie.
eb_m_ = Vh_[:k, :].T
df(eb_m_)

Unnamed: 0,0,1
0,-0.528,0.47
1,-0.528,0.47
2,0.47,0.528
3,0.47,0.528


In [32]:
# Principal axes of the fitted PCA, one per row; compare with eb_m_ (its transpose).
df(pca.components_)

Unnamed: 0,0,1,2,3
0,-0.528,-0.528,0.47,0.47
1,0.47,0.47,0.528,0.528


Compare the 2 below: X_nrm = user_embedding @ movie_embedding.T

In [34]:
# Reconstruct the centered matrix from the k-dimensional user and movie embeddings.
df(eb_u_ @ eb_m_.T)

Unnamed: 0,0,1,2,3
0,2.0,2.0,-1.667,-1.667
1,1.0,1.0,-1.667,-1.667
2,3.0,3.0,-1.667,-1.667
3,-2.0,-2.0,1.333,1.333
4,-2.0,-2.0,0.333,0.333
5,-2.0,-2.0,3.333,3.333


In [35]:
# Centered matrix, shown for comparison with the reconstruction.
df(X_nrm)

Unnamed: 0,0,1,2,3
0,2.0,2.0,-1.667,-1.667
1,1.0,1.0,-1.667,-1.667
2,3.0,3.0,-1.667,-1.667
3,-2.0,-2.0,1.333,1.333
4,-2.0,-2.0,0.333,0.333
5,-2.0,-2.0,3.333,3.333


Compare the two below: user_embedding = X_nrm @ movie_embedding

In [36]:
# User embeddings from the centered SVD, shown for comparison with X_nrm @ eb_m_.
df(eb_u_)

Unnamed: 0,0,1
0,-3.68,0.12
1,-2.623,-0.82
2,-4.736,1.06
3,3.366,-0.472
4,2.426,-1.528
5,5.247,1.641


In [37]:
# Project the centered matrix onto the first k right singular vectors; the result matches eb_u_.
df(X_nrm @ eb_m_)

Unnamed: 0,0,1
0,-3.68,0.12
1,-2.623,-0.82
2,-4.736,1.06
3,3.366,-0.472
4,2.426,-1.528
5,5.247,1.641


# big number example

In [38]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import numpy.linalg as la

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [39]:
# Restrict the corpus to four newsgroups and strip headers, footers and quotes from each post.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')

# Fetch the train split first, then the test split, with identical settings.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories, remove=remove)
    for split in ('train', 'test')
)

In [40]:
# Binary bag-of-words: with binary=True every non-zero count is recorded as 1.
vectorizer = CountVectorizer(stop_words='english', binary=True)
# .toarray() yields a plain ndarray; .todense() would return an np.matrix,
# which NumPy discourages and which needs an extra conversion before use.
vectors = vectorizer.fit_transform(newsgroups_train.data).toarray() # (documents, vocab)
m, n = vectors.shape
m, n

(2034, 26576)

In [41]:
# Category names of the loaded training subset.
newsgroups_train.target_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [42]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# dtype=str keeps a fixed-width unicode array whichever API supplied the names
# (get_feature_names_out() returns an object-dtype array).
vocab = np.array(feature_names, dtype=str)

vocab.shape

(26576,)

In [43]:
# Peek at ten vocabulary terms starting at index 10000.
vocab[10000: 10010]

array(['factors', 'factory', 'facts', 'factsnet', 'factual', 'factually',
       'faculty', 'fade', 'fades', 'fading'], dtype='<U80')

X is n_doc by n_vocab

In [44]:
# np.asarray converts to a base ndarray without the extra full copy that
# np.array(...) makes of the dense document-term matrix.
X = np.asarray(vectors)
# Thin SVD of the document-term matrix.
U, s, Vh = la.svd(X, full_matrices=False)

In [45]:
# Shapes of the thin-SVD factors.
U.shape, s.shape, Vh.shape

((2034, 2034), (2034,), (2034, 26576))

In [46]:
# Sanity check: the thin SVD factors multiply back to X.
# `U * s` scales column j of U by s[j] before the matrix product with Vh.
np.allclose(X, (U * s) @ Vh)

True

In [47]:
# Number of leading singular components kept for the document and vocabulary embeddings.
k = 300

embeddings of docs

In [48]:
# Document embeddings: first k left singular vectors, each scaled by its singular value.
eb_d = s[:k] * U[:, :k]

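embeddings of vocabs (stored as k by n_vocab, i.e. one column per term — the transpose of the layout used for `eb_m` in the small example)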

In [49]:
# First k right singular vectors kept as rows, so each column corresponds to one vocabulary term.
eb_v = Vh[:k]

In [50]:
# Shapes of the document and vocabulary embeddings.
eb_d.shape, eb_v.shape

((2034, 300), (300, 26576))

In [51]:
# The rank-k product has the same shape as X (only shapes are compared here, not values).
(eb_d @ eb_v).shape, X.shape

((2034, 26576), (2034, 26576))