# SVD - Matrix Factorization

In [3]:
import pandas as pd
import numpy as np
from random import randint
from scipy.spatial.distance import cosine

## Generate Data

In [10]:
def generate_data(n_songs = 3000, n_genres = 10, n_artists = 450, n_users = 30000, n_listens = 15, dataset_size = 100000):
    '''
    This function will generate a dataset with features associated to
    song data set. The dataset will have the following columns : 
        - song_id (String) : Unique identified for the song
        - user_id (String) : Unique identifier for the user
        - song_genre (Integer) : An integer representing a genre for the song, 
                                 value is between 1 and 15, indicating that 
                                 there are 15 unique genres. Each song can only
                                 have 1 genre
        - artist_id (String) : Unique identifier for the author of the song
        - n_listen (Integer) : The number of times this user has heard the song
        - publish_year (Integer) : The year of song publishing
        
    params:
        n_songs (Integer) : The number of songs you want the dataset to have
        n_genres (Integer) : Number of genres to be chosen from
        n_artists (Integer) : Number of authors to be generated
        n_users (Integer) : Number of readers for the dataset
        n_listens (Integer) : Range of number of times a song has been heard
        dataset_size (Integer) : The number of rows to be generated 
        
    example:
        data = generate_data()
    '''
    
    d = pd.DataFrame(
        {
            'song_id' : [randint(1, n_songs) for _ in range(dataset_size)],
            'artist_id' : [randint(1, n_artists) for _ in range(dataset_size)],
            'song_genre' : [randint(1, n_genres) for _ in range(dataset_size)],
            'user_id' : [randint(1, n_users) for _ in range(dataset_size)],
            'n_listen' : [randint(1, n_listens) for _ in range(dataset_size)],
            'publish_year' : [randint(2000, 2021) for _ in range(dataset_size)]
        }
    ).drop_duplicates()
    return d
  
d = generate_data(dataset_size = 100000).drop_duplicates()
d.to_csv('data.csv', index = False)

In [12]:
d.head()

Unnamed: 0,song_id,artist_id,song_genre,user_id,n_listen,publish_year
0,1567,169,4,19060,8,2002
1,1964,381,2,28706,4,2013
2,714,84,2,27262,3,2010
3,2190,217,3,2036,10,2020
4,2483,230,4,27202,5,2012


In [29]:
Math(
    r'''
{\displaystyle \mathbf {M} =\mathbf {U\Sigma V^{T}} }
    '''
)

<IPython.core.display.Math object>

In [2]:
from IPython.display import display, Math, Latex

In [3]:
display(Math(r'F(k) = \int_{-\infty}^{\infty} f(x) e^{2\pi i k} dx'))

<IPython.core.display.Math object>

In [6]:
display(Math(
    r'$$\begin{pmatrix}    3  & -10 & 2 \\    -1 &   7 & 4 \\    5  &   0 & 1\end{pmatrix}\begin{pmatrix}    x\\ y\\ z\end{pmatrix}=\begin{pmatrix}   3x-10y+2z \\   -x+7y+4z \\   5x+z\end{pmatrix}$$'
))

<IPython.core.display.Math object>

In [43]:
Latex(r'''        $$ M*M^{T} = \begin {pmatrix}
            0  & 1 & 1 \\
            {\sqrt 2} &   2 & 0 \\
            0  & 1 & 1
        \end{pmatrix} * \begin {pmatrix}
        0 & {\sqrt 2} & 0 \\
        1 & 2 & 1 \\
        1 & 0 & 1
        
        \end {pmatrix} = 
        \begin {pmatrix}
        2 & 2 & 2 \\
        2 & 6 & 2 \\
        2 & 2 & 2 \\
        \end {pmatrix}
        $$''')

<IPython.core.display.Latex object>

In [85]:
import numpy as np

In [97]:
m = np.asarray([
    [0, 1, 1],
    [np.sqrt(2), 2, 0],
    [0, 1, 1]
])

In [98]:
sigma = np.asarray([
    [2*np.sqrt(2), 0, 0],
    [0, np.sqrt(2), 0],
    [0, 0, 0]
])

In [107]:
u = np.asarray([
    [np.sqrt(6)/6, np.sqrt(3)/3, -np.sqrt(2)/2],
    [np.sqrt(6)/3, -np.sqrt(3)/3, 0],
    [np.sqrt(6)/6, np.sqrt(3)/3, np.sqrt(2)/2]
])

In [100]:
v = np.asarray([
    [np.sqrt(6) / 6, -np.sqrt(3)/3, np.sqrt(2)/2],
    [np.sqrt(3) / 2, 0, -1/2],
    [np.sqrt(3) / 6, -np.sqrt(6)/3, 1/2]
])

In [103]:
u.dot(sigma).dot(v.T)

array([[-1.11022302e-16,  1.00000000e+00, -3.33333333e-01],
       [ 1.41421356e+00,  2.00000000e+00,  1.33333333e+00],
       [-1.11022302e-16,  1.00000000e+00, -3.33333333e-01]])

In [104]:
u * sigma * v.T

array([[ 0.47140452,  0.        , -0.        ],
       [-0.        , -0.        , -0.        ],
       [ 0.        , -0.        ,  0.        ]])

In [127]:
from numpy.linalg import svd
matrix = m
matrix = [[4,0], [3, -5]]
u2, s2, vh2 = svd(matrix, full_matrices=False)

In [140]:
u2 * s2 * vh2.T

array([[ 2.,  2.],
       [-4., -1.]])

In [122]:
u

array([[ 0.40824829,  0.57735027, -0.70710678],
       [ 0.81649658, -0.57735027,  0.        ],
       [ 0.40824829,  0.57735027,  0.70710678]])

In [123]:
u2

array([[-0.4472136 , -0.89442719],
       [-0.89442719,  0.4472136 ]])

In [113]:
vh2

array([[-4.08248290e-01, -8.66025404e-01, -2.88675135e-01],
       [-5.77350269e-01, -5.55111512e-17,  8.16496581e-01],
       [-7.07106781e-01,  5.00000000e-01, -5.00000000e-01]])

In [114]:
v

array([[ 0.40824829, -0.57735027,  0.70710678],
       [ 0.8660254 ,  0.        , -0.5       ],
       [ 0.28867513, -0.81649658,  0.5       ]])

In [116]:
matrix * vh2 * s2.T

array([[-0.00000000e+00, -1.22474487e+00,  0.00000000e+00],
       [-2.30940108e+00, -1.57009246e-16, -0.00000000e+00],
       [-0.00000000e+00,  7.07106781e-01,  0.00000000e+00]])

In [142]:
Latex(r'''        $$ 
M*M^{T} = \begin {pmatrix}
            0  & 1 & 1 \\
            {\sqrt 2} &   2 & 0 \\
            0  & 1 & 1
        \end{pmatrix} * \begin {pmatrix}
        0 & {\sqrt 2} & 0 \\
        1 & 2 & 1 \\
        1 & 0 & 1
        
        \end {pmatrix} = 
        \begin {pmatrix}
        2 & 2 & 2 \\
        2 & 6 & 2 \\
        2 & 2 & 2 \\
        \end {pmatrix} \\

Eignevalue = 8, Eigenvector = \begin {pmatrix} 1 & 2 & 1 \\ \end {pmatrix} \\
Eignevalue = 2, Eigenvector = \begin {pmatrix} 1 & -1 & 1 \\ \end {pmatrix} \\
Eignevalue = 0, Eigenvector = \begin {pmatrix} -1 & 0 & 1 \\ \end {pmatrix} \\ 
\\

{\sigma 1} = {\sqrt 8} = 2{\sqrt 2} \\
{\sigma 2} = {\sqrt 2} \\

{\Sigma} =  \begin {pmatrix}
2{\sqrt 2} & 0 & 0 \\
0 & {\sqrt 2} & 0 \\
0 & 0 & 0

\end {pmatrix} \\

u1 = \begin {pmatrix} {\sqrt 6} / 6  \\ {\sqrt 6} / 3 \\ {\sqrt 6} / 6 \end {pmatrix} \\
u2 = \begin {pmatrix} {\sqrt 3} / 3  \\ -{\sqrt 3} / 3 \\  {\sqrt 3} / 3 \end {pmatrix} \\
u3 = \begin {pmatrix} -{\sqrt 2} / 2  \\ 0 \\ {\sqrt 2} / 2 \end {pmatrix} \\

U = \begin {pmatrix}
{\sqrt 6} / 6 & {\sqrt 3} / 3 & -{\sqrt 2} / 2 \\
{\sqrt 6} / 3 & -{\sqrt 3} / 3  & 0 \\
{\sqrt 6} / 6 & {\sqrt 3} / 3 & {\sqrt 2} / 2
\end {pmatrix} \\ 

V = \begin {pmatrix}
{\sqrt 6} / 6 & -{\sqrt 3} / 3 & {\sqrt 2} / 2 \\
{\sqrt 3} / 2 & 0  & -1/2 \\
{\sqrt 3} / 6 & {\sqrt 6} / 3 & 1/2
\end {pmatrix}
        $$''')

<IPython.core.display.Latex object>

In [150]:
lst = ['a', 'ama', 'lsma']
{''.join(sorted(x)) for x in lst}

{'a', 'aam', 'alms'}

In [36]:
Latex(r'''        $$M = \begin {pmatrix}
            0  & 1 & 1 \\
            {\sqrt 2} &   2 & 0 \\
            0  & 1 & 1
        \end{pmatrix}
        \begin{pmatrix}
            x\\ y\\ z
        \end{pmatrix}
        =
        \begin{pmatrix}
           3x-10y+2z \\
           -x+7y+4z \\
           5x+z
        \end{pmatrix}$$''')

<IPython.core.display.Latex object>

In [12]:
Latex(r'''$$
Specifically, the singular value decomposition of an {\displaystyle m\times n}m\times n complex matrix M is a factorization of the form {\displaystyle \mathbf {M} =\mathbf {U\Sigma V^{*}} }{\displaystyle \mathbf {M} =\mathbf {U\Sigma V^{*}} }, where U is an {\displaystyle m\times m}m\times m complex unitary matrix, {\displaystyle \mathbf {\Sigma } }\mathbf{\Sigma} is an {\displaystyle m\times n}m\times n rectangular diagonal matrix with non-negative real numbers on the diagonal, and V is an {\displaystyle n\times n}n\times n complex unitary matrix. If M is real, U and V can also be guaranteed to be real orthogonal matrices. In such contexts, the SVD is often denoted {\displaystyle \mathbf {U\Sigma V^{T}} }{\displaystyle \mathbf {U\Sigma V^{T}} }.
$$''')

<IPython.core.display.Latex object>