In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import svd, eig
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib import cm
from sklearn.datasets import load_iris, fetch_olivetti_faces
from sklearn.decomposition import PCA, TruncatedSVD
from pandas.tools.plotting import scatter_matrix

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline

## SVD for topic analysis

We can use SVD to determine what we call ***latent features***. This will be best demonstrated with an example.

### Example

Let's look at users' ratings of different movies. Ratings run from 1 to 5; a rating of 0 means the user hasn't watched the movie.

|       | Matrix | Alien | StarWars | Casablanca | Titanic |
| ----- | ------ | ----- | -------- | ---------- | ------ |
| **Alice** |      1 |     2 |        2 |          0 |      0 |
|   **Bob** |      3 |     5 |        5 |          0 |      0 |
| **Cindy** |      4 |     4 |        4 |          0 |      0 |
|   **Dan** |      5 |     5 |        5 |          0 |      0 |
| **Emily** |      0 |     2 |        0 |          4 |      4 |
| **Frank** |      0 |     0 |        0 |          5 |      5 |
|  **Greg** |      0 |     1 |        0 |          2 |      2 |

Note that the first three movies (Matrix, Alien, StarWars) are Sci-fi movies and the last two (Casablanca, Titanic) are Romance. We will be able to mathematically pull out these topics!

Let's do the computation with Python.

In [2]:
# User-by-movie ratings from the table above.
#   rows:    Alice, Bob, Cindy, Dan, Emily, Frank, Greg
#   columns: Matrix, Alien, StarWars, Casablanca, Titanic
# A 0 marks a movie the user has not watched.
M = np.array([
    [1, 2, 2, 0, 0],
    [3, 5, 5, 0, 0],
    [4, 4, 4, 0, 0],
    [5, 5, 5, 0, 0],
    [0, 2, 0, 4, 4],
    [0, 0, 0, 5, 5],
    [0, 1, 0, 2, 2],
])



In [3]:
# Compute SVD: factor the ratings matrix as M = U . diag(sigma) . VT.
# `svd` is already imported from numpy.linalg in the first cell, so it is
# not imported again here.
# With numpy's default full_matrices=True, U is 7x7 and VT is 5x5, and
# sigma holds the five singular values.
U, sigma, VT = svd(M)

# print(...) with a single argument behaves the same on Python 2 and
# Python 3; the bare `print x` statement is a SyntaxError on Python 3.
print(U)
print(sigma)
print(VT)

[[ -2.12142669e-01   2.35889359e-02   3.05275882e-01  -2.55204195e-01
    5.08333980e-01  -6.55873811e-01   3.27936906e-01]
 [ -5.48509647e-01   6.39541961e-02   5.32055497e-01  -4.61448643e-01
   -2.54166990e-01   3.27936906e-01  -1.63968453e-01]
 [ -4.96897235e-01   6.71052975e-02  -3.13985067e-01   1.95838988e-01
    6.66545570e-01   3.65915279e-01  -1.82957639e-01]
 [ -6.21121543e-01   8.38816219e-02  -3.92481334e-01   2.44798735e-01
   -4.82403058e-01  -3.58319604e-01   1.79159802e-01]
 [ -1.24855356e-01  -5.96778016e-01   3.95328299e-01   5.21519583e-01
    6.25619756e-17  -2.00000000e-01  -4.00000000e-01]
 [ -4.41332838e-02  -7.33917008e-01  -4.19213292e-01  -5.32614583e-01
   -1.99840144e-16   5.17221534e-17  -2.58610767e-17]
 [ -6.24276782e-02  -2.98389008e-01   1.97664149e-01   2.60759791e-01
    3.52541217e-18   4.00000000e-01   8.00000000e-01]]
[  1.38366398e+01   9.52139961e+00   1.68783520e+00   1.02056846e+00
   1.57009246e-16]
[[ -5.02352330e-01  -6.19526758e-01  -5.969

In [4]:
# Make interpretable: round the factors and label them with user and movie names.
movies = ['Matrix', 'Alien', 'StarWars', 'Casablanca', 'Titanic']
users = ['Alice', 'Bob', 'Cindy', 'Dan', 'Emily', 'Frank', 'Greg']

# NOTE: this rebinds U, sigma and VT to their 2-decimal roundings, so every
# later cell that reads these names works with the rounded values.
U, sigma, VT = (np.around(x, 2) for x in (U, sigma, VT))

U = pd.DataFrame(U, index=users)        # one row per user
VT = pd.DataFrame(VT, columns=movies)   # one column per movie

# print(...) with a single argument works on both Python 2 and Python 3.
print(U)
print(np.diag(sigma))                   # singular values laid out on a diagonal
print(VT)


          0     1     2     3     4     5     6
Alice -0.21  0.02  0.31 -0.26  0.51 -0.66  0.33
Bob   -0.55  0.06  0.53 -0.46 -0.25  0.33 -0.16
Cindy -0.50  0.07 -0.31  0.20  0.67  0.37 -0.18
Dan   -0.62  0.08 -0.39  0.24 -0.48 -0.36  0.18
Emily -0.12 -0.60  0.40  0.52  0.00 -0.20 -0.40
Frank -0.04 -0.73 -0.42 -0.53 -0.00  0.00 -0.00
Greg  -0.06 -0.30  0.20  0.26  0.00  0.40  0.80
[[ 13.84   0.     0.     0.     0.  ]
 [  0.     9.52   0.     0.     0.  ]
 [  0.     0.     1.69   0.     0.  ]
 [  0.     0.     0.     1.02   0.  ]
 [  0.     0.     0.     0.     0.  ]]
   Matrix  Alien  StarWars  Casablanca  Titanic
0   -0.50  -0.62     -0.60       -0.06    -0.06
1    0.09  -0.05      0.11       -0.70    -0.70
2   -0.78   0.62      0.03       -0.07    -0.07
3    0.36   0.48     -0.79       -0.05    -0.05
4   -0.00  -0.00      0.00        0.71    -0.71


In [5]:
# Power
# The squared singular values are the eigenvalues of M^T M; their sum is
# the total "power" carried by all concepts together.
total_power = np.sum(sigma**2)

# Running share of the total power captured by the first k concepts.
fraction_power = np.cumsum(sigma**2) / total_power
fraction_power

array([ 0.66957013,  0.98637933,  0.99636316,  1.        ,  1.        ])

In [6]:
# Keep only top two concepts: the first two columns of U, the first two
# singular values, and the first two rows of VT.
U = U.iloc[:, :2]
sigma = sigma[:2]
VT = VT.iloc[:2, :]

# print(...) with a single argument works on both Python 2 and Python 3.
print(U)
print(sigma)
print(VT)

          0     1
Alice -0.21  0.02
Bob   -0.55  0.06
Cindy -0.50  0.07
Dan   -0.62  0.08
Emily -0.12 -0.60
Frank -0.04 -0.73
Greg  -0.06 -0.30
[ 13.84   9.52]
   Matrix  Alien  StarWars  Casablanca  Titanic
0   -0.50  -0.62     -0.60       -0.06    -0.06
1    0.09  -0.05      0.11       -0.70    -0.70


In [7]:
# Check the reconstruction: multiply the truncated factors back together
# and round to whole ratings so the result can be compared with M by eye.
(
    U.dot(np.diag(sigma))   # scale each concept column by its singular value
     .dot(VT)               # map the concepts back onto the movie columns
     .round()               # nearest whole rating
)

Unnamed: 0,Matrix,Alien,StarWars,Casablanca,Titanic
Alice,1.0,2.0,2.0,0.0,0.0
Bob,4.0,5.0,5.0,0.0,0.0
Cindy,4.0,4.0,4.0,-0.0,-0.0
Dan,4.0,5.0,5.0,-0.0,-0.0
Emily,0.0,1.0,0.0,4.0,4.0
Frank,-0.0,1.0,-0.0,5.0,5.0
Greg,0.0,1.0,0.0,2.0,2.0


## Queries

In [8]:
# Which movies are most similar to Matrix?
# Each column of VT holds one movie's coordinates in concept space. The
# cosine distance to Matrix's column is 1 - cosine similarity, so values
# near 0 mean "points the same way as Matrix".
from scipy.spatial.distance import cosine

matrix = VT['Matrix']
distances = []
for title in VT.columns:
    distances.append(cosine(matrix, VT[title]))
pd.Series(distances, index=movies)


Matrix        0.000000
Alien         0.033242
StarWars      0.000005
Casablanca    1.092455
Titanic       1.092455
dtype: float64

In [9]:
# Make recommendations for a new user.
# One row of ratings in the same column order as `movies`; 0 = not rated.
my_ratings = np.array([[5, 0, 4, 0, 3]])

# Translate to weighted concept space: project the ratings row onto each
# concept row of VT.
my_weighted_concept = my_ratings.dot(VT.T)

# Translate back to rating space: combine the concept rows using those
# weights to get a predicted score for every movie.
new_rating = my_weighted_concept.dot(VT)
new_rating

array([[ 2.4311,  3.2101,  2.9149,  1.1518,  1.1518]])

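Of the two movies I haven't rated yet (Alien and Casablanca), Alien has the higher predicted rating, so it looks like the best recommendation for a new movie for me to watch is Alien.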

#### Which user am I most similar to?

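Translate my ratings into the same concept space as the rows of $U$ by multiplying by $V \Sigma^{-1}$ _on the right_ (since $M = U \Sigma V^T$ implies $U = M V \Sigma^{-1}$).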

In [10]:
# Diagonal matrix of reciprocal singular values (Sigma^{-1}).
sigma_inv = np.diag(1/sigma)

# Translate to concept space: my_ratings . V . Sigma^{-1} puts my ratings in
# the same coordinates as the rows of U.
my_concept = my_ratings.dot(VT.T).dot(sigma_inv)

# Find distance to other users.
# my_concept is a single-row 2-D array, but scipy's cosine() is documented
# for 1-D vectors, so flatten it once before comparing it with each user's
# row of U.
my_vector = np.ravel(my_concept)
distances = [cosine(my_vector, row) for name, row in U.iterrows()]
pd.Series(distances, index=users)

Alice    0.090329
Bob      0.096108
Cindy    0.109543
Dan      0.104693
Emily    0.493822
Frank    0.621577
Greg     0.493822
dtype: float64

In [11]:
# Show what U.iterrows() yields: each user's row of U as a labelled Series.
# print(...) with a single argument works on both Python 2 and Python 3.
for name, row in U.iterrows():
    print(row)

0   -0.21
1    0.02
Name: Alice, dtype: float64
0   -0.55
1    0.06
Name: Bob, dtype: float64
0   -0.50
1    0.07
Name: Cindy, dtype: float64
0   -0.62
1    0.08
Name: Dan, dtype: float64
0   -0.12
1   -0.60
Name: Emily, dtype: float64
0   -0.04
1   -0.73
Name: Frank, dtype: float64
0   -0.06
1   -0.30
Name: Greg, dtype: float64
