# Content-based Recommendations with PCA
## Similar movies have similar tags. How well is this similarity captured with PCA?

In [None]:
# Load the tag-relevance table: the first CSV line supplies the column names
# and Spark infers each column's type from the data.
df = (sqlContext.read
      .options(header=True, inferSchema=True)
      .csv('data/movielens-tag-relevance.csv'))

In [None]:
import random

# Preview the title next to four randomly chosen tag columns.
# random.sample draws distinct columns, whereas random.choice could repeat a
# column or pick 'title' a second time.
tagCols = [c for c in df.columns if c != 'title']
colsToShow = ['title'] + random.sample(tagCols, min(4, len(tagCols)))

# Backtick-quote every name: Spark otherwise reads a '.' inside a column name
# as nested-field access and the select fails for such columns.
quotedCols = ['`' + c.replace('`', '``') + '`' for c in colsToShow]
df.select(*quotedCols).show()

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [None]:
# Spark treats '.' in a column name as nested-field access, so swap it for '_'.
# Names without a '.' come back unchanged from str.replace.
newCols = [c.replace('.', '_') for c in df.columns]

# Rename all columns positionally in a single projection. Calling
# withColumnRenamed once per column would add one projection to the query
# plan for every renamed column.
df = df.toDF(*newCols)

In [None]:
# Every column except the title is packed into the feature vector.
featureCols = [c for c in newCols if c != 'title']

assembler = VectorAssembler(inputCols=featureCols, outputCol='features')
# Centre each feature (withMean) and scale it to unit standard deviation
# (withStd, the StandardScaler default) before PCA.
scaler = StandardScaler(inputCol='features', outputCol='normFeats',
                        withMean=True, withStd=True)

# Assemble first: the scaler is fitted on the assembled 'features' column.
df = assembler.transform(df)
scalerModel = scaler.fit(df)
df = scalerModel.transform(df)

## PCA

In [None]:
# Drop down to the RDD API, keeping only the standardised feature vectors.
rdd = df.select(df['normFeats']).rdd

In [None]:
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors

In [None]:
# Turn every element of the RDD into an mllib dense vector, the row type
# handed to RowMatrix below.
# NOTE(review): each element is a single-field Row wrapping the 'normFeats'
# vector rather than a bare sequence of floats — confirm Vectors.dense
# unwraps it as intended.
vectors = rdd.map(Vectors.dense)

In [None]:
# Wrap the vectors as a distributed row-oriented matrix (one row per movie).
matrix = RowMatrix(rows=vectors)

## Get the PCs

In [None]:
# Number of leading principal components to keep.
numComponents = 500

# Compute the principal components, then project every row onto them.
pc = matrix.computePrincipalComponents(numComponents)
matrix_reduced = matrix.multiply(pc)

## Nearest Neighbour Search in PC space

In [None]:
import numpy as np

# Bring the projected rows to the driver as numpy arrays, then stack them
# into a single array (rows = movies, columns = principal components).
reducedRows = matrix_reduced.rows.map(np.array).collect()
X = np.array(reducedRows)

In [None]:
# Collect the movie titles to the driver as a pandas DataFrame.
# NOTE(review): these titles are later paired with the rows of X purely by
# position — assumes both are collected in the same row order; confirm.
titles = df.select(df['title']).toPandas()

In [None]:
import pandas as pd

# Label each row of principal-component coordinates with its movie title.
pdf = pd.DataFrame(data=X, index=titles['title'])

In [None]:
# Peek at the first five movies in principal-component space.
pdf.head(n=5)

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Nearest neighbours of Toy Story using only the first n_pcs components.
n_pcs = 2

# The query point is Toy Story's own coordinates, shaped as a single-row matrix.
query = pdf.loc['Toy Story (1995)'].values[:n_pcs].reshape(1, -1)

nn = NearestNeighbors().fit(X[:, :n_pcs])
neighbors = nn.kneighbors(query, return_distance=False)

# Map the neighbour row positions back to movie titles.
pdf.index[neighbors[0]].tolist()

<table>
    <tr>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/1/13/Toy_Story.jpg" alt="Toy Story"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/c/cf/Poster_for_Dirty_Laundry.jpg" alt="Dirty Laundry"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/e/ef/Empire_of_Dreams.png" alt="Empire of Dreams"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/df/The_Ten_Commandments_%281956_film_poster%29.jpg/313px-The_Ten_Commandments_%281956_film_poster%29.jpg" alt="The Ten Commandments (1956)"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/2/2a/Bloodofheroes.jpg" alt="Blood of Heroes"></td>
    </tr>
</table>

## Increase the number of Principal Components

In [None]:
# Repeat the neighbour search with the first ten principal components.
n_pcs = 10

# The query point is Toy Story's own coordinates, shaped as a single-row matrix.
query = pdf.loc['Toy Story (1995)'].values[:n_pcs].reshape(1, -1)

nn = NearestNeighbors().fit(X[:, :n_pcs])
neighbors = nn.kneighbors(query, return_distance=False)

# Map the neighbour row positions back to movie titles.
pdf.index[neighbors[0]].tolist()

<table>
    <tr>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/1/13/Toy_Story.jpg" alt="Toy Story"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/2/29/Finding_Nemo.jpg" alt="Finding Nemo"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/6/63/Monsters_Inc.JPG" alt="Monsters, Inc."></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/c/cc/A_Bug%27s_Life.jpg" alt="A Bug's Life"></td>
        <td><img src="https://upload.wikimedia.org/wikipedia/en/5/50/RatatouillePoster.jpg" alt="Ratatouille"></td>
    </tr>
</table>

In [None]:
# Repeat the neighbour search with the first hundred principal components.
n_pcs = 100

# The query point is Toy Story's own coordinates, shaped as a single-row matrix.
query = pdf.loc['Toy Story (1995)'].values[:n_pcs].reshape(1, -1)

nn = NearestNeighbors().fit(X[:, :n_pcs])
neighbors = nn.kneighbors(query, return_distance=False)

# Map the neighbour row positions back to movie titles.
pdf.index[neighbors[0]].tolist()

In [None]:
# Repeat the neighbour search with the first 500 principal components.
n_pcs = 500

# The query point is Toy Story's own coordinates, shaped as a single-row matrix.
query = pdf.loc['Toy Story (1995)'].values[:n_pcs].reshape(1, -1)

nn = NearestNeighbors().fit(X[:, :n_pcs])
neighbors = nn.kneighbors(query, return_distance=False)

# Map the neighbour row positions back to movie titles.
pdf.index[neighbors[0]].tolist()