# Content Based Filtering

In [1]:
import math
import numpy as np
import pandas as pd
import sys
sys.path.append('scripts')

%run "./scripts/liblecture.py"
from scripts.liblecture import *

In [2]:
np.set_printoptions(precision=2, linewidth=120)

Create a binary valued matrix (weight matrix) that holds the item's genre occurence in each cell.

In [3]:
matrix = np.array([
    [0, 1, 0, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 0, 0, 0, 1],
    [0, 1, 0, 0, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 1, 0, 0, 0, 0],
    [0, 0, 0, 1, 1, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 1, 1, 0, 0, 0],
    [0, 0, 0, 0, 0, 1, 1, 1],
    [0, 0, 1, 1, 0, 0, 0, 0],
    [0, 1, 0, 1, 1, 0, 0, 0]
])
print("Items :",matrix.shape[0])
print("Genres:",matrix.shape[1])

#14:
Items : 11


#15:
Genres: 8




We will compute weights by TF-IDF (Term Frequency - Inverse Document Frequency) scheme.
* We will use 1 for TF of each genre in items.
* We will compute IDFs for genres and assign them in column-wise manner.

$$ \text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t) $$ 
where:

- $ \text{tf}(t, d) $ represents the Term Frequency of a term $t$ in a document $d$, which is the frequency of term $t$ occurring in document $d$. It can be calculated using various methods such as raw term frequency, logarithmic term frequency, or augmented term frequency.

- $\text{idf}(t)$ represents the Inverse Document Frequency of a term $t$, which measures the importance of term $t$ in the entire collection of documents. It can be calculated as:

$$ \text{idf}(t) = \log \left( \frac{df(t)}{N} \right) $$

where $N$ is the total number of documents in the collection, and $df(t)$ is the document frequency of term $t$, which is the number of documents that contain term $t$. The $\log$ function is usually taken with base 10 or natural logarithm (base $e$).

Create a matrix that will hold weights for all items.

In [4]:
totalItems = matrix.shape[0]
totalGenres = matrix.shape[1]

weights = np.zeros(matrix.shape)

for i in range(0, totalGenres):
    col = matrix[:,i]
    idf = math.log10(totalItems/col.sum())
    for j in range(0, totalItems):
        weights[j, i] = matrix[j, i] * idf

See the weights.

In [5]:
weights

array([[0.  , 0.44, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.74, 0.  , 0.74, 0.  , 0.  , 0.  , 0.  , 0.74],
       [0.  , 0.44, 0.  , 0.  , 0.44, 0.  , 0.  , 0.  ],
       [0.74, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.34, 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.34, 0.44, 0.  , 0.  , 0.  ],
       [0.  , 0.44, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.34, 0.44, 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 1.04, 1.04, 0.74],
       [0.  , 0.  , 0.74, 0.34, 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.44, 0.  , 0.34, 0.44, 0.  , 0.  , 0.  ]])

### $l_2$-norm
Let's define $norm_2$ function for computing the $l_2$ norm of a vector, which is represented as an array in np.

$$norm_2(v)=||v||_2=\sqrt{\sum_{\forall i}v_i^2}$$

In [6]:
def norm2(arr):
    sum = 0.0
    for i in range(0, len(arr)):
        sum += arr[i] * arr[i]
    return math.sqrt(sum)

In [7]:
print(weights[0],"=>",norm2(weights[0]))
print(weights[1],"=>",norm2(weights[1]))

#1:
[0.   0.44 0.   0.   0.   0.   0.   0.  ] => 0.43933269383026263


#2:
[0.74 0.   0.74 0.   0.   0.   0.   0.74] => 1.282345794232371




### Inner Product

Let's define dot function for computing the inner product between two vectors.

$$dot(u, v)=u\cdot v=\sum_{\forall i}{u_i \times v_i}$$

In [8]:
def dot(arr1, arr2):
    sum = 0.0
    for i in range(0, len(arr1)):
        sum += arr1[i] * arr2[i]
    return sum

In [9]:
print(dot(weights[0], weights[1]))
print(dot(weights[0], weights[2]))
print(dot(weights[0], weights[3]))

#1:
0.0


#2:
0.19301321586815529


#3:
0.0




### Cosine Similarity

Let's define the cosine similarity function for two vectors.

$$ cosine(u, v)=\frac{dot(u,v)}{norm2(u)norm2(v)}=\frac{u\cdot v}{||u||_2||v||_2} $$

In [10]:
def cosine(arr1, arr2):
    return dot(arr1, arr2)/(norm2(arr1)*norm2(arr2))

In [11]:
print(cosine(weights[0], weights[1]))
print(cosine(weights[0], weights[2]))
print(cosine(weights[0], weights[3]))

#1:
0.0


#2:
0.7071067811865475


#3:
0.0




### Pearson Similarity

Let's define the pearson similarity function for two vectors.

$$ pearson(u, v) = \frac{cov(u, v)}{std(u) \cdot std(v)} $$

In [12]:
def pearson(u, v):
    covariance = np.cov(u, v)[0][1]
    
    std_u = np.std(u)
    std_v = np.std(v)
    
    correlation = covariance / (std_u * std_v) if (std_u * std_v) != 0 else 0
    
    return correlation

In [13]:
print(pearson(weights[0], weights[1]))
print(pearson(weights[0], weights[2]))
print(pearson(weights[0], weights[3]))

#1:
-0.33459431072521134


#2:
0.7481756236662594


#3:
-0.16326530612244894




Or, we can do this more conveniently using the *numpy* library

In [14]:
from numpy import linalg as LA

In [15]:
norms = LA.norm(weights, ord=2, axis=1)
norms

array([0.44, 1.28, 0.62, 0.74, 0.34, 0.56, 0.44, 0.56, 1.65, 0.82, 0.71])

In [16]:
dots = np.matmul(weights, weights.T)
dots

array([[0.19, 0.  , 0.19, 0.  , 0.  , 0.  , 0.19, 0.  , 0.  , 0.  , 0.19],
       [0.  , 1.64, 0.  , 0.55, 0.  , 0.  , 0.  , 0.  , 0.55, 0.55, 0.  ],
       [0.19, 0.  , 0.39, 0.  , 0.  , 0.19, 0.19, 0.19, 0.  , 0.  , 0.39],
       [0.  , 0.55, 0.  , 0.55, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.12, 0.12, 0.  , 0.12, 0.  , 0.12, 0.12],
       [0.  , 0.  , 0.19, 0.  , 0.12, 0.31, 0.  , 0.31, 0.  , 0.12, 0.31],
       [0.19, 0.  , 0.19, 0.  , 0.  , 0.  , 0.19, 0.  , 0.  , 0.  , 0.19],
       [0.  , 0.  , 0.19, 0.  , 0.12, 0.31, 0.  , 0.31, 0.  , 0.12, 0.31],
       [0.  , 0.55, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 2.72, 0.  , 0.  ],
       [0.  , 0.55, 0.  , 0.  , 0.12, 0.12, 0.  , 0.12, 0.  , 0.67, 0.12],
       [0.19, 0.  , 0.39, 0.  , 0.12, 0.31, 0.19, 0.31, 0.  , 0.12, 0.5 ]])

In [17]:
sims = np.divide(np.divide(dots, norms).T, norms)
sims

array([[1.  , 0.  , 0.71, 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.62],
       [0.  , 1.  , 0.  , 0.58, 0.  , 0.  , 0.  , 0.  , 0.26, 0.52, 0.  ],
       [0.71, 0.  , 1.  , 0.  , 0.  , 0.56, 0.71, 0.56, 0.  , 0.  , 0.88],
       [0.  , 0.58, 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.  , 0.61, 0.  , 0.61, 0.  , 0.42, 0.48],
       [0.  , 0.  , 0.56, 0.  , 0.61, 1.  , 0.  , 1.  , 0.  , 0.26, 0.79],
       [1.  , 0.  , 0.71, 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.62],
       [0.  , 0.  , 0.56, 0.  , 0.61, 1.  , 0.  , 1.  , 0.  , 0.26, 0.79],
       [0.  , 0.26, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ],
       [0.  , 0.52, 0.  , 0.  , 0.42, 0.26, 0.  , 0.26, 0.  , 1.  , 0.2 ],
       [0.62, 0.  , 0.88, 0.  , 0.48, 0.79, 0.62, 0.79, 0.  , 0.2 , 1.  ]])

### Movies Weight Matrix on Genres

Read movie metadata from a csv file.

In [18]:
movies = pd.read_csv('data/movies_w_imgurl.csv')

Split genres and stack genres into one column.

In [19]:
movieGenres = pd.DataFrame(data=movies['genres'].str.split('|').apply(pd.Series, 1).stack(), columns=['genre'])
movieGenres.index = movieGenres.index.droplevel(1)

Count movies that have each genre and then compute IDF of genres.

In [20]:
genres = pd.DataFrame(data = movieGenres.groupby('genre')['genre'].count())
genres.columns = ['movieCount']
totalItems = movies.shape[0]
genres['idf'] = genres['movieCount'].apply(lambda x: math.log10(totalItems/x))

Join genre's IDF to movie genre DataFrame.

In [21]:
movieGenreWeights = movieGenres.join(genres['idf'], on='genre')

In [22]:
movieWeights = movies[['movieId']]
for genre in genres.index:
    movieWeights = movieWeights.join(movieGenreWeights[movieGenreWeights['genre'] == genre][['idf']].rename(columns={'idf':genre}))
movieWeights.fillna(0, inplace=True)

movieWeights

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.0,0.00,0.91,1.31,1.19,0.44,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
1,2,0.0,0.00,0.91,0.00,1.19,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
2,3,0.0,0.00,0.00,0.00,0.00,0.44,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.77,0.00,0.0,0.0,0.0
3,4,0.0,0.00,0.00,0.00,0.00,0.44,0.0,0.00,0.32,...,0.0,0.0,0.0,0.0,0.0,0.77,0.00,0.0,0.0,0.0
4,5,0.0,0.00,0.00,0.00,0.00,0.44,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,162672,0.0,0.00,0.91,0.00,0.00,0.00,0.0,0.00,0.32,...,0.0,0.0,0.0,0.0,0.0,0.77,0.00,0.0,0.0,0.0
9121,163056,0.0,0.77,0.91,0.00,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,1.06,0.0,0.0,0.0
9122,163949,0.0,0.00,0.00,0.00,0.00,0.00,0.0,1.27,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
9123,164977,0.0,0.00,0.00,0.00,0.00,0.44,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0


### Movie-Movie Cosine Similarity Matrix

Compute $l_2$-norm of movies.

In [23]:
movieNorms = pd.DataFrame(data=LA.norm(movieWeights.iloc[:,1:].values, ord=2, axis=1), index=movieWeights.index, columns=['norm2'])

Normalize movie vector so that similarity can be computed simply by inner product between vectors.

$$ cosine(u, v)=\frac{\sum_{\forall i}{u_i v_i}}{||u||_2||v||_2}=\sum_{\forall i}{\frac{u_i v_i}{||u||_2||v||_2}}=\sum_{\forall i}{\frac{u_i}{||u||_2}\frac{v_i}{||v||_2}}=u'\cdot v'$$

In [24]:
normalizedMovieWeights = pd.DataFrame(index=movieWeights.index)
norms = movieNorms['norm2']
for genre in genres.index:
    normalizedMovieWeights[genre] = movieWeights[genre].divide(norms)
normalizedMovieWeights

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.00,0.39,0.56,0.51,0.19,0.0,0.0,0.00,0.49,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
1,0.0,0.00,0.48,0.00,0.63,0.00,0.0,0.0,0.00,0.61,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
2,0.0,0.00,0.00,0.00,0.00,0.50,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.87,0.00,0.0,0.0,0.0
3,0.0,0.00,0.00,0.00,0.00,0.47,0.0,0.0,0.34,0.00,0.0,0.0,0.0,0.0,0.0,0.82,0.00,0.0,0.0,0.0
4,0.0,0.00,0.00,0.00,0.00,1.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,0.0,0.00,0.74,0.00,0.00,0.00,0.0,0.0,0.26,0.00,0.0,0.0,0.0,0.0,0.0,0.62,0.00,0.0,0.0,0.0
9121,0.0,0.39,0.46,0.00,0.00,0.00,0.0,0.0,0.00,0.58,0.0,0.0,0.0,0.0,0.0,0.00,0.54,0.0,0.0,0.0
9122,0.0,0.00,0.00,0.00,0.00,0.00,0.0,1.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0
9123,0.0,0.00,0.00,0.00,0.00,1.00,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0


In [25]:
np.isnan(normalizedMovieWeights).any()

(no genres listed)    False
Action                False
Adventure             False
Animation             False
Children              False
Comedy                False
Crime                 False
Documentary           False
Drama                 False
Fantasy               False
Film-Noir             False
Horror                False
IMAX                  False
Musical               False
Mystery               False
Romance               False
Sci-Fi                False
Thriller              False
War                   False
Western               False
dtype: bool

In [26]:
np.matmul(normalizedMovieWeights, normalizedMovieWeights.T)

  np.matmul(normalizedMovieWeights, normalizedMovieWeights.T)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9115,9116,9117,9118,9119,9120,9121,9122,9123,9124
0,1.00,0.81,0.09,0.09,0.19,0.00,0.09,0.64,0.00,0.25,...,0.00,0.19,0.00,0.00,0.00,0.29,0.47,0.0,0.19,0.0
1,0.81,1.00,0.00,0.00,0.00,0.00,0.00,0.80,0.00,0.32,...,0.00,0.23,0.00,0.00,0.00,0.36,0.58,0.0,0.00,0.0
2,0.09,0.00,1.00,0.94,0.50,0.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.63,0.54,0.00,0.0,0.50,0.0
3,0.09,0.00,0.94,1.00,0.47,0.00,0.94,0.00,0.00,0.00,...,0.08,0.00,0.34,0.34,0.60,0.60,0.00,0.0,0.47,0.0
4,0.19,0.00,0.50,0.47,1.00,0.00,0.50,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,1.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9120,0.29,0.36,0.54,0.60,0.00,0.00,0.54,0.45,0.00,0.48,...,0.06,0.36,0.26,0.26,0.46,1.00,0.34,0.0,0.00,0.0
9121,0.47,0.58,0.00,0.00,0.00,0.22,0.00,0.28,0.39,0.52,...,0.00,0.69,0.00,0.00,0.00,0.34,1.00,0.0,0.00,0.0
9122,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.00,1.0
9123,0.19,0.00,0.50,0.47,1.00,0.00,0.50,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,1.00,0.0


Create item-item similarity matrix

In [27]:
sims = pd.DataFrame(np.matmul(normalizedMovieWeights, normalizedMovieWeights.T))
sims.index = movieWeights['movieId']
sims.columns = movieWeights['movieId']
sims

  sims = pd.DataFrame(np.matmul(normalizedMovieWeights, normalizedMovieWeights.T))


movieId,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.00,0.81,0.09,0.09,0.19,0.00,0.09,0.64,0.00,0.25,...,0.00,0.19,0.00,0.00,0.00,0.29,0.47,0.0,0.19,0.0
2,0.81,1.00,0.00,0.00,0.00,0.00,0.00,0.80,0.00,0.32,...,0.00,0.23,0.00,0.00,0.00,0.36,0.58,0.0,0.00,0.0
3,0.09,0.00,1.00,0.94,0.50,0.00,1.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.63,0.54,0.00,0.0,0.50,0.0
4,0.09,0.00,0.94,1.00,0.47,0.00,0.94,0.00,0.00,0.00,...,0.08,0.00,0.34,0.34,0.60,0.60,0.00,0.0,0.47,0.0
5,0.19,0.00,0.50,0.47,1.00,0.00,0.50,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,1.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162672,0.29,0.36,0.54,0.60,0.00,0.00,0.54,0.45,0.00,0.48,...,0.06,0.36,0.26,0.26,0.46,1.00,0.34,0.0,0.00,0.0
163056,0.47,0.58,0.00,0.00,0.00,0.22,0.00,0.28,0.39,0.52,...,0.00,0.69,0.00,0.00,0.00,0.34,1.00,0.0,0.00,0.0
163949,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.00,1.0
164977,0.19,0.00,0.50,0.47,1.00,0.00,0.50,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,1.00,0.0


### Recommend Movies based on Predicted Ratings

In [28]:
displayMovies(movies, [5, 14, 32])

Read ratings as train and test datasets.

In [29]:
ratings = pd.read_csv('data/ratings-9_1.csv')
train = ratings[ratings['type'] == 'train'][['userId', 'movieId', 'rating']]
test = ratings[ratings['type'] == 'test'][['userId', 'movieId', 'rating']]

Recommend items for test users.

In [30]:
users = [33, 39, 77, 144, 238]
userId = 33
userRatings = train[train['userId'] == userId][['movieId', 'rating']]

In [31]:
topRatings = userRatings.sort_values(by='rating', ascending=False).head(5)
displayMovies(movies, topRatings['movieId'].values, topRatings['rating'].values)

In [32]:
recSimSums = sims.loc[userRatings['movieId'].values, :].sum().values
recSimSums = recSimSums+1
recWeightedRatingSums = np.matmul(sims.loc[userRatings['movieId'].values, :].T.values, userRatings['rating'].values)
recItemRatings = pd.DataFrame(data=np.divide(recWeightedRatingSums, recSimSums), index=sims.index)
recItemRatings.columns = ['prediction']

In [33]:
displayMovies(movies, recItemRatings.sort_values(by='prediction', ascending=False).head(30).index)

Compute MAE and RMSE for the test user.

In [34]:
userTestRatings = pd.DataFrame(data=test[test['userId'] == userId])
temp = userTestRatings.join(recItemRatings.loc[userTestRatings['movieId']], on='movieId')
mae = (temp['rating'] - temp['prediction']).abs().mean()
rmse = math.sqrt((temp['rating'] - temp['prediction']).pow(2).mean())
print(" MAE:", mae)
print("RMSE:", rmse)

#5:
 MAE: 0.968152436040529


#6:
RMSE: 1.1346645551383496


