`April 20, 2022`

# Cosine Similarity

\begin{align}
Cosine(x,y)= \frac{\sum_{i = 1}^{n}x_i y_i
}{\sqrt{\sum_{i=1}^{n}x_i^2} \sqrt{\sum_{i=1}^{n}y_i^2}}
\end{align}

> ## Cosine Similarity Illustration 1

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Cosine similarity between x = (1, 1, 1, 1) and y = (1, 1, 1, 0).
vec_x = [1, 1, 1, 1]
vec_y = [1, 1, 1, 0]

# Numerator: sum of element-wise products; denominator: product of the two norms.
dot_xy = sum(a * b for a, b in zip(vec_x, vec_y))
norm_x = np.sqrt(sum(a * a for a in vec_x))
norm_y = np.sqrt(sum(b * b for b in vec_y))

cosine = dot_xy / (norm_x * norm_y)
print(cosine)

0.8660254037844387


In [None]:
# Cosine similarity between x = (1, 1, 1, 1) and y = (0, 1, 1, 0).
vec_x = [1, 1, 1, 1]
vec_y = [0, 1, 1, 0]

# Numerator: sum of element-wise products; denominator: product of the two norms.
dot_xy = sum(a * b for a, b in zip(vec_x, vec_y))
norm_x = np.sqrt(sum(a * a for a in vec_x))
norm_y = np.sqrt(sum(b * b for b in vec_y))

cosine = dot_xy / (norm_x * norm_y)
print(cosine)

0.7071067811865475


In [None]:
# Cosine similarity between x = (1, 1, 1, 1) and y = (1, 0, 0, 0).
vec_x = [1, 1, 1, 1]
vec_y = [1, 0, 0, 0]

# Numerator: sum of element-wise products; denominator: product of the two norms.
dot_xy = sum(a * b for a, b in zip(vec_x, vec_y))
norm_x = np.sqrt(sum(a * a for a in vec_x))
norm_y = np.sqrt(sum(b * b for b in vec_y))

cosine = dot_xy / (norm_x * norm_y)
print(cosine)

0.5


> ## Cosine Similarity Illustration 2

In [None]:
# Cosine similarity between x = (4, 5, 5, 4) and y = (3, 5, 5, 0).
vec_x = [4, 5, 5, 4]
vec_y = [3, 5, 5, 0]

# Numerator: sum of element-wise products; denominator: product of the two norms.
dot_xy = sum(a * b for a, b in zip(vec_x, vec_y))
norm_x = np.sqrt(sum(a ** 2 for a in vec_x))
norm_y = np.sqrt(sum(b ** 2 for b in vec_y))

cosine = dot_xy / (norm_x * norm_y)
print(cosine)

0.891371527293353


In [None]:
# Cosine similarity between x = (4, 5, 5, 4) and y = (0, 5, 5, 0).
vec_x = [4, 5, 5, 4]
vec_y = [0, 5, 5, 0]

# Numerator: sum of element-wise products; denominator: product of the two norms.
dot_xy = sum(a * b for a, b in zip(vec_x, vec_y))
norm_x = np.sqrt(sum(a ** 2 for a in vec_x))
norm_y = np.sqrt(sum(b ** 2 for b in vec_y))

cosine = dot_xy / (norm_x * norm_y)
print(cosine)

0.7808688094430302


In [None]:
# Cosine similarity between x = (4, 5, 5, 4) and y = (5, 0, 0, 0).
#
# Both the dot product and the norms are derived from the same two vectors.
# Typing the terms out by hand had let y's first component be 5 in the
# numerator but 4 in y's norm, which printed 0.5522 instead of the true
# cosine 4 / sqrt(82) ~= 0.4417.
vec_x = [4, 5, 5, 4]
vec_y = [5, 0, 0, 0]

# Numerator: sum of element-wise products.
dot_xy = sum(a * b for a, b in zip(vec_x, vec_y))

# Denominator: product of the Euclidean norms of x and y.
norm_x = np.sqrt(sum(a ** 2 for a in vec_x))
norm_y = np.sqrt(sum(b ** 2 for b in vec_y))

cosine = dot_xy / (norm_x * norm_y)
print(cosine)

0.5521576303742327


# Content Based Filtering

> ## Content Based Filtering One User

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Movie titles and the score attached to each one.
movies = ["Terminator 2", "Interstellar", "Ant Man 2", "3 Idiots"]
scores = [7, 9, 8, 9]

# One 0/1 flag list per genre, aligned position-by-position with `movies`.
action, scifi, adventure, comedy, drama = (
    [1, 0, 1, 0],
    [1, 1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 1, 1],
    [0, 1, 0, 1],
)

# Pair each column label with its values; dict order fixes the column order.
movie_columns = dict(
    zip(
        ['movie', 'scores', 'Action', 'Sci-Fi', 'Adventure', 'Comedy', 'Drama'],
        [movies, scores, action, scifi, adventure, comedy, drama],
    )
)
df_movies = pd.DataFrame(movie_columns)

df_movies

Unnamed: 0,movie,scores,Action,Sci-Fi,Adventure,Comedy,Drama
0,Terminator 2,7,1,1,0,0,0
1,Interstellar,9,0,1,1,0,1
2,Ant Man 2,8,1,1,1,1,0
3,3 Idiots,9,0,0,0,1,1


In [None]:
genre_columns = ['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'Drama']

# Keep only the genre flags and scale each row by that movie's score,
# leaving df_movies itself untouched.
df_movies2 = df_movies[genre_columns].mul(df_movies['scores'], axis=0)

# Per-genre share of the total score mass; the shares sum to 1.
genre_totals = df_movies2.sum()
movie_scoring = genre_totals / genre_totals.sum()
movie_scoring

Action       0.164835
Sci-Fi       0.263736
Adventure    0.186813
Comedy       0.186813
Drama        0.197802
dtype: float64

In [None]:
# Titles to be scored for recommendation.
movies = ["Titanic", 'Martian', 'GOTG Vol 2']

# One 0/1 flag list per genre, aligned position-by-position with `movies`.
action, scifi, adventure, comedy, drama = (
    [1, 0, 1],
    [1, 1, 1],
    [0, 1, 1],
    [0, 0, 1],
    [0, 1, 0],
)

# Pair each column label with its values; dict order fixes the column order.
recommendation_columns = dict(
    zip(
        ['movie', 'Action', 'Sci-Fi', 'Adventure', 'Comedy', 'Drama'],
        [movies, action, scifi, adventure, comedy, drama],
    )
)
df_movies_recommendation = pd.DataFrame(recommendation_columns)

df_movies_recommendation

Unnamed: 0,movie,Action,Sci-Fi,Adventure,Comedy,Drama
0,Titanic,1,1,0,0,0
1,Martian,0,1,1,0,1
2,GOTG Vol 2,1,1,1,1,0


In [None]:
# Display the per-genre weights again; the next cell multiplies the genre
# columns of df_movies_recommendation by these values.
movie_scoring

Action       0.164835
Sci-Fi       0.263736
Adventure    0.186813
Comedy       0.186813
Drama        0.197802
dtype: float64

In [None]:
genre_columns = ['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'Drama']

# Replace each genre flag with flag * user's weight for that genre.
# The Series is aligned to the frame's columns by label.
# NOTE: this overwrites the flags in place, so running the cell a second time
# applies the weights twice; rebuild df_movies_recommendation before re-running.
df_movies_recommendation[genre_columns] = df_movies_recommendation[genre_columns].mul(
    movie_scoring[genre_columns], axis='columns'
)

# Sum over the genre columns only. A bare `.sum(axis=1)` would also include the
# string 'movie' column, which raises TypeError on pandas >= 2.0 where
# numeric_only defaults to False. On a re-run it would also add the previous
# 'movie rating prediction' into the new total.
df_movies_recommendation['movie rating prediction'] = (
    df_movies_recommendation[genre_columns].sum(axis=1)
)
df_movies_recommendation

Unnamed: 0,movie,Action,Sci-Fi,Adventure,Comedy,Drama,movie rating prediction
0,Titanic,0.164835,0.263736,0.0,0.0,0.0,0.428571
1,Martian,0.0,0.263736,0.186813,0.0,0.197802,0.648352
2,GOTG Vol 2,0.164835,0.263736,0.186813,0.186813,0.0,0.802198


Recommendation Order for the user:
- GOTG Vol 2
- Martian
- Titanic

> ## Content Based Filtering Multiple Users

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Catalogue of movies, one row per title.
movies = ["Terminator 2", "Interstellar", "Ant Man 2", "3 Idiots"]

# One 0/1 flag list per genre, aligned position-by-position with `movies`.
action, scifi, adventure, comedy, drama = (
    [1, 0, 1, 0],
    [1, 1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 1, 1],
    [0, 1, 0, 1],
)

# Pair each column label with its values; dict order fixes the column order.
item_feature_columns = dict(
    zip(
        ['movie', 'Action', 'Sci-Fi', 'Adventure', 'Comedy', 'Drama'],
        [movies, action, scifi, adventure, comedy, drama],
    )
)
df_item_features = pd.DataFrame(item_feature_columns)

df_item_features

Unnamed: 0,movie,Action,Sci-Fi,Adventure,Comedy,Drama
0,Terminator 2,1,1,0,0,0
1,Interstellar,0,1,1,0,1
2,Ant Man 2,1,1,1,1,0
3,3 Idiots,0,0,0,1,1


In [None]:
user = ['user 1', 'user 2', 'user 3', 'user 4']

# One list per movie, holding one value per entry of `user` (same order).
terminator_2, interstellar, ant_man_2, three_idiots = (
    [7, 8, 9, 0],
    [9, 0, 0, 7],
    [8, 6, 0, 0],
    [9, 5, 10, 9],
)

# Pair each column label with its values; dict order fixes the column order.
user_item_columns = dict(
    zip(
        ['user', 'Terminator 2', 'Interstellar', 'Ant Man 2', '3 Idiots'],
        [user, terminator_2, interstellar, ant_man_2, three_idiots],
    )
)
df_user_items = pd.DataFrame(user_item_columns)

df_user_items

Unnamed: 0,user,Terminator 2,Interstellar,Ant Man 2,3 Idiots
0,user 1,7,9,8,9
1,user 2,8,0,6,5
2,user 3,9,0,0,10
3,user 4,0,7,0,9


In [None]:
# Strip the label columns so only the numeric matrices remain.
arr_user_items = df_user_items.drop(columns='user').to_numpy()
arr_item_features = df_item_features.drop(columns='movie').to_numpy()

n_user, n_item = arr_user_items.shape
n_feature = arr_item_features.shape[1]

# Uninitialised (n_user, n_item) buffer; nothing in this cell writes to it.
arr_user_items_score = np.empty((n_user, n_item))

# (users x items) @ (items x features): each row accumulates a user's
# values per genre across all items.
genre_totals = arr_user_items @ arr_item_features

# Normalise every row by its own sum so each user's genre weights add up to 1.
arr_user_feature = genre_totals / genre_totals.sum(axis=1, keepdims=True)

In [None]:
# Label the weight matrix: rows by user, columns by every item-feature
# column except the first one ('movie').
df_user_feature = pd.DataFrame(
    arr_user_feature,
    index=user,
    columns=df_item_features.columns[1:],
)
df_user_feature

Unnamed: 0,Action,Sci-Fi,Adventure,Comedy,Drama
user 1,0.164835,0.263736,0.186813,0.186813,0.197802
user 2,0.28,0.28,0.12,0.22,0.1
user 3,0.236842,0.236842,0.0,0.263158,0.263158
user 4,0.0,0.179487,0.179487,0.230769,0.410256


In [None]:
# Fill the score buffer one user at a time: a movie's score is the dot product
# of its genre flags with that user's genre weights.
for score_row, genre_weights in zip(arr_user_items_score, arr_user_feature):
  score_row[:] = np.matmul(arr_item_features, genre_weights)

# Keep a score only where the stored user-item value is 0; every other entry becomes 0.
is_zero_rating = arr_user_items == 0
arr_user_items_score_unwatched = np.where(is_zero_rating, arr_user_items_score, 0)

df_user_items_score_unwatched = pd.DataFrame(
    arr_user_items_score_unwatched,
    index=user,
    columns=movies,
)
df_user_items_score_unwatched

Unnamed: 0,Terminator 2,Interstellar,Ant Man 2,3 Idiots
user 1,0.0,0.0,0.0,0.0
user 2,0.0,0.5,0.0,0.0
user 3,0.0,0.5,0.736842,0.0
user 4,0.179487,0.0,0.589744,0.0


Recommendation Order for unwatched movies
- User 1 : none (no movie left with a rating of 0)
- User 2 : Interstellar
- User 3 : Ant Man 2, Interstellar
- User 4 : Ant Man 2, Terminator 2


In [None]:
# Titles to be scored for every user.
movies = ["Titanic", "Martian", "GOTG Vol 2"]

# One 0/1 flag list per genre, aligned position-by-position with `movies`.
action, scifi, adventure, comedy, drama = (
    [1, 0, 1],
    [1, 1, 1],
    [0, 1, 1],
    [0, 0, 1],
    [0, 1, 0],
)

# Pair each column label with its values; dict order fixes the column order.
new_item_feature_columns = dict(
    zip(
        ['movie', 'Action', 'Sci-Fi', 'Adventure', 'Comedy', 'Drama'],
        [movies, action, scifi, adventure, comedy, drama],
    )
)
df_item_features_new = pd.DataFrame(new_item_feature_columns)

df_item_features_new

Unnamed: 0,movie,Action,Sci-Fi,Adventure,Comedy,Drama
0,Titanic,1,1,0,0,0
1,Martian,0,1,1,0,1
2,GOTG Vol 2,1,1,1,1,0


In [None]:
# Numeric genre matrix of the new titles (label column removed).
arr_item_features_new = df_item_features_new.drop(columns='movie').to_numpy()

n_item_new = len(df_item_features_new)

# One row per user, one column per new title; filled row by row below.
arr_user_items_score_new = np.empty((n_user, n_item_new))
for score_row, genre_weights in zip(arr_user_items_score_new, arr_user_feature):
  score_row[:] = np.matmul(arr_item_features_new, genre_weights)

# Building the column index from the 'movie' Series keeps 'movie' as its name.
df_user_items_score_new = pd.DataFrame(
    arr_user_items_score_new,
    index=user,
    columns=pd.Index(df_item_features_new['movie']),
)
df_user_items_score_new

movie,Titanic,Martian,GOTG Vol 2
user 1,0.428571,0.648352,0.802198
user 2,0.56,0.5,0.9
user 3,0.473684,0.5,0.736842
user 4,0.179487,0.769231,0.589744


Recommendation Order
- User 1 : GOTG Vol 2, Martian, Titanic
- User 2 : GOTG Vol 2, Titanic, Martian
- User 3 : GOTG Vol 2, Martian, Titanic
- User 4 : Martian, GOTG Vol 2, Titanic