# Cosine Similarity

This algorithm measures how similar two entities are (user-user or item-item).

In [1]:
import pandas as pd
import numpy as np

## Model Explanation

In [8]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

In [20]:
# Sample cos(x) on [-pi, pi].
x = np.linspace(-np.pi, np.pi, 100)
y = np.cos(x)

# `width`/`height` are used instead of `plot_width`/`plot_height`: the latter
# were removed in Bokeh 3.0, while `width`/`height` are accepted by figure()
# on older releases as well.
p = figure(width=600, height=300, title='Cosine of Unit circle')
p.line(x=x, y=y)

# Restrict the view to [-pi/2, pi/2] and start the y axis at 0, i.e. the part
# of the curve where the cosine is non-negative.
p.y_range.start = 0.
p.x_range.start = -np.pi / 2
p.x_range.end = np.pi / 2
show(p)

## Apply Cosine to 2D-point

\begin{equation}
cos(A - B) = cos(A)cos(B) + sin(A)sin(B)\\
= (\frac{A_x}{\sqrt{A_x ^ 2 + A_y ^ 2}} * \frac{B_x}{\sqrt{B_x ^ 2 + B_y ^ 2}}) + (\frac{A_y}{\sqrt{A_x ^ 2 + A_y ^ 2}} * \frac{B_y}{\sqrt{B_x ^ 2 + B_y ^ 2}}) \\
= \frac{A_x B_x + A_y B_y}{\sqrt{A_x ^ 2 + A_y ^ 2}{\sqrt{B_x ^ 2 + B_y ^ 2}}}
\end{equation}

## Cosine of vector

\begin{equation}
\cos(V_A, V_B) = \frac{\sum_{i}{A_i B_i}}{\sqrt{\sum_{i}{A_i ^ 2}}\sqrt{\sum_{i}{B_i ^ 2}}} \\
\text{where}\; A_i \;\text{is the i-th member of}\; V_A \;\text{and}\; B_i \;\text{is the i-th member of}\; V_B
\end{equation}

In [3]:
# Load the follower/member pairs and tag each pair with a constant `count`
# of 1, which serves as the unary "rating" value.
ratings = pd.read_csv('../ratings.csv').assign(count=1)

# One row per follower, one column per member; pairs absent from the CSV
# come out of the pivot as NaN and are filled with 0.
rating_matrix = (
    ratings
    .pivot(index='follower_id', columns='member', values='count')
    .fillna(0)
)
rating_matrix.head()

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
758518,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
989241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3219851,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3546211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3957551,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Item-Item

### Normalize Data::User-wise L2 Norm

Reduce the effect of `DD` fans (those who oshi, i.e. love, many members) and boost the weight of fans who love only a few of them.

\begin{equation*}
\sqrt{\sum_{i=0}^{n}{R_i^2}} \quad\quad \text{for}\; R_i \in \text{Rating of}\; I_u
\end{equation*}

In [21]:
# NOTE(review): this cell repeats the Bokeh imports and the output_notebook()
# call that already appear in an earlier cell of this notebook.
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()

In [22]:
# NOTE(review): `sqrt` is not used in this cell (np.sqrt is); the import is
# kept in case other cells rely on it.
from math import sqrt
from bokeh.palettes import Plasma

# `width`/`height` are used instead of `plot_width`/`plot_height`: the latter
# were removed in Bokeh 3.0, while `width`/`height` are accepted by figure()
# on older releases as well.
p = figure(width=500, height=450, title='L2-Norm on unary data')

x = np.arange(1, 10)

# One line per i: the first i positions carry weight 1/sqrt(i), the remaining
# positions carry 0. The weight per position shrinks as i grows.
for i in x:
    weights = np.where(x <= i, 1. / np.sqrt(i), 0.)
    p.line(x=x, y=weights, color=Plasma[10][i])

p.y_range.start = 0

show(p)

In [3]:
# Row sums of the rating matrix; the per-user weight is 1 / sqrt(row sum).
oshi_counts = rating_matrix.sum(axis=1)
user_norm = 1 / oshi_counts ** 0.5

# Keep non-positive cells untouched and replace every positive cell with the
# weight of its row (aligned on the follower index via axis=0).
not_followed = ~(rating_matrix > 0)
l2norm_rating = rating_matrix.where(not_followed, user_norm, axis=0)
l2norm_rating.head(10)

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
758518,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
989241,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3219851,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3546211,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3957551,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4476611,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4541451,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4581431,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5060291,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5202411,0.0,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Similarity Function :: Cosine Similarity

\begin{equation*}
\cos({\theta})=\frac{\sum_{u \in P}{R_{i,u} R_{j,u}}}{\sqrt{\sum_{u \in P}{R_{i,u}}^2}\sqrt{\sum_{u \in P}{R_{j,u}}^2}}\\
\text{When P is set of users}
\end{equation*}

In [5]:
# Transpose so that each row is one member (item) described by its followers.
# NOTE(review): this rebinds `ratings`, which earlier held the raw CSV frame.
ratings = l2norm_rating.values.T
l2norm = np.sqrt((ratings ** 2).sum(axis=1))

# Cosine similarity: pairwise dot products divided by the product of norms.
dot_products = ratings @ ratings.T
norm_products = np.outer(l2norm, l2norm)
sim_values = dot_products / norm_products

member_labels = rating_matrix.columns
item_sim_df = pd.DataFrame(sim_values, index=member_labels, columns=member_labels)

In [6]:
# Display the member-by-member similarity matrix.
item_sim_df

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
member,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Can,1.0,0.027634,0.010673,0.027043,0.033927,0.019522,0.021931,0.011006,0.04144,0.021587,...,0.028931,0.04432,0.0384,0.013257,0.035738,0.025128,0.012719,0.03413,0.006843,0.046261
Cherprang,0.027634,1.0,0.037515,0.067094,0.161402,0.037378,0.066059,0.013933,0.156627,0.109067,...,0.28228,0.0923,0.078244,0.012351,0.10143,0.185265,0.025895,0.249634,0.023969,0.128286
Izurina,0.010673,0.037515,1.0,0.026361,0.037411,0.016193,0.016184,0.013238,0.033331,0.028479,...,0.048287,0.029449,0.023934,0.010639,0.024136,0.0286,0.018569,0.027704,0.013767,0.030038
Jaa,0.027043,0.067094,0.026361,1.0,0.097439,0.071486,0.059595,0.026486,0.113775,0.118397,...,0.102538,0.163259,0.167227,0.040588,0.100224,0.100514,0.036849,0.072928,0.026334,0.134082
Jan,0.033927,0.161402,0.037411,0.097439,1.0,0.050586,0.082087,0.020478,0.261984,0.141374,...,0.214094,0.167254,0.100709,0.023864,0.152507,0.140364,0.033563,0.175671,0.029952,0.184268
Jane,0.019522,0.037378,0.016193,0.071486,0.050586,1.0,0.028015,0.015605,0.045545,0.05055,...,0.046717,0.062986,0.065372,0.04958,0.044961,0.045761,0.024964,0.041345,0.009051,0.070521
Jennis,0.021931,0.066059,0.016184,0.059595,0.082087,0.028015,1.0,0.014299,0.090632,0.071019,...,0.086596,0.087557,0.062232,0.009142,0.07007,0.063912,0.017736,0.087634,0.015565,0.096228
Jib,0.011006,0.013933,0.013238,0.026486,0.020478,0.015605,0.014299,1.0,0.020686,0.025581,...,0.021527,0.029261,0.03017,0.020541,0.017274,0.013954,0.03814,0.013143,0.009356,0.027059
Kaew,0.04144,0.156627,0.033331,0.113775,0.261984,0.045545,0.090632,0.020686,1.0,0.157785,...,0.199538,0.198013,0.113517,0.023552,0.156947,0.172139,0.033698,0.176547,0.031583,0.242794
Kaimook,0.021587,0.109067,0.028479,0.118397,0.141374,0.05055,0.071019,0.025581,0.157785,1.0,...,0.166355,0.146993,0.127518,0.020899,0.114367,0.129975,0.033063,0.124024,0.022843,0.166804


In [7]:
# Rank every member by similarity to 'Jan', highest first.
item_sim_df['Jan'].sort_values(ascending=False)

member
Jan          1.000000
Kaew         0.261984
Music        0.214094
Tarwaan      0.184268
Mind         0.181907
Pun          0.175671
Namneung     0.167254
Cherprang    0.161402
Noey         0.152507
Kaimook      0.141374
Orn          0.140364
Kate         0.113338
Mobile       0.110738
Namsai       0.100709
Jaa          0.097439
Korn         0.096807
Jennis       0.082087
Jane         0.050586
Izurina      0.037411
Can          0.033927
Piam         0.033563
Satchan      0.029952
Miori        0.025922
Nink         0.023864
Maysa        0.022733
Jib          0.020478
Name: Jan, dtype: float64

### Load member images

In [8]:
# %load ../utility/member_display.py
from operator import itemgetter

import requests

# Width in pixels given to the highest-scoring avatar in show_rank; the other
# avatars are scaled down relative to it.
IMAGE_MAX_WIDTH = 150

# A timeout keeps the cell from hanging forever if the API is unreachable,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON/KeyError failure further down.
members = requests.get('https://www.api.bnk48.com/api/members', timeout=10)
members.raise_for_status()

# Map capitalised member slug -> avatar URL.
# NOTE(review): presumably the capitalised slug matches the member labels used
# in the similarity matrix; confirm against the ratings data.
member_images = {
    element['slug'].capitalize(): element['avatar_image']
    for element in members.json()['members']
}

def show_rank(ranks, n=5):
    """Render the first ``n`` ranked members as a row of avatar ``<img>`` tags.

    Parameters
    ----------
    ranks : sequence of (name, score) pairs
        Ranked members; ``name`` must be a key of ``member_images``.
    n : int, default 5
        Number of leading entries of ``ranks`` to render.

    Returns
    -------
    str
        The ``<img>`` tags joined by a single space, or an empty string when
        there is nothing to render. Each image is sized proportionally to its
        score, with the highest score mapped to ``IMAGE_MAX_WIDTH`` pixels.

    Raises
    ------
    KeyError
        If a name has no entry in ``member_images``.
    """
    selected_items = ranks[:n]
    if not selected_items:
        # max() raises ValueError on an empty sequence; nothing to show.
        return ''

    max_size = max(selected_items, key=itemgetter(1))[1]

    def img_tag(name, size):
        # When the best score is 0 there is no relative scale to apply, so
        # fall back to full width instead of dividing by zero.
        scale = size / max_size if max_size else 1.0
        image_size = int(scale * IMAGE_MAX_WIDTH)
        return f'''<img src="{member_images[name]}" 
            alt="{name} = {size * 100}%" 
            style="width: {image_size}px; display: inline-block;"/>'''

    return ' '.join([img_tag(*member) for member in selected_items])

### Top 5 Similarity of Saint'Jan

{{ show_rank(list(item_sim_df['Jan'].sort_values(ascending=False).iteritems())[1:], n=5) }}

### Top 5 Similarity to Pun

{{ show_rank(list(item_sim_df['Pun'].sort_values(ascending=False).iteritems())[1:], n=5) }}

### Rating Function

\begin{equation*}
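\text{S}(u,i) = \frac{\sum_{j \in N} Sim_{i,j} \, r_{u,j}}{\sum_{j \in N}{|Sim_{i,j}|}} \quad \text{where } N \text{ is the set of items other than } i \text{ and } r_{u,j} = 1 \text{ if } u \text{ follows } j \text{, else } 0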
\end{equation*}

In [9]:
def predict(similarity_matrix, oshi_mems, target):
    """Score ``target`` for a fan who already follows ``oshi_mems``.

    The score is the summed similarity between ``target`` and the followed
    members, divided by the summed similarity between ``target`` and every
    other member (the self-similarity of ``target`` is excluded).

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Square member-by-member similarity matrix whose index and columns
        carry the same member labels.
    oshi_mems : iterable of labels
        Members the fan already follows (list, tuple or set).
    target : label
        Member to score.

    Returns
    -------
    float
        The similarity-weighted score.
    """
    weight_vec = similarity_matrix[target]
    # .loc needs a list-like indexer: pandas >= 2.0 raises TypeError for a set,
    # and a tuple would be interpreted as a single (multi-level) key.
    followed = list(oshi_mems)
    return weight_vec.loc[followed].sum() / (weight_vec.sum() - weight_vec.loc[target])

In [10]:
# Score 'Jan' for a fan who already follows Cherprang and Music.
predict(item_sim_df, ['Cherprang', 'Music'], 'Jan')

0.14114359560025197

In [11]:
def recommend(similarity_matrix, oshi_mems, n=5):
    """Return the ``n`` best-scoring members the fan does not follow yet.

    Parameters
    ----------
    similarity_matrix : pandas.DataFrame
        Square member-by-member similarity matrix.
    oshi_mems : iterable of labels
        Members the fan already follows (list, tuple or set).
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of (member, score) tuples
        Sorted by score, highest first. Members with equal scores keep the
        order in which they appear in ``similarity_matrix.index``.
    """
    # A list is handed to predict() because it indexes with .loc, which does
    # not accept a set on pandas >= 2.0.
    followed = list(oshi_mems)
    followed_lookup = set(followed)

    # Walk the index in its own order (de-duplicated) rather than through a
    # set difference, so tie ordering does not depend on hash ordering.
    other_members = [
        member
        for member in dict.fromkeys(similarity_matrix.index)
        if member not in followed_lookup
    ]
    scores = [
        (member, predict(similarity_matrix, followed, member))
        for member in other_members
    ]

    return sorted(scores, key=lambda pair: pair[1], reverse=True)[:n]

### Recommendation
What if I like `Music`, `Kaew` and `Tarwaan`?

In [12]:
# Pass the followed members as a list: they end up in a `.loc` lookup inside
# predict(), and pandas >= 2.0 raises TypeError for a set indexer.
result = recommend(item_sim_df, ['Music', 'Kaew', 'Tarwaan'], n=20)

{{show_rank(result, n=5)}}

# User-User

## Remove users with too few ratings

In [4]:
# Followers whose row sums to exactly 1, i.e. who follow a single member.
ratings_per_user = rating_matrix.sum(axis=1)
tan_oshis = ratings_per_user[ratings_per_user == 1.].index

# Drop those followers; the remaining rows are used for user-user similarity.
active_ratings = rating_matrix.drop(index=tan_oshis)
active_ratings

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5202411,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8085222,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9366932,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9594042,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9990712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10650962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12156692,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12157412,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12791262,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12815372,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


## Normalization

Reduce effect of popular items and offer something niche but more relevant.

In [5]:
# Per-member norm: square root of the column sum (for 0/1 data the column sum
# equals the sum of squares).
item_norms = active_ratings.sum(axis=0) ** .5

# A member followed only by the users dropped above has a norm of 0. Dividing
# by it would give 0/0 = NaN, and a single NaN column turns every user-user
# similarity into NaN. Divide such columns by 1 so they simply stay at 0.
safe_item_norms = item_norms.where(item_norms > 0, 1.0)

# NOTE(review): this rebinds `l2norm_rating`, which the item-item section
# used for the user-wise normalised matrix.
l2norm_rating = active_ratings / safe_item_norms

l2norm_rating.head()

member,Can,Cherprang,Izurina,Jaa,Jan,Jane,Jennis,Jib,Kaew,Kaimook,...,Music,Namneung,Namsai,Nink,Noey,Orn,Piam,Pun,Satchan,Tarwaan
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5202411,0.0,0.00799,0.0,0.0,0.012239,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8085222,0.0,0.0,0.014464,0.0,0.0,0.0,0.0,0.0,0.012548,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9366932,0.0,0.00799,0.0,0.0,0.0,0.0,0.016772,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9594042,0.0,0.00799,0.0,0.0,0.012239,0.0,0.016772,0.0,0.012548,0.017689,...,0.009934,0.0,0.0,0.0,0.0,0.0,0.0,0.00938,0.0,0.014947
9990712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.018337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014947


## Similarity Matrix

In [6]:
# Each row is one follower described by the (item-normalised) members followed.
ratings = l2norm_rating.values
l2norm = np.sqrt(np.square(ratings).sum(axis=1))

# Cosine similarity between every pair of followers. The result is a dense
# followers-by-followers array.
norm_grid = l2norm[:, np.newaxis] * l2norm[np.newaxis, :]
sim_values = ratings.dot(ratings.T) / norm_grid

follower_ids = l2norm_rating.index
user_sim_df = pd.DataFrame(sim_values, index=follower_ids, columns=follower_ids)

In [7]:
# Preview the first rows of the follower-by-follower similarity matrix.
user_sim_df.head()

follower_id,5202411,8085222,9366932,9594042,9990712,10650962,12156692,12157412,12791262,12815372,...,963714085018771456,963724833790967810,963728623147479040,963744840142868480,963756802687909888,963762644967555072,963773694785933312,963773913149812737,963779429502615553,963781418676404224
follower_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5202411,1.0,0.0,0.235121,0.39412,0.0,0.0,0.19604,0.484289,0.640026,0.188867,...,0.432816,0.275977,0.173299,0.354499,0.06556,0.10322,0.211607,0.275977,0.0,0.06566
8085222,0.0,1.0,0.0,0.221727,0.0,0.524875,0.490339,0.362,0.478411,0.24743,...,0.0,0.0,0.433458,0.0,0.0,0.0,0.0,0.0,0.0,0.16423
9366932,0.235121,0.0,1.0,0.500944,0.0,0.0,0.154236,0.113867,0.150484,0.240058,...,0.101764,0.217126,0.136344,0.278904,0.051579,0.081209,0.166483,0.217126,0.0,0.051659
9594042,0.39412,0.221727,0.500944,1.0,0.254647,0.422437,0.347633,0.190868,0.368771,0.479212,...,0.569465,0.426784,0.267997,0.332251,0.191802,0.301981,0.327239,0.426784,0.475826,0.192096
9990712,0.0,0.0,0.0,0.254647,1.0,0.0,0.423842,0.0,0.0,0.305691,...,0.0,0.0,0.0,0.0,0.355068,0.559034,0.0,0.0,0.535169,0.355612
