<a href="https://colab.research.google.com/github/tnaka78/mlcode/blob/master/cosine_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementation of cosine similarity

In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf

## Matrix definition

In [0]:
matrix = np.array(
    [[0.9, 0.8, 0.7, 0.1, 0.2, 0.3],
     [0.8, 0.7, 0.6, 0.2, 0.3, 0.4],
     [0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
     [0.2, 0.3, 0.4, 0.6, 0.7, 0.8],
     [0.1, 0.2, 0.3, 0.7, 0.8, 0.9]]
)

## scikit-learn
Use [sklearn.metrics.pairwise.cosine_similarity](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html).

In [3]:
cosine_similarity(matrix)

array([[1.        , 0.98744277, 0.84920778, 0.59246566, 0.46153846],
       [0.98744277, 1.        , 0.91798509, 0.70786517, 0.59246566],
       [0.84920778, 0.91798509, 1.        , 0.91798509, 0.84920778],
       [0.59246566, 0.70786517, 0.91798509, 1.        , 0.98744277],
       [0.46153846, 0.59246566, 0.84920778, 0.98744277, 1.        ]])

## numpy

### numpy implementation (1)
 1. Devide the matrix by L2-norm (L2 normalization).
 2. Calculate dot product of the normalized matrix.

In [4]:
matrix_norm = matrix / np.linalg.norm(matrix, ord=2, axis=1, keepdims=True)
np.dot(matrix_norm, matrix_norm.T)

array([[1.        , 0.98744277, 0.84920778, 0.59246566, 0.46153846],
       [0.98744277, 1.        , 0.91798509, 0.70786517, 0.59246566],
       [0.84920778, 0.91798509, 1.        , 0.91798509, 0.84920778],
       [0.59246566, 0.70786517, 0.91798509, 1.        , 0.98744277],
       [0.46153846, 0.59246566, 0.84920778, 0.98744277, 1.        ]])

### numpy implementation (2)
 1. Calculate dot product of the matrix.
 2. Devide it by dot product of L2-norm of the matrix.

In [5]:
norm = np.linalg.norm(matrix, ord=2, axis=1, keepdims=True)
np.dot(matrix, matrix.T) / (np.dot(norm, norm.T))

array([[1.        , 0.98744277, 0.84920778, 0.59246566, 0.46153846],
       [0.98744277, 1.        , 0.91798509, 0.70786517, 0.59246566],
       [0.84920778, 0.91798509, 1.        , 0.91798509, 0.84920778],
       [0.59246566, 0.70786517, 0.91798509, 1.        , 0.98744277],
       [0.46153846, 0.59246566, 0.84920778, 0.98744277, 1.        ]])

## TensorFlow

### setup

In [0]:
if not tf.executing_eagerly():
  tf.enable_eager_execution()

### TensorFlow implementation (1)
 1. Normalize the matrix (L2 normalization).
 2. Calculate dot product of the normalized matrix.

In [7]:
norm_matrix = tf.nn.l2_normalize(matrix, axis=1)
tf.matmul(norm_matrix, norm_matrix, transpose_b=True).numpy()

array([[1.        , 0.98744277, 0.84920778, 0.59246566, 0.46153846],
       [0.98744277, 1.        , 0.91798509, 0.70786517, 0.59246566],
       [0.84920778, 0.91798509, 1.        , 0.91798509, 0.84920778],
       [0.59246566, 0.70786517, 0.91798509, 1.        , 0.98744277],
       [0.46153846, 0.59246566, 0.84920778, 0.98744277, 1.        ]])

### TensorFlow implementation (2)
 1. Calculate dot product of the matrix.
 2. Devide it by dot product of L2-norm of the matrix.

In [8]:
norm = tf.norm(matrix, ord=2, axis=1, keepdims=True)
tf.matmul(matrix, matrix, transpose_b=True) / tf.matmul(norm, norm, transpose_b=True).numpy()

<tf.Tensor: id=19, shape=(5, 5), dtype=float64, numpy=
array([[1.        , 0.98744277, 0.84920778, 0.59246566, 0.46153846],
       [0.98744277, 1.        , 0.91798509, 0.70786517, 0.59246566],
       [0.84920778, 0.91798509, 1.        , 0.91798509, 0.84920778],
       [0.59246566, 0.70786517, 0.91798509, 1.        , 0.98744277],
       [0.46153846, 0.59246566, 0.84920778, 0.98744277, 1.        ]])>