# 21-review-description-similarity

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer

In [5]:
sentences = ["That is a happy person", "That is a happy dog", "That is a very happy person", "Today is a sunny day"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
print(embeddings)
print(embeddings.shape)

[[-0.03387689  0.09194157  0.04870129 ... -0.0143926  -0.02754976
   0.04475824]
 [ 0.00504993  0.0631698   0.01415723 ...  0.04035439  0.07584123
   0.09087349]
 [-0.00248318  0.09151708  0.04838626 ... -0.02641118 -0.07529834
   0.02803212]
 [-0.01629126  0.1040661   0.0974078  ...  0.00676726 -0.08788456
   0.03404383]]
(4, 384)


In [7]:
np.dot(embeddings[0, :], embeddings[1, :])/(np.linalg.norm(embeddings[0, :])*np.linalg.norm(embeddings[1, :]))

0.69457746

In [8]:
np.dot(embeddings[0, :], embeddings[2, :])/(np.linalg.norm(embeddings[0, :])*np.linalg.norm(embeddings[2, :]))

0.942915

In [9]:
np.dot(embeddings[0, :], embeddings[3, :])/(np.linalg.norm(embeddings[0, :])*np.linalg.norm(embeddings[3, :]))

0.25687608

## Read in main data

In [16]:
main = pd.read_csv('Data/main_211110.csv', dtype=str)
main

Unnamed: 0.1,Unnamed: 0,id,review,review_1st,review_2nd,original_title,year,date_published,duration,country,...,popularity,poster_path,revenue,runtime,status,video,vote_count,genre_1,genre_2,genre_3
0,0,tt0018515,"I admit, the great majority of films released ...",1.6660906,1.7079178,Two Arabian Knights,1927,1927-09-23,92,USA,...,0.38981,/mGZ3EgmkDesKEA8FnJcUgX8TpXT.jpg,0.0,92.0,Released,False,0.0,Adventure,Comedy,Romance
1,1,tt0018515,This is a very strange film that was long thou...,2.6453724,5.547905,Two Arabian Knights,1927,1927-09-23,92,USA,...,0.38981,/mGZ3EgmkDesKEA8FnJcUgX8TpXT.jpg,0.0,92.0,Released,False,0.0,Adventure,Comedy,Romance
2,2,tt0018515,"William Boyd and Louis Wolheim are the ""Two Ar...",-0.4441953,2.2827504,Two Arabian Knights,1927,1927-09-23,92,USA,...,0.38981,/mGZ3EgmkDesKEA8FnJcUgX8TpXT.jpg,0.0,92.0,Released,False,0.0,Adventure,Comedy,Romance
3,3,tt0018515,Not very many movies come to my mind that cove...,0.8150438,3.9238899,Two Arabian Knights,1927,1927-09-23,92,USA,...,0.38981,/mGZ3EgmkDesKEA8FnJcUgX8TpXT.jpg,0.0,92.0,Released,False,0.0,Adventure,Comedy,Romance
4,4,tt0018515,"The third movie produced by Howard Hughes, thi...",2.477075,5.466526,Two Arabian Knights,1927,1927-09-23,92,USA,...,0.38981,/mGZ3EgmkDesKEA8FnJcUgX8TpXT.jpg,0.0,92.0,Released,False,0.0,Adventure,Comedy,Romance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28624,28624,tt0492486,This was the surprise film at the Dublin Horro...,1.0869972,6.843044,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",...,9.128571,/hPzqg3NTDaW6y7L0QCBxMxhOQjK.jpg,4910682.0,84.0,Released,False,99.0,Comedy,Horror,Mystery
28625,28625,tt0492486,I've seen hundreds of horror movies in my life...,0.17316706,6.62107,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",...,9.128571,/hPzqg3NTDaW6y7L0QCBxMxhOQjK.jpg,4910682.0,84.0,Released,False,99.0,Comedy,Horror,Mystery
28626,28626,tt0492486,I cannot remember the last time a horror movie...,1.0682999,6.835441,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",...,9.128571,/hPzqg3NTDaW6y7L0QCBxMxhOQjK.jpg,4910682.0,84.0,Released,False,99.0,Comedy,Horror,Mystery
28627,28627,tt0492486,I saw this movie tonight. Never have left comm...,1.0295599,3.5069268,Shrooms,2007,2008-08-22,84,"Ireland, UK, Denmark",...,9.128571,/hPzqg3NTDaW6y7L0QCBxMxhOQjK.jpg,4910682.0,84.0,Released,False,99.0,Comedy,Horror,Mystery


## Convert review and description into list for models

In [12]:
reviews = main["review"].tolist()
reviews[:2]

descriptions = main["description"].tolist()
descriptions[:2]

["Two American soldiers are captured by the Germans on the Western Front during World War One and escape a POW camp only to stumble into further life-threatening adventures when they come across an Arabian king's daughter while on the lam.",
 "Two American soldiers are captured by the Germans on the Western Front during World War One and escape a POW camp only to stumble into further life-threatening adventures when they come across an Arabian king's daughter while on the lam."]

## Create embeddings and calculate cosine similarity scores

In [14]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
review_embeddings = model.encode(reviews)
print(review_embeddings)
print(review_embeddings.shape)

[[-1.29428646e-02 -4.24936302e-02 -6.24601431e-02 ... -6.57661539e-03
  -3.60753760e-02  1.08164597e-04]
 [-4.81599569e-02 -1.31846825e-02 -8.91327187e-02 ...  9.85455699e-03
   4.05939184e-02 -3.94534245e-02]
 [-3.16884592e-02  6.80444613e-02 -6.71040490e-02 ... -3.60574275e-02
   1.25394762e-02 -3.84959094e-02]
 ...
 [ 2.71297917e-02  6.61117956e-03 -4.27735224e-02 ... -9.85558610e-03
   2.93865539e-02 -1.57187469e-02]
 [-3.62882689e-02  3.66996191e-02 -4.57762443e-02 ... -1.16633311e-01
   9.69860982e-03 -1.57413296e-02]
 [ 6.58605173e-02 -1.26149980e-02  1.59819033e-02 ... -3.98750715e-02
  -4.47551385e-02  7.71743432e-03]]
(28629, 384)


### calculate three first maximum cosine similarity score movie for each one

In [None]:
mid = main['id'].tolist()
max_movie_id = []
for i in range(review_embeddings.shape[0]):
    cur_id = mid[i]
    cosine = {}
    for j in range(review_embeddings.shape[0]):
        if j != i and mid[j] != cur_id:
            cosine[j]=np.dot(review_embeddings[i, :], review_embeddings[j, :])/(np.linalg.norm(review_embeddings[i, :])*np.linalg.norm(review_embeddings[j, :]))
    sorted_cosine = dict(sorted(cosine.items(), key=lambda item: item[1], reverse = True))
    index = list(sorted_cosine.keys())[:3]
    max_id = [mid[q] for q in index]
    max_movie_id.append(max_id)

In [None]:
max_movie_id