# SentenceTransformer를 이용한 문장 유사도 측정

In [1]:
### Install the required library
# Notebook shell magic: installs the sentence-transformers package into the running environment.
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [2]:
### 필요한 클래스 임포트
from sentence_transformers import SentenceTransformer

In [3]:
### Download the pre-trained model
# model_name is the checkpoint identifier handed to SentenceTransformer;
# the recorded output below this cell shows the model files being fetched.
model_name = 'all-MiniLM-L12-v2'
eng_model = SentenceTransformer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 영어 유사도 측정

In [4]:
# Pair of English sentences whose similarity is measured in the following cells.
eng_sentences = ["What should I do to be a great scientist?", "How can I be a good scientist?"]

In [5]:
### Create the sentence-embedding vectors

# model.encode([sentence, ...]) -> one embedding row per input sentence
embeddings = eng_model.encode(eng_sentences)

# Inspect the result: matrix shape first, then each sentence vector,
# with an 80-character rule between the sections.
print(
    f'영문 텍스트 데이터 전체의 임베딩 행렬의 모양 : {embeddings.shape}',
    '-'*80,
    f'첫번째 문장의 임베딩 벡터 : \n{embeddings[0, :]}',
    '-'*80,
    f'두번째 문장의 임베딩 벡터 : \n{embeddings[1, :]}',
    sep='\n',
)

영문 텍스트 데이터 전체의 임베딩 행렬의 모양 : (2, 384)
--------------------------------------------------------------------------------
첫번째 문장의 임베딩 벡터 : 
[ 2.22636424e-02  8.50509331e-02  7.91068748e-02 -6.84861420e-03
 -1.46627408e-02 -6.85858652e-02  2.64204293e-02  6.64388537e-02
 -4.24608178e-02 -3.75801809e-02  1.80893764e-02 -9.48860124e-02
 -4.75542322e-02  7.65434280e-02 -3.93009558e-02  2.58018803e-02
 -7.41134137e-02  5.87385967e-02  3.13342847e-02 -7.86113739e-02
 -6.38960972e-02  4.31189910e-02  6.02599280e-03  1.21796574e-03
 -5.90102114e-02  8.42712075e-02 -2.48904210e-02 -6.01979420e-02
  2.75624283e-02 -5.94473891e-02 -8.06310098e-04 -7.62991235e-02
  1.78551786e-02  2.06335634e-02 -3.27370651e-02  4.25934307e-02
  6.07814863e-02 -2.56904867e-02  5.79137728e-02  5.28666116e-02
  9.59740113e-03 -4.70351763e-02  2.61405073e-02 -9.10801464e-04
  8.14122334e-03 -1.69612952e-02  2.88743805e-02 -1.83990621e-03
  3.87182645e-02  3.14611904e-02 -5.87457940e-02 -7.69814700e-02
 -1.39820680e-01 -6

In [6]:
### Measure cosine similarity

# Import the similarity helper
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so turn each 1-D sentence vector
# into a single-row matrix. reshape(1, -1) lets NumPy infer the embedding
# width instead of hard-coding it, so the cell keeps working if the model
# (and therefore the vector size) changes.
embedding1 = embeddings[0, :].reshape(1, -1)
embedding2 = embeddings[1, :].reshape(1, -1)

# Cosine similarity between the two sentences (result is a 1x1 matrix)
eng_sim = cosine_similarity(embedding1, embedding2)
print(f'영문 텍스트 데이터의 문장 간 유사도 : {eng_sim}')

영문 텍스트 데이터의 문장 간 유사도 : [[0.8954654]]


## 한국어 유사도 측정

In [7]:
### Download the pre-trained Korean model
# Rebinds model_name to the Korean checkpoint before constructing the model.
model_name='ddobokki/klue-roberta-base-nli-sts'
kor_model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/576 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
### Create the Korean text data
# Pair of Korean sentences whose similarity is measured in the following cells.
kor_sentences = ["직원이 무단 퇴사를 했는데 손해 배상 청구할 수 있나요?", "무단 퇴사한 직원에 대한 손해 배상 청구가 가능한가요?"]

In [15]:
# Encode both Korean sentences into one embedding matrix (one row per sentence).
kor_embeddings = kor_model.encode(kor_sentences)

# Show the matrix shape and each sentence vector, separated by an 80-character rule.
print(
    kor_embeddings.shape,
    '-'*80,
    kor_embeddings[0, :],
    '-'*80,
    kor_embeddings[1, :],
    sep='\n',
)

(2, 768)
--------------------------------------------------------------------------------
[-4.99942362e-01 -3.59200597e-01 -7.38401562e-02  4.84219134e-01
 -4.96614248e-01  5.82403481e-01 -4.56285924e-02 -3.81824046e-01
 -3.94545138e-01  2.18352944e-01  2.40236863e-01 -6.53119981e-01
  1.17322311e-01 -4.41113621e-01 -8.18817504e-03 -2.16763169e-01
  1.42986374e-02 -9.46456194e-02  1.55718774e-01 -1.03794351e-01
 -5.95203400e-01 -5.41155457e-01 -6.22601509e-01 -1.00601888e+00
 -2.20479354e-01 -1.32809699e-01 -4.36403155e-02 -4.01162505e-02
  5.56914866e-01  7.56343603e-01  6.72921762e-02  8.65490198e-01
  4.12386581e-02  2.79036552e-01 -1.98162034e-01 -5.00127748e-02
  3.10566962e-01 -3.46614659e-01 -6.20578766e-01  6.81772888e-01
  8.81858394e-02  2.54626304e-01 -4.60292935e-01 -1.45168126e-01
 -6.55787531e-03 -2.85244584e-01  6.48564938e-03  3.51094663e-01
  9.55609307e-02 -1.56446844e-01 -1.90832332e-01 -3.73097301e-01
  2.39245102e-01  3.94673795e-02 -3.96240830e-01  8.78734291e-01


In [16]:
### Measure cosine similarity

# Import the similarity helper
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so turn each 1-D sentence vector
# into a single-row matrix. reshape(1, -1) infers the embedding width
# rather than hard-coding this model's vector size.
kor_embedding1 = kor_embeddings[0, :].reshape(1, -1)
kor_embedding2 = kor_embeddings[1, :].reshape(1, -1)

# Cosine similarity between the two Korean sentences (1x1 matrix).
# The label now says Korean text; it previously claimed English text.
kor_sim = cosine_similarity(kor_embedding1, kor_embedding2)
print(f'한글 텍스트 데이터의 문장 간 유사도 : {kor_sim}')

영문 텍스트 데이터의 문장 간 유사도 : [[0.9606154]]


## 영어 / 한글 문장 유사도 측정

In [18]:
### Download the pre-trained multilingual model
# Rebinds model_name to the multilingual checkpoint before constructing the model.
model_name= 'paraphrase-multilingual-MiniLM-L12-v2'
multi_model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
### Create the text data
# One English and one Korean sentence, compared across languages in the following cells.
mul_sentences = ['What should I do to be a great scientist?', '훌륭한 과학자가 되려면 어떻게 해야 할까요?']

In [21]:
### Create the sentence-embedding matrix

# model.encode([sentence, ...]) -> one embedding row per input sentence
multi_embeddings = multi_model.encode(mul_sentences)

# Inspect the result: shape, then each sentence vector, separated by an 80-character rule.
print(
    multi_embeddings.shape,
    '-'*80,
    multi_embeddings[0, :],
    '-'*80,
    multi_embeddings[1, :],
    sep='\n',
)

(2, 384)
--------------------------------------------------------------------------------
[ 5.46487309e-02  4.94343251e-01  4.10707623e-01  1.27668157e-01
 -9.11793485e-02 -5.76901734e-01  1.17373504e-02  1.73615083e-01
 -3.01463813e-01 -1.61016598e-01 -2.14112625e-02 -7.54894197e-01
 -1.01586692e-01  2.17390526e-02 -2.76769996e-01  4.08535078e-03
 -1.66494504e-01 -1.53407559e-01  9.10668261e-03 -2.29964688e-01
 -4.85452741e-01  1.59717664e-01  7.86593258e-02 -1.78895816e-01
 -2.89161235e-01  2.65021116e-01 -1.87272534e-01 -2.74408787e-01
  8.98738578e-02 -1.40582860e-01  1.85039397e-02 -8.13296959e-02
  2.98330039e-01  1.22198360e-02  8.52152035e-02  4.40637112e-01
 -3.57332006e-02 -2.71989971e-01  3.55497628e-01  2.01634943e-01
  3.68284672e-01  8.60847458e-02  6.20844901e-01  1.44933268e-01
 -1.14840746e-01 -4.09185320e-01 -2.46177856e-02 -2.33240679e-01
  7.04460889e-02  2.04685375e-01 -2.15648487e-01 -1.35802686e-01
 -4.60398793e-01 -4.50309247e-01 -1.36423782e-01 -3.55098039e-01


In [23]:
### Measure cosine similarity

# Import the similarity helper
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so turn each 1-D sentence vector
# into a single-row matrix. reshape(1, -1) infers the embedding width
# rather than hard-coding this model's vector size.
multi_embedding1 = multi_embeddings[0, :].reshape(1, -1)
multi_embedding2 = multi_embeddings[1, :].reshape(1, -1)

# Cosine similarity between the English and the Korean sentence (1x1 matrix).
# The label now says English/Korean; it previously claimed English-only text.
multi_sim = cosine_similarity(multi_embedding1, multi_embedding2)
print(f'영문/한글 텍스트 데이터의 문장 간 유사도 : {multi_sim}')

영문 텍스트 데이터의 문장 간 유사도 : [[0.9493035]]


# 문장 유사도를 이용한 추천 시스템

In [26]:
'''
### Exercise overview
- dataset : information on 45,466 movies released in or before July 2017
- Each movie's overview (plot summary text) is converted into an embedding
  matrix with a pre-trained SentenceTransformer model.
- The user supplies the title of a movie they like.
- Movies whose overview is most similar to that movie's overview are
  looked up and recommended by title.
### Columns we need : title, overview
'''

# Import the required library
import pandas as pd

# Path to the CSV file
file_path='/content/drive/MyDrive/KDT/딥러닝/자연어처리/movies_metadata.csv'

# pd.read_csv() --> build the DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole
# file rather than chunk by chunk, which avoids the mixed-type warning the
# plain call produced.
df_movies = pd.read_csv(file_path, low_memory=False)

# Check the result
df_movies.head()

  df_movies = pd.read_csv(file_path)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [27]:
# List the column labels of the loaded movie table.
df_movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [36]:
# Keep only the two columns the recommender uses: the plot overview and the title.
# (Selecting each column separately and joining them with pd.concat(axis=1)
# would give the same table.)
df = df_movies[['overview', 'title']]
df

Unnamed: 0,overview,title
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,When siblings Judy and Peter discover an encha...,Jumanji
2,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,Just when George Banks has recovered from his ...,Father of the Bride Part II
...,...,...
45461,Rising and falling between a man and woman.,Subdue
45462,An artist struggles to finish his work while a...,Century of Birthing
45463,"When one of her hits goes wrong, a professiona...",Betrayal
45464,"In a small town live two brothers, one a minis...",Satan Triumphant


In [37]:
# Count the missing values in each column.
df.isnull().sum()

overview    954
title         6
dtype: int64

In [38]:
### Drop rows with missing data
# Remove every row that has a missing overview or title, then renumber the
# remaining rows 0..n-1 so positions and index labels line up again.
cleaned_df = df.dropna().reset_index(drop=True)

cleaned_df

Unnamed: 0,overview,title
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,When siblings Judy and Peter discover an encha...,Jumanji
2,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,Just when George Banks has recovered from his ...,Father of the Bride Part II
...,...,...
44501,Rising and falling between a man and woman.,Subdue
44502,An artist struggles to finish his work while a...,Century of Birthing
44503,"When one of her hits goes wrong, a professiona...",Betrayal
44504,"In a small town live two brothers, one a minis...",Satan Triumphant


In [39]:
### Reduce the data to 10000 rows
# Keep only the first 10000 cleaned rows (all columns).
df_data = cleaned_df.head(10000)

df_data

Unnamed: 0,overview,title
0,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,When siblings Judy and Peter discover an encha...,Jumanji
2,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,Just when George Banks has recovered from his ...,Father of the Bride Part II
...,...,...
9995,"In the peaceful countryside, Vassily opposes t...",Earth
9996,"Francisco is rich, rather strict on principles...",Él
9997,"Cashier Maurice Legrand is married to Adele, a...",La Chienne
9998,"While Erendira, a beautiful teenage girl, has ...",Eréndira


In [40]:
### Sentence-embedding model --> download the pre-trained model
# Same checkpoint name as the one used for eng_model earlier in the notebook.
model_name = 'all-MiniLM-L12-v2'
model = SentenceTransformer(model_name)

In [41]:
### Create the sentence-embedding matrix

# overview column --> plot summaries --> array of texts to embed
sentences = df_data['overview'].to_numpy()

# model.encode(sentences) -> one embedding row per overview.
# Note: this rebinds `embeddings`, replacing the two-sentence English
# matrix created earlier in the notebook.
embeddings = model.encode(sentences)

In [43]:
# Print the shape of the overview embedding matrix.
print(embeddings.shape)

(10000, 384)


In [44]:
### Save the sentence-embedding matrix

import numpy as np

# Embedding matrix --> NumPy array --> saved with np.save(path_to_npy, arr).
# Note: file_path is rebound here; it previously held the CSV path.
file_path = '/content/drive/MyDrive/KDT/딥러닝/자연어처리/embeddings.npy'

np.save(file_path, embeddings)

In [45]:
# Load the saved embedding matrix back from the .npy file and display it.
np_embeddings = np.load(file_path)
np_embeddings

array([[ 0.01415948,  0.07435112,  0.09298418, ...,  0.07698087,
         0.04534296,  0.03181639],
       [ 0.07520459, -0.04395673,  0.01294587, ...,  0.07619377,
        -0.03279951, -0.02076748],
       [ 0.04138837,  0.0662863 ,  0.02839824, ...,  0.01836856,
        -0.02601483,  0.01890559],
       ...,
       [-0.09879992, -0.00235099, -0.0348098 , ..., -0.03842691,
         0.04843282, -0.01128102],
       [-0.06564485,  0.06447928,  0.01064611, ...,  0.04522283,
         0.0044388 , -0.03518389],
       [-0.05290857,  0.03819921,  0.06768037, ...,  0.01148303,
         0.04201176, -0.08453614]], dtype=float32)

In [46]:
### Compute cosine similarity --> cosine_similarity(X)

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of movie overviews.
# With a single argument the matrix is compared against itself.
sim = cosine_similarity(embeddings)
# Check the result
print(f'유사도 측정의 결과 : \n{sim}')

유사도 측정의 결과 : 
[[1.0000005  0.33839947 0.12692967 ... 0.12217258 0.17416471 0.06714304]
 [0.33839947 1.0000004  0.26377776 ... 0.16545892 0.3252555  0.16340765]
 [0.12692967 0.26377776 1.         ... 0.24624076 0.27974683 0.14819568]
 ...
 [0.12217258 0.16545892 0.24624076 ... 1.0000004  0.21591777 0.04777016]
 [0.17416471 0.3252555  0.27974683 ... 0.21591777 1.         0.18749897]
 [0.06714304 0.16340765 0.14819568 ... 0.04777016 0.18749897 1.        ]]


In [47]:
### Cosine similarity --> build a DataFrame

# Movie titles label both the rows and the columns of the similarity table.
index = columns = df_data['title'].to_numpy()

# Build the title-by-title similarity table and display it.
df_sim = pd.DataFrame(sim, index=index, columns=columns)
df_sim

Unnamed: 0,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Between Your Legs,Zatôichi Meets the One-Armed Swordsman,The Angry Silence,Isadora,San Francisco,Earth,Él,La Chienne,Eréndira,The Private Lives of Elizabeth and Essex
Toy Story,1.000000,0.338399,0.126930,0.175748,0.137818,0.224320,0.183411,0.304267,0.143974,0.111092,...,0.204337,0.152693,0.093307,0.089167,0.061069,0.035998,0.183392,0.122173,0.174165,0.067143
Jumanji,0.338399,1.000000,0.263778,0.205652,0.144077,0.294304,0.253075,0.397237,0.254173,0.167929,...,0.187563,0.154846,-0.012331,0.034107,0.052756,-0.014982,0.177061,0.165459,0.325256,0.163408
Grumpier Old Men,0.126930,0.263778,1.000000,0.360031,0.175603,0.211339,0.314122,0.291589,0.186269,0.203596,...,0.301054,0.278452,-0.001206,0.061602,0.210004,0.081070,0.420658,0.246241,0.279747,0.148196
Waiting to Exhale,0.175748,0.205652,0.360031,1.000000,0.204694,0.286783,0.332937,0.295825,0.280217,0.182333,...,0.448264,0.279091,0.003809,0.087916,0.077089,0.156300,0.390910,0.197444,0.306730,0.167567
Father of the Bride Part II,0.137818,0.144077,0.175603,0.204694,1.000000,0.089654,0.117296,0.065354,0.216099,0.111157,...,0.225443,0.093269,-0.020158,0.064023,0.096882,0.023103,0.253434,0.131023,0.240259,0.088746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Earth,0.035998,-0.014982,0.081070,0.156300,0.023103,-0.033135,0.104973,0.045792,0.025830,0.041411,...,0.022366,0.119633,0.161267,0.051231,0.043433,1.000000,0.102056,0.001478,0.113178,-0.004952
Él,0.183392,0.177061,0.420658,0.390910,0.253434,0.247657,0.311852,0.195511,0.231736,0.145004,...,0.483851,0.272619,0.078805,0.182015,0.235567,0.102056,1.000000,0.329538,0.309063,0.172004
La Chienne,0.122173,0.165459,0.246241,0.197444,0.131023,0.212082,0.328271,0.201282,0.177863,0.131936,...,0.350755,0.240732,0.101820,0.196731,0.059917,0.001478,0.329538,1.000000,0.215918,0.047770
Eréndira,0.174165,0.325256,0.279747,0.306730,0.240259,0.258495,0.177591,0.255010,0.347747,0.144574,...,0.330732,0.329395,0.094023,0.309069,0.138785,0.113178,0.309063,0.215918,1.000000,0.187499


In [57]:
### Recommend movies whose overview is similar to a given movie

# Recommendation function
def recommend(title, k, sim=None):
    """Return the ``k`` movies most similar to ``title``.

    Steps:
      1. Select the similarity column for ``title``.
      2. Remove the query movie's own entry by label.
      3. Sort the remaining scores in descending order and keep ``k``.

    Args:
        title: Column label to look up in the similarity table.
        k: Number of recommendations to return; values below 1 yield an
            empty result.
        sim: Title-by-title similarity DataFrame. When omitted, the
            notebook-level ``df_sim`` is used.

    Returns:
        A pandas Series indexed by title holding the similarity scores,
        highest first.

    Raises:
        KeyError: If ``title`` is not a column of the similarity table.
    """
    table = df_sim if sim is None else sim
    scores = table.loc[:, title]

    # When several movies share the same title, .loc returns one column per
    # match (a DataFrame); use the first match as the reference movie.
    if scores.ndim > 1:
        scores = scores.iloc[:, 0]

    # Remove the query movie by label rather than by assuming it sorts first:
    # another row can tie with or exceed the self-similarity score, in which
    # case skipping position 0 would drop the wrong entry. Rows sharing the
    # query title are removed as well.
    scores = scores.drop(labels=title, errors='ignore')

    return scores.sort_values(ascending=False).iloc[:max(k, 0)]

In [58]:
# List the titles that can be passed to recommend().
df_sim.columns

Index(['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale',
       'Father of the Bride Part II', 'Heat', 'Sabrina', 'Tom and Huck',
       'Sudden Death', 'GoldenEye',
       ...
       'Between Your Legs', 'Zatôichi Meets the One-Armed Swordsman',
       'The Angry Silence', 'Isadora', 'San Francisco', 'Earth', 'Él',
       'La Chienne', 'Eréndira', 'The Private Lives of Elizabeth and Essex'],
      dtype='object', length=10000)

In [62]:
# Ask for the 10 movies most similar to 'Jumanji' and print them with their scores.
top10 = recommend(title = 'Jumanji', k = 10)
print(top10)

Flowers in the Attic     0.534558
Eight Days a Week        0.532829
Pete's Dragon            0.520785
Thir13en Ghosts          0.513867
Peter Pan                0.509830
Panic Room               0.508888
Labyrinth                0.486888
The Glass House          0.486347
The Boy Who Could Fly    0.472243
Amityville: Dollhouse    0.471381
Name: Jumanji, dtype: float32
