# 영화 리뷰 워드 임베딩 (Word2Vec, FastText)
- gensim 라이브러리 사용 : pip install gensim
    - Word2Vec : models.Word2Vec
    - FastText : models.FastText

## 1. 데이터 준비
* 토큰화가 잘 되어 있는 filtered 데이터 사용

In [5]:
data_filename = '../data/Korean_movie_reviews_2016_filtered.csv'
import pandas as pd 
review_df= pd.read_csv(data_filename)
review_df.head()

Unnamed: 0,review,rate
0,아니 딴 그렇 비 비탄 총 대체 왜 들 온겨,7
1,진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임,1
2,역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 ...,10
3,온종일 불편한 피 범벅 일,6
4,답답함 극치 움직일 잇으 좀 움직여 어지간히 좀비 봣으 얼 타고 때려 잡 때 되 않냐,1


In [18]:
# review만 모아서 review별 토큰 리스트로 변환 : review가 Object 타입이므로 str로 변환 후 split
review_df.info()
review_list=list(map(str,review_df.review))
corpus=[review.split() for review in review_list]
corpus[:5]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788189 entries, 0 to 788188
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   review  785448 non-null  object
 1   rate    788189 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 12.0+ MB


[['아니', '딴', '그렇', '비', '비탄', '총', '대체', '왜', '들', '온겨'],
 ['진심',
  '쓰레기',
  '영화',
  '만들',
  '무서',
  '알',
  '쫄아',
  '틀었',
  '이건',
  '뭐',
  '웃',
  '거리',
  '없는',
  '쓰레기',
  '영화',
  '임'],
 ['역대',
  '좀비',
  '영화',
  '가장',
  '최고다',
  '원작',
  '만화',
  '읽어',
  '보려',
  '영화',
  '보고',
  '결정',
  '하려',
  '감독',
  '간츠',
  '실사',
  '했',
  '사람',
  '거르려',
  '그냥',
  '봤',
  '정말',
  '흠잡',
  '없는',
  '최고',
  '좀비',
  '영화',
  '잔인',
  '거',
  '싫어하지',
  '참고',
  '볼',
  '만하',
  '로미',
  '인물',
  '왜',
  '그런',
  '모르'],
 ['온종일', '불편한', '피', '범벅', '일'],
 ['답답함',
  '극치',
  '움직일',
  '잇으',
  '좀',
  '움직여',
  '어지간히',
  '좀비',
  '봣으',
  '얼',
  '타고',
  '때려',
  '잡',
  '때',
  '되',
  '않냐']]

## 1. Word2Vec 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/word2vec.html

### Skipgram, negative=10 인 경우

In [22]:
# Word2Vec 모델 생성 및 학습 : window=3, min_count=3
from gensim.models import Word2Vec
model_sg_m10=Word2Vec(corpus, window=3, min_count=3, vector_size=100, sg=1, negative=10)
#sg가 1인지 0인지를 확인 

In [23]:
# 단어의 임베딩 벡터 확인
model_sg_m10.wv['이정재']

array([-4.7012582e-01,  2.6367295e-01,  9.9382542e-02, -5.6317753e-01,
       -3.6657602e-01, -4.1418737e-01,  5.3871346e-01,  6.5753686e-01,
        9.4478214e-01, -9.6139625e-02, -2.8443629e-01, -3.0801061e-01,
       -2.8591841e-01,  3.6275294e-01, -1.5072870e-01,  1.4965364e-01,
        5.3161573e-02, -2.6561943e-01,  2.3318402e-01,  2.5882459e-01,
       -2.2267360e-02,  1.4629687e-01,  3.2515359e-01,  4.4151172e-01,
       -4.0945348e-01,  1.6047285e-01,  4.7002223e-01,  3.2795343e-01,
        1.3113867e-01, -1.5237383e-02, -1.9800594e-01, -4.0970597e-04,
       -6.1866652e-02, -1.1639177e-01,  1.9419508e-01, -8.4562957e-01,
       -1.0487810e-01, -1.5343009e-01, -4.7345397e-01, -6.7820400e-02,
       -5.1316100e-01,  3.0496994e-01, -2.6432544e-01, -1.3493682e-01,
        2.3854634e-01,  4.0924683e-01, -6.6668026e-02,  2.2045647e-01,
       -1.7964131e-01,  3.3616234e-02, -8.0489136e-02, -3.2818547e-01,
       -3.6803973e-01, -2.8121307e-01,  3.1479853e-01,  7.9821862e-02,
      

In [24]:
# 단어의 임베딩 벡터 차원 확인
len(model_sg_m10.wv['이정재'])
#100차원인지 

100

In [25]:
# 두 단어 간 유사도 확인
model_sg_m10.wv.similarity('이정재','정우성')
#코사인 유사도와 같은거임 

0.75731516

In [26]:
# 특정 단어와 유사한 단어 추출
model_sg_m10.wv.most_similar('이정재', topn=20)

[('송강호', 0.8452677726745605),
 ('공유', 0.829178512096405),
 ('이범수', 0.7960792779922485),
 ('김범수', 0.7740590572357178),
 ('정우성', 0.7573151588439941),
 ('김남길', 0.7568630576133728),
 ('조재현', 0.7394642233848572),
 ('이병헌', 0.7363982200622559),
 ('김윤석', 0.7357742786407471),
 ('박해일', 0.7330920100212097),
 ('이성민', 0.7273959517478943),
 ('김명민', 0.724213182926178),
 ('리암', 0.7221159338951111),
 ('조정석', 0.7211102247238159),
 ('주지훈', 0.7201303243637085),
 ('마동석', 0.7196614146232605),
 ('황정민', 0.7167661190032959),
 ('요한', 0.710929811000824),
 ('이진욱', 0.7030999660491943),
 ('김성균', 0.7026024460792542)]

In [28]:
model_sg_m10.wv.most_similar('재밌', topn=20)
#feature가 앞뒤로 쓰이는것으로 가깝게 보이기때문에 이렇게 하는게 좋음 
#재밌말고 이상한 단어도 많이나옴

[('재미있', 0.8948401808738708),
 ('재밋음', 0.8238450288772583),
 ('재밌네', 0.8215804696083069),
 ('재밌었', 0.8158237338066101),
 ('잼남', 0.80734783411026),
 ('재밌어', 0.8033546805381775),
 ('재밋엇음', 0.7843348383903503),
 ('재밋어용', 0.7796995639801025),
 ('재밋엇', 0.7761644124984741),
 ('쟈밋', 0.7721275091171265),
 ('엇', 0.7721215486526489),
 ('재밋엇어용', 0.7713251113891602),
 ('잼슴', 0.7704072594642639),
 ('재밋었음', 0.7633653283119202),
 ('재밋습니', 0.7633296847343445),
 ('재밌슴', 0.7627429962158203),
 ('재밌드', 0.7622578740119934),
 ('재밋는듯', 0.7587648630142212),
 ('재밋구', 0.7563505172729492),
 ('재밋게봣', 0.7553620338439941)]

### Skipgram, negative=5 인 경우

In [29]:
# 모델 생성
from gensim.models import Word2Vec
model_sg_m11=Word2Vec(corpus, window=3, min_count=3, vector_size=100, sg=1, negative=5)

In [30]:
# 특어 단어와 유사한 단어 추출 : 이정재
model_sg_m11.wv.most_similar('이정재', topn=20)

[('이범수', 0.8303846716880798),
 ('송강호', 0.8211636543273926),
 ('공유', 0.7895262241363525),
 ('김범수', 0.740130603313446),
 ('이성민', 0.7400224804878235),
 ('김남길', 0.7373794913291931),
 ('송광호', 0.7195315361022949),
 ('정우성', 0.7160460948944092),
 ('이진욱', 0.7115742564201355),
 ('이병헌', 0.7082733511924744),
 ('마동석', 0.7074337601661682),
 ('주지훈', 0.7011495232582092),
 ('김명민', 0.7001257538795471),
 ('박해일', 0.6994004249572754),
 ('김성균', 0.6966407299041748),
 ('리암', 0.6894350647926331),
 ('김윤석', 0.6888411641120911),
 ('조재현', 0.688133180141449),
 ('리슨', 0.683452308177948),
 ('곽도원', 0.683199942111969)]

In [31]:
# 특어 단어와 유사한 단어 추출 : 재밌
model_sg_m11.wv.most_similar('재밌', topn=20)

[('재미있', 0.8972207307815552),
 ('재밌네', 0.8325872421264648),
 ('잼남', 0.8307830691337585),
 ('재밋음', 0.8301081657409668),
 ('재밌어', 0.8229067325592041),
 ('재밌었', 0.8228879570960999),
 ('재밋어용', 0.7854608297348022),
 ('쟈밋', 0.7840077877044678),
 ('재밋엇음', 0.7797922492027283),
 ('재밋었습니', 0.7733750939369202),
 ('재밋었음', 0.772960901260376),
 ('잼슴', 0.772779643535614),
 ('재미있었', 0.7700456380844116),
 ('재밋네용', 0.7663669586181641),
 ('재밋어', 0.7655197381973267),
 ('재밌아', 0.7641509175300598),
 ('재밋네', 0.7639943361282349),
 ('재밋엇어용', 0.7620124220848083),
 ('재밋습니', 0.7598965764045715),
 ('재밋구', 0.7587760090827942)]

### CBOW, negative=10 인 경우

In [32]:
model_sg_c1=Word2Vec(corpus, window=3, min_count=3, vector_size=100, sg=0, negative=10)
#sg=1 skip-Gram, sg=0 CBOW


In [34]:
model_sg_c1.wv.most_similar('이정재',topn=20)

[('김윤석', 0.794257402420044),
 ('이범수', 0.7922739386558533),
 ('조재현', 0.7467256188392639),
 ('이성민', 0.7297304272651672),
 ('송강호', 0.7295792698860168),
 ('김범수', 0.7288150787353516),
 ('공유', 0.7261736392974854),
 ('이진욱', 0.7156908512115479),
 ('주지훈', 0.7117321491241455),
 ('김남길', 0.705185055732727),
 ('김성오', 0.678199291229248),
 ('차승원', 0.6752227544784546),
 ('박해일', 0.6664124727249146),
 ('민호', 0.6658392548561096),
 ('이병헌', 0.6652021408081055),
 ('곽도원', 0.6631463170051575),
 ('마동석', 0.6628413796424866),
 ('하정우', 0.6620701551437378),
 ('톰하디', 0.658588707447052),
 ('최태준', 0.6563006639480591)]

In [35]:
model_sg_c1.wv.most_similar('재밌',topn=20)

[('재미있', 0.8859228491783142),
 ('재밌네', 0.8155606389045715),
 ('재밌어', 0.8024884462356567),
 ('재밋음', 0.8014519214630127),
 ('재밌었', 0.7821163535118103),
 ('재밌는', 0.7343965172767639),
 ('재밋어', 0.7116097807884216),
 ('재미있었', 0.7085869312286377),
 ('재밌더', 0.7054029703140259),
 ('재미있네', 0.7025340795516968),
 ('잼남', 0.698547899723053),
 ('재밋엇어', 0.6902064681053162),
 ('재미있어', 0.6876171231269836),
 ('재밌던', 0.6780270934104919),
 ('재밋', 0.6712715029716492),
 ('재밋네', 0.6701642274856567),
 ('재밌고', 0.6426544189453125),
 ('재밌다', 0.641755998134613),
 ('재미있는', 0.6314316987991333),
 ('재밌구', 0.6305357813835144)]

### CBOW, negative=5 인 경우

In [36]:
model_sg_c2=Word2Vec(corpus, window=3, min_count=3, vector_size=100, sg=0, negative=5)

In [37]:
model_sg_c2.wv.most_similar('이정재',topn=20)

[('이범수', 0.7512829303741455),
 ('공유', 0.7494287490844727),
 ('송강호', 0.7276856303215027),
 ('김윤석', 0.7108481526374817),
 ('조재현', 0.7094298005104065),
 ('김범수', 0.6935282349586487),
 ('주지훈', 0.6826339364051819),
 ('이병헌', 0.6819082498550415),
 ('이성민', 0.6688457727432251),
 ('김성오', 0.6629663109779358),
 ('박해일', 0.661796510219574),
 ('이진욱', 0.6490702629089355),
 ('요한', 0.6471355557441711),
 ('김남길', 0.6394618153572083),
 ('황정민', 0.6319329142570496),
 ('하정우', 0.6302791237831116),
 ('송광호', 0.6301818490028381),
 ('차승원', 0.6282170414924622),
 ('마동석', 0.6280471682548523),
 ('곽도원', 0.6277230978012085)]

In [38]:
model_sg_c2.wv.most_similar('재밌',topn=20)

[('재미있', 0.8917093873023987),
 ('재밌네', 0.8134878873825073),
 ('재밌어', 0.801764726638794),
 ('재밋음', 0.798936665058136),
 ('재밌었', 0.7944589257240295),
 ('재미있었', 0.7230113744735718),
 ('재밌는', 0.7212602496147156),
 ('재미있네', 0.7194743156433105),
 ('재밋엇어', 0.7069931626319885),
 ('재밌더', 0.6937555074691772),
 ('재밋네', 0.6932879090309143),
 ('재밋어', 0.6930973529815674),
 ('꿀잼', 0.6856180429458618),
 ('잼남', 0.6833381652832031),
 ('재미있어', 0.6791792511940002),
 ('재밌던', 0.6709412932395935),
 ('재밋', 0.6666939854621887),
 ('재밌다', 0.6664193868637085),
 ('재밌고', 0.6459708213806152),
 ('재밋엇', 0.6427157521247864)]

### OOV(Out of Vocabulary) 문제

In [39]:
# corpus에 없는 단어 확인 : 우주평화 
'우주평화' in model_sg_m10.wv.key_to_index
#in 있다 없다 

False

In [40]:
# corpus에 없는 단어의 임베딩 벡터 확인 
model_sg_m10.wv['우주평화']
#키값이 없어서  오류 뜸

KeyError: "Key '우주평화' not present"

## 2. FastText 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/fasttext.html

In [41]:
# FastText 모델 생성 및 학습
# window=3, min_count=3, min_n=2, max_n=2
from gensim.models import FastText

ft_model = FastText(corpus, window=3, min_count=3, min_n=2, max_n=2, vector_size=100, negative=10, sg=1)

In [44]:
# 특정 단어와 유사한 단어 추출 : 이정재
ft_model.wv['이정재']

array([-0.10420658,  0.4782547 , -0.0734222 , -0.18539152,  0.06711324,
       -0.29417127, -0.30353355,  0.43321753,  0.6887632 ,  0.14565966,
       -0.20765463,  0.08554256, -0.3152085 , -0.05068115, -0.29658407,
       -0.03177863,  0.00529682, -0.35627213,  0.36520293, -0.26118702,
        0.3141169 , -0.28386214,  0.03805301, -0.23601897, -0.11212411,
       -0.03757145, -0.16104628,  0.08161667, -0.7165781 , -0.27323088,
        0.07186786, -0.6342844 ,  0.15965705, -0.15238477, -0.13091564,
       -0.34543055,  0.22068958, -0.41317925, -0.23255846,  0.201599  ,
       -0.29128054,  0.11306324, -0.22710022, -0.16785064, -0.6844328 ,
       -0.29341656, -0.4112219 , -0.25650352, -0.01918911, -0.27601856,
       -0.15809044,  0.01337032, -0.18170004,  0.07502346,  0.01185668,
       -0.04456443,  0.40374637, -0.07583304, -0.10619712,  0.21640603,
       -0.12027104, -0.3013299 , -0.09689124,  0.29121858, -0.10283736,
       -0.39640847,  0.19199026, -0.14037311, -0.05883787, -0.34

In [51]:
# corpus에 없는 단어 확인 : 우주평화 
'우주평화'in ft_model.wv.key_to_index

False

In [50]:
# corpus에 없는 단어의 임베딩 벡터 확인 
ft_model.wv['이정재']

array([-0.10420658,  0.4782547 , -0.0734222 , -0.18539152,  0.06711324,
       -0.29417127, -0.30353355,  0.43321753,  0.6887632 ,  0.14565966,
       -0.20765463,  0.08554256, -0.3152085 , -0.05068115, -0.29658407,
       -0.03177863,  0.00529682, -0.35627213,  0.36520293, -0.26118702,
        0.3141169 , -0.28386214,  0.03805301, -0.23601897, -0.11212411,
       -0.03757145, -0.16104628,  0.08161667, -0.7165781 , -0.27323088,
        0.07186786, -0.6342844 ,  0.15965705, -0.15238477, -0.13091564,
       -0.34543055,  0.22068958, -0.41317925, -0.23255846,  0.201599  ,
       -0.29128054,  0.11306324, -0.22710022, -0.16785064, -0.6844328 ,
       -0.29341656, -0.4112219 , -0.25650352, -0.01918911, -0.27601856,
       -0.15809044,  0.01337032, -0.18170004,  0.07502346,  0.01185668,
       -0.04456443,  0.40374637, -0.07583304, -0.10619712,  0.21640603,
       -0.12027104, -0.3013299 , -0.09689124,  0.29121858, -0.10283736,
       -0.39640847,  0.19199026, -0.14037311, -0.05883787, -0.34

In [49]:
# corpus에 없는 단어와 유사한 단어추출 
ft_model.wv.most_similar('우주평화')

#같이 많이 쓰는 단어관련된 앱을 만들때 사용 임베딩 
#keras에 layer에 등록 impadding 패턴을 학 하면 corpus기반으로 할 수 있음 
# 그거에 맞쳐서 임패딩 배터를 만드는 것이다. #임패딩동작 방법을 한번 생각해보기 
# 긍정 시각화 

[('우주', 0.8180536031723022),
 ('우주비행사', 0.8124653100967407),
 ('평화', 0.7982617616653442),
 ('우장', 0.7980755567550659),
 ('우방', 0.7856310606002808),
 ('쉘', 0.7818641662597656),
 ('지구대', 0.7791106700897217),
 ('계몽', 0.7779002785682678),
 ('아우슈비츠', 0.7771241068840027),
 ('켤', 0.7759838700294495)]