# 영화 리뷰 워드 임베딩 (Word2Vec, FastText)
- gensim 라이브러리 사용 : pip install gensim
    - Word2Vec : models.Word2Vec
    - FastText : models.FastText

## 0. 데이터 준비
* 토큰화가 잘 되어 있는 filtered 데이터 사용

In [1]:
import pandas as pd

# Path to the pre-tokenized ("filtered") 2016 Korean movie review CSV
data_filename = './data/Korean_movie_reviews_2016_filtered.csv'

# Load the reviews into a DataFrame and preview the first rows
review_df = pd.read_csv(data_filename)
review_df.head()

Unnamed: 0,review,rate
0,아니 딴 그렇 비 비탄 총 대체 왜 들 온겨,7
1,진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임,1
2,역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 ...,10
3,온종일 불편한 피 범벅 일,6
4,답답함 극치 움직일 잇으 좀 움직여 어지간히 좀비 봣으 얼 타고 때려 잡 때 되 않냐,1


In [2]:
# Summarize columns, dtypes and non-null counts (shows that `review` has missing values)
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788189 entries, 0 to 788188
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   review  785448 non-null  object
 1   rate    788189 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 12.0+ MB


In [3]:
# Build one token list per review.
# The `review` column has object dtype and contains missing values (NaN).
# Calling str() on NaN yields the literal string 'nan', which would end up in
# the corpus as a bogus token, so missing reviews are replaced with an empty
# string first and become empty token lists. This keeps `corpus` aligned
# index-for-index with `review_df`.
review_list = review_df.review.fillna('').astype(str).tolist()
corpus = [review.split() for review in review_list]
corpus[:5]

[['아니', '딴', '그렇', '비', '비탄', '총', '대체', '왜', '들', '온겨'],
 ['진심',
  '쓰레기',
  '영화',
  '만들',
  '무서',
  '알',
  '쫄아',
  '틀었',
  '이건',
  '뭐',
  '웃',
  '거리',
  '없는',
  '쓰레기',
  '영화',
  '임'],
 ['역대',
  '좀비',
  '영화',
  '가장',
  '최고다',
  '원작',
  '만화',
  '읽어',
  '보려',
  '영화',
  '보고',
  '결정',
  '하려',
  '감독',
  '간츠',
  '실사',
  '했',
  '사람',
  '거르려',
  '그냥',
  '봤',
  '정말',
  '흠잡',
  '없는',
  '최고',
  '좀비',
  '영화',
  '잔인',
  '거',
  '싫어하지',
  '참고',
  '볼',
  '만하',
  '로미',
  '인물',
  '왜',
  '그런',
  '모르'],
 ['온종일', '불편한', '피', '범벅', '일'],
 ['답답함',
  '극치',
  '움직일',
  '잇으',
  '좀',
  '움직여',
  '어지간히',
  '좀비',
  '봣으',
  '얼',
  '타고',
  '때려',
  '잡',
  '때',
  '되',
  '않냐']]

## 1. Word2Vec 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/word2vec.html

### Skipgram, negative=10 인 경우

In [18]:
# Build and train a Word2Vec model:
# skip-gram (sg=1), window=3, min_count=5, 100-dim vectors, 10 negative samples
from gensim.models import Word2Vec

model_sg_n10 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=5,
    sg=1,
    negative=10,
)

In [19]:
# Inspect the embedding vector learned for a single word
model_sg_n10.wv['이정재']

array([-0.2658595 ,  0.39390358,  0.02224205, -0.8661706 , -0.21638532,
        0.08719538,  0.48914543,  0.3219268 ,  0.8454138 , -0.36106464,
       -0.2307851 , -0.01457302, -0.1531674 , -0.01445221,  0.2944134 ,
        0.5008894 , -0.23787107,  0.0691911 ,  0.38722304, -0.20807546,
       -0.10368502,  0.13350669, -0.18049921,  0.10634366,  0.08439777,
       -0.22513582,  0.13746312, -0.24576361,  0.29121244,  0.4292597 ,
        0.02653951,  0.0331934 , -0.04380709,  0.09234158, -0.12247363,
       -0.61935043, -0.28282064,  0.19389085, -0.3499114 , -0.25204843,
       -0.31342804, -0.11734725, -0.29789641, -0.7170295 , -0.19706689,
        0.4377187 , -0.20603645, -0.55613095, -0.08065166,  0.4401043 ,
        0.23163581, -0.13827622, -0.4534499 , -0.21800683,  0.03612949,
        0.14898993,  0.00961852, -0.30055928,  0.32316798,  0.19756322,
       -0.6172792 , -0.60917664,  0.22700721, -0.40709478, -0.5209116 ,
       -0.40875262,  0.4278314 , -0.28481165, -0.10012425, -0.07

In [20]:
# Check the dimensionality of the embedding vector (should equal vector_size)
len(model_sg_n10.wv['이정재'])

100

In [21]:
# Compute the cosine similarity between two words
model_sg_n10.wv.similarity('이정재','정우성')

0.7612108

In [22]:
# Retrieve the 20 words most similar to the given word
model_sg_n10.wv.most_similar('이정재',topn=20)

[('이범수', 0.8243438005447388),
 ('공유', 0.8150244355201721),
 ('송강호', 0.8125758767127991),
 ('김범수', 0.7689306735992432),
 ('정우성', 0.7612107992172241),
 ('이병헌', 0.7574672102928162),
 ('김남길', 0.7496175169944763),
 ('박해일', 0.7491560578346252),
 ('마동석', 0.7451098561286926),
 ('주지훈', 0.7345482707023621),
 ('리암', 0.7294881343841553),
 ('조재현', 0.7278708219528198),
 ('김명민', 0.71332186460495),
 ('이성민', 0.7089771628379822),
 ('송광호', 0.7087903022766113),
 ('곽도원', 0.7044089436531067),
 ('김윤석', 0.7032412886619568),
 ('이진욱', 0.7008427977561951),
 ('오지호', 0.6980786919593811),
 ('정진영', 0.6962241530418396)]

### Skipgram, negative=5 인 경우

In [13]:
# Train a skip-gram Word2Vec model (sg=1) with 5 negative samples;
# every other hyperparameter matches the negative=10 model
model_sg_n5 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=5,
    sg=1,
    negative=5,
)

In [23]:
# Retrieve the 20 words most similar to a given word: 이정재
model_sg_n5.wv.most_similar('이정재',topn=20)

[('송강호', 0.8257206082344055),
 ('이범수', 0.8154950141906738),
 ('공유', 0.7617592215538025),
 ('김범수', 0.7581908702850342),
 ('이병헌', 0.7308306694030762),
 ('이성민', 0.7300959825515747),
 ('조재현', 0.7200732827186584),
 ('김윤석', 0.7166599631309509),
 ('마동석', 0.714489758014679),
 ('박해일', 0.7137069702148438),
 ('리암', 0.7012619376182556),
 ('송광호', 0.6955860257148743),
 ('정우성', 0.6945518851280212),
 ('황정민', 0.6920634508132935),
 ('주지훈', 0.688340425491333),
 ('요한', 0.6882101893424988),
 ('김남길', 0.6853874325752258),
 ('김성균', 0.6836909055709839),
 ('김명민', 0.6809912323951721),
 ('곽도원', 0.680762529373169)]

In [12]:
# Retrieve the 20 words most similar to a given word: 재밌
model_sg_n5.wv.most_similar('재밌',topn=20)

[('재미있', 0.9003034830093384),
 ('재밌네', 0.8160199522972107),
 ('잼남', 0.8119663000106812),
 ('재밌었', 0.8117966651916504),
 ('재밋음', 0.7992177605628967),
 ('재밌어', 0.7880213260650635),
 ('재밋었음', 0.7675364017486572),
 ('재밋엇음', 0.7616010904312134),
 ('재미있었', 0.7539824843406677),
 ('재밋어용', 0.74974125623703),
 ('재밋구', 0.7482808828353882),
 ('잼슴', 0.7479516267776489),
 ('재밋네', 0.7477021813392639),
 ('재밋엇', 0.7448346018791199),
 ('재밌아', 0.7411665916442871),
 ('재밌슴', 0.7400686144828796),
 ('재밋습니', 0.7386003732681274),
 ('재밋엇어용', 0.7379406690597534),
 ('재미있네', 0.7378557920455933),
 ('잼난', 0.7344793677330017)]

### CBOW, negative=10 인 경우

In [15]:
# Train a CBOW Word2Vec model (sg=0) with 10 negative samples
model_cb_n10 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=5,
    sg=0,
    negative=10,
)

In [24]:
# Retrieve the 20 words most similar to 이정재 (CBOW, negative=10)
model_cb_n10.wv.most_similar('이정재',topn=20)

[('이범수', 0.7677269577980042),
 ('공유', 0.7487688660621643),
 ('김윤석', 0.7188482880592346),
 ('김범수', 0.7028162479400635),
 ('송강호', 0.6919775605201721),
 ('조재현', 0.6860628128051758),
 ('이성민', 0.6822505593299866),
 ('주지훈', 0.6788458228111267),
 ('이진욱', 0.6760426163673401),
 ('김남길', 0.6677210927009583),
 ('박해일', 0.6664465665817261),
 ('하정우', 0.6517153382301331),
 ('마동석', 0.6511935591697693),
 ('차승원', 0.6497564315795898),
 ('정우성', 0.6421746015548706),
 ('이병헌', 0.6376791596412659),
 ('김성오', 0.6360779404640198),
 ('김성균', 0.6333326697349548),
 ('조정석', 0.6309075951576233),
 ('김영애', 0.6284968256950378)]

In [26]:
# Retrieve the 20 words most similar to 재밌 (CBOW, negative=10)
model_cb_n10.wv.most_similar('재밌',topn=20)

[('재미있', 0.8904844522476196),
 ('재밋음', 0.7990494966506958),
 ('재밌어', 0.7963995337486267),
 ('재밌네', 0.7950748801231384),
 ('재밌었', 0.7788784503936768),
 ('재미있었', 0.7061181664466858),
 ('재밌는', 0.7050960659980774),
 ('재미있네', 0.6997734904289246),
 ('재밌더', 0.6996801495552063),
 ('재밌던', 0.6981449127197266),
 ('잼남', 0.6959388256072998),
 ('재밋어', 0.6947115659713745),
 ('재밋엇어', 0.6791515350341797),
 ('재밋네', 0.6745824813842773),
 ('재미있던', 0.6690869331359863),
 ('재미있어', 0.6596861481666565),
 ('재밌다', 0.6564862132072449),
 ('꿀잼', 0.6544492244720459),
 ('재밋었어', 0.6487863063812256),
 ('재밋', 0.6480990648269653)]

### CBOW, negative=5 인 경우

In [27]:
# Train a CBOW Word2Vec model (sg=0) with 5 negative samples
model_cb_n5 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=5,
    sg=0,
    negative=5,
)

In [28]:
# Retrieve the 20 words most similar to 이정재 (CBOW, negative=5)
model_cb_n5.wv.most_similar('이정재',topn=20)

[('이범수', 0.7771064043045044),
 ('공유', 0.7497997879981995),
 ('송강호', 0.728468656539917),
 ('김윤석', 0.7275027632713318),
 ('조재현', 0.7008674740791321),
 ('김남길', 0.6988293528556824),
 ('이성민', 0.6951977014541626),
 ('박해일', 0.6946074962615967),
 ('김범수', 0.6829924583435059),
 ('이진욱', 0.6780548691749573),
 ('송광호', 0.6738556623458862),
 ('주지훈', 0.6633802652359009),
 ('마동석', 0.6504395008087158),
 ('정우성', 0.6396017074584961),
 ('김성오', 0.6380428671836853),
 ('이병헌', 0.6282176971435547),
 ('박철민', 0.6275760531425476),
 ('곽도원', 0.6270774602890015),
 ('하시모토', 0.6259323954582214),
 ('하정우', 0.6235322952270508)]

In [29]:
# Retrieve the 20 words most similar to 재밌 (CBOW, negative=5)
model_cb_n5.wv.most_similar('재밌',topn=20)

[('재미있', 0.905611515045166),
 ('재밌어', 0.8166906833648682),
 ('재밌네', 0.816689670085907),
 ('재밋음', 0.8064050674438477),
 ('재밌었', 0.8040375709533691),
 ('재밋어', 0.7425823211669922),
 ('재밌는', 0.7299968004226685),
 ('재밌더', 0.7283905744552612),
 ('재미있네', 0.7266055941581726),
 ('재미있었', 0.72214674949646),
 ('잼남', 0.7128613591194153),
 ('재밌던', 0.7070305347442627),
 ('재미있어', 0.6954905986785889),
 ('재밋엇어', 0.6864179968833923),
 ('재밋네', 0.6824601888656616),
 ('재밋었', 0.6784105896949768),
 ('재밋', 0.6752181649208069),
 ('꿀잼', 0.6739833354949951),
 ('재밋었어', 0.6660357713699341),
 ('재밌다', 0.6609926223754883)]

### OOV(Out of Vocabulary) 문제

In [38]:
# Train a CBOW Word2Vec model (window=5) used to demonstrate the OOV problem
model = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=5,
    min_count=5,
    sg=0,
    negative=5,
)

In [39]:
# Check whether the word 우주평화 is part of the model's vocabulary
'우주평화' in model.wv.key_to_index

False

In [40]:
# Look up the embedding of an out-of-vocabulary word:
# Word2Vec has no vector for it, so this raises KeyError (intentional demonstration)
model.wv['우주평화']

KeyError: "Key '우주평화' not present"

## 2. FastText 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/fasttext.html

In [32]:
# Build and train a FastText model:
# skip-gram (sg=1), window=3, min_count=3, character n-grams of length 2
# only (min_n=2, max_n=2), 100-dim vectors, 10 negative samples
from gensim.models import FastText

ft_model = FastText(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    min_n=2,
    max_n=2,
    sg=1,
    negative=10,
)

In [33]:
# Inspect the embedding vector for a specific word: 이정재
ft_model.wv['이정재']

array([-0.2577292 ,  0.17329368, -0.3754199 , -0.32856494, -0.12742265,
       -0.6323372 , -0.2217106 ,  0.31532785,  0.36865568,  0.14371884,
       -0.40570968,  0.03099389, -0.41982985, -0.19070256,  0.00285044,
        0.03165355,  0.04903513, -0.39672107,  0.41524467, -0.2832387 ,
        0.3583615 , -0.05406259,  0.20079935, -0.12084372, -0.12057388,
       -0.35559556, -0.03268985, -0.1483912 , -0.2697585 , -0.39387232,
        0.06975166, -0.6311543 ,  0.13582619, -0.06232803, -0.34265882,
       -0.09640139,  0.2946102 , -0.17576402, -0.16427256,  0.00104353,
       -0.4782653 ,  0.19217606, -0.461092  , -0.32331085, -0.70007265,
       -0.08973248, -0.26611832, -0.26291585,  0.27970874, -0.00636222,
       -0.02924528, -0.11496314, -0.21728334,  0.23375496, -0.14219128,
       -0.00859776,  0.3326637 ,  0.24742556,  0.18198338, -0.08031321,
        0.0380939 ,  0.01491066, -0.00566604,  0.0926085 , -0.3804329 ,
        0.0331972 ,  0.4536224 , -0.19083178,  0.23906286, -0.18

In [34]:
# Check whether the word 우주평화 is part of the FastText model's full-word vocabulary
'우주평화' in ft_model.wv.key_to_index

False

In [35]:
# Look up the embedding of an out-of-vocabulary word:
# unlike Word2Vec, FastText still returns a vector (derived from the word's character n-grams)
ft_model.wv['우주평화']

array([ 1.53381631e-01,  2.95297448e-02, -2.13220686e-01,  9.47948918e-02,
       -3.48976441e-02, -3.84734213e-01, -6.18579015e-02,  9.94628131e-01,
        4.88372713e-01,  3.96967351e-01,  1.35262683e-01,  2.41364762e-01,
        1.26746342e-01,  2.42449760e-01, -2.97186941e-01, -1.83013678e-01,
        2.77399756e-02, -3.40905726e-01,  6.17765114e-02,  9.24847648e-02,
        8.71172547e-02, -1.47875518e-01, -2.63922811e-02,  5.11507019e-02,
        4.79513295e-02, -1.71684414e-01, -4.08334076e-01, -2.61529386e-01,
       -2.35177487e-01, -4.22659785e-01,  2.48135477e-01, -2.21979424e-01,
        6.08067214e-03, -2.50706732e-01, -4.01603460e-01,  4.28275578e-02,
        2.30779618e-01,  1.40266714e-03, -2.71038949e-01, -8.20936635e-02,
       -2.09011197e-01, -4.93200719e-02, -1.04448959e-01,  4.79785576e-02,
       -2.22055942e-01,  1.37243152e-01,  9.86741409e-02,  1.22004749e-04,
       -1.19684890e-01,  1.20177746e-01,  2.06386656e-01,  3.10169339e-01,
        8.70683193e-02, -

In [36]:
# Retrieve the words most similar to an out-of-vocabulary word
ft_model.wv.most_similar('우주평화')

[('우주', 0.8236703276634216),
 ('우주비행사', 0.8051888942718506),
 ('우방', 0.8020423650741577),
 ('우장', 0.7969403266906738),
 ('평화', 0.795060932636261),
 ('우주인', 0.7949032187461853),
 ('지구촌', 0.787720263004303),
 ('대영제국', 0.7865938544273376),
 ('아우슈비츠', 0.7850265502929688),
 ('쑥대밭', 0.7777277231216431)]