In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm # 진행률 프로세스바
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from konlpy.tag import Okt
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the preprocessed review corpus, reading only the two columns used below.
df = pd.read_parquet(
    'after_preprocessing.parquet',
    columns=['source', 'content'],
)

In [47]:
# Load the keyword table from 'top100_keyword.parquet'
# (the lookup in the next cell shows `source` and `keyword` columns).
keywordList = pd.read_parquet('top100_keyword.parquet')

In [52]:
# Show the keyword row(s) whose `source` is the attraction '종달리수국길'.
keywordList.loc[keywordList['source'].eq('종달리수국길')]

Unnamed: 0,source,keyword
95,종달리수국길,사진 종달리해안도로 성산일출봉 동쪽 산책 계획 바람 바다 기도 식당 여름 자연 거리...


In [5]:
# Summarise the full corpus: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361706 entries, 0 to 361705
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   source   361706 non-null  object
 1   content  361706 non-null  object
dtypes: object(2)
memory usage: 5.5+ MB


In [3]:
# Attractions (values of the `source` column) selected for the Word2Vec experiment.
tourlist = [
    "에코랜드 테마파크",
    "성산일출봉(UNESCO 세계자연유산)",
    "천아숲길 천아계곡",
    "종달리수국길",
    "녹산로유채꽃길",
]

In [4]:
# Keep only the rows belonging to the selected attractions. reset_index()
# renumbers the rows from 0 and keeps the original row label in an `index` column.
in_tourlist = df['source'].isin(tourlist)
tourDf = df.loc[in_tourlist].reset_index()

In [5]:
# Summarise the filtered frame: row count, columns, non-null counts and dtypes.
tourDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2327 entries, 0 to 2326
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   index    2327 non-null   int64 
 1   source   2327 non-null   object
 2   content  2327 non-null   object
dtypes: int64(1), object(2)
memory usage: 54.7+ KB


In [6]:
def tokenizedOkt(sentences, sources=None):
    """Tokenize documents with Okt, dropping verbs and stop words.

    Each document is POS-tagged with ``Okt.pos``. A token is kept unless it is
    tagged ``'Verb'`` or it is a stop word. The stop words are a fixed list of
    filler words plus the name of the attraction the document belongs to
    (both as written and with spaces removed).

    The previous implementation never advanced its row counter, so every
    document was filtered with the attraction name of the first row only.
    The name is now looked up per document, by position.

    Args:
        sentences: Iterable of document strings, one per review.
        sources: Optional sequence of attraction names aligned
            position-by-position with ``sentences``. When omitted, the global
            ``tourDf['source']`` is used, which keeps existing
            ``tokenizedOkt(tourDf['content'])`` calls working.

    Returns:
        A list with one token list per document, in input order.

    Raises:
        IndexError: If ``sources`` has fewer entries than ``sentences``.
    """
    if sources is None:
        sources = tourDf['source']
    # Materialise to a plain list so the lookup below is positional and does
    # not depend on the DataFrame's index labels.
    source_names = list(sources)

    okt = Okt()
    # Stop words shared by every document; built once instead of per iteration.
    common_stop_words = {'있는', '있고', '있으니', '입니다.', '이렇게', '있을', '네이버', '블로그'}

    progress = tqdm(sentences)
    progress.set_description('Processing tokenized')

    total_tokenized_list = []
    for position, tourInfo in enumerate(progress):
        source_name = source_names[position]
        stop_word = common_stop_words | {source_name, source_name.replace(' ', '')}

        tokenized_list = [
            word
            for word, tag in okt.pos(tourInfo)
            if tag != 'Verb' and word not in stop_word
        ]
        total_tokenized_list.append(tokenized_list)

    return total_tokenized_list

In [101]:
# Inspect the `content` value at row label 1409, which turned out to be empty;
# such rows are removed in the following cell.
tourDf.loc[1409, 'content']

nan

In [7]:
# Treat empty strings as missing, drop rows without content and renumber from 0.
# - The replacement is assigned back rather than done with a chained
#   `inplace=True` on a column selection, which is not guaranteed to modify
#   `tourDf` (it is a no-op under pandas Copy-on-Write).
# - `drop=True` avoids adding a stray `level_0` column and lets the cell be
#   re-run without "cannot insert level_0, already exists".
tourDf = (
    tourDf
    .assign(content=tourDf['content'].replace('', np.nan))
    .dropna(subset=['content'])
    .reset_index(drop=True)
)

In [8]:
# Number of documents left after dropping the empty ones.
len(tourDf['content'])

2326

In [9]:
# Tokenize every remaining document. Despite its name, `tokenDf` is a list of
# token lists (one per document), not a DataFrame.
tokenDf = tokenizedOkt(tourDf['content'])

  0%|          | 0/2326 [00:00<?, ?it/s]

## Word2Vec 성능확인
- CBOW = 주변 단어들로 중심 단어를 예측, skip-gram = 중심 단어로 주변 단어들을 예측
- CBOW는 하나의 문맥(윈도우)에서 중심 단어에 대해 한 번만 학습이 일어나지만,
- skip-gram은 중심 단어와 각 주변 단어의 쌍마다 학습하므로 단어별로 여러 번 학습된다.


- size = 워드 벡터의 특징 값. 즉, 임베딩된 벡터의 차원 (gensim 4.x에서는 `vector_size`로 이름이 바뀌었다).
- window = 컨텍스트 윈도우 크기 (중심 단어 앞뒤로 고려하는 최대 단어 수)
- min_count = 단어 최소 빈도 수 제한 (빈도가 이보다 적은 단어들은 학습하지 않는다.)
- workers = 학습에 사용할 워커 스레드 수
- sg = 0은 CBOW, 1은 Skip-gram

### skip-gram

In [53]:
# Skip-gram baseline: 100-dimensional vectors, context window of 5.
# `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4).
# NOTE(review): with workers=4 the learned vectors can differ slightly between
# runs; gensim documents workers=1 as a requirement for exact reproducibility.
word2vec_model_skipgram = Word2Vec(
    sentences=tokenDf,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [54]:
# Shape of the embedding matrix: (vocabulary size, vector dimension).
word2vec_model_skipgram.wv.vectors.shape

(11146, 100)

In [13]:
# Words most similar to '수국' (hydrangea) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('수국')

[('달리', 0.733333945274353),
 ('피어', 0.6924291253089905),
 ('노지', 0.6785334348678589),
 ('꽃들이', 0.6482338905334473),
 ('파스텔', 0.632415235042572),
 ('활짝', 0.6301947832107544),
 ('성하고', 0.6262366771697998),
 ('담벼락', 0.6247830390930176),
 ('꽃망울', 0.6224348545074463),
 ('벗꽃', 0.6200156211853027)]

In [26]:
# Words most similar to '화사한' (bright, radiant) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('화사한')

[('초여름', 0.9250891208648682),
 ('화사하게', 0.9246084690093994),
 ('눈부시게', 0.9207310676574707),
 ('완연', 0.9206932783126831),
 ('화려하게', 0.9119160175323486),
 ('벗꽃', 0.9106734991073608),
 ('향연', 0.9082958102226257),
 ('물감', 0.906929612159729),
 ('아름답게', 0.9067778587341309),
 ('이맘', 0.9061497449874878)]

In [55]:
# Words most similar to '산책' (walk, stroll) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('산책')

[('걷기', 0.7851150035858154),
 ('가볍게', 0.7701570987701416),
 ('천천히', 0.7641201019287109),
 ('산책길', 0.7541016340255737),
 ('둘러보기', 0.7435195446014404),
 ('트랙', 0.7385668158531189),
 ('산책로', 0.7258861064910889),
 ('한적한', 0.723000168800354),
 ('사색', 0.7125638127326965),
 ('한적하게', 0.7119795083999634)]

In [56]:
# Skip-gram variant: same 100-dimensional vectors, context window narrowed to 3.
word2vec_model_skipgram = Word2Vec(
    sentences=tokenDf,
    size=100,
    window=3,
    min_count=5,
    workers=4,
    sg=1,
)

In [28]:
# Words most similar to '수국' (hydrangea) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('수국')

[('달리', 0.7263014316558838),
 ('피어', 0.6721788644790649),
 ('꽃들이', 0.670640230178833),
 ('길가', 0.6597679257392883),
 ('활짝', 0.6579025983810425),
 ('꽃길', 0.657056450843811),
 ('동백꽃', 0.6507202982902527),
 ('담벼락', 0.6462448239326477),
 ('벗꽃', 0.6460666656494141),
 ('화사하게', 0.6455676555633545)]

In [29]:
# Words most similar to '화사한' (bright, radiant) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('화사한')

[('화사하게', 0.9538018703460693),
 ('화려하게', 0.948042631149292),
 ('완연', 0.9470117688179016),
 ('발해', 0.9387092590332031),
 ('빨갛게', 0.936212420463562),
 ('울긋불긋', 0.9354969263076782),
 ('초여름', 0.9341397881507874),
 ('환하게', 0.9340221285820007),
 ('콜라보', 0.9327752590179443),
 ('늦가을', 0.9308168888092041)]

In [57]:
# Words most similar to '산책' (walk, stroll) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('산책')

[('걷기', 0.7937313318252563),
 ('천천히', 0.7898274660110474),
 ('가볍게', 0.7863270044326782),
 ('사색', 0.7787995338439941),
 ('한적하게', 0.7655798196792603),
 ('둘러보기', 0.7573834657669067),
 ('트랙', 0.7500003576278687),
 ('산책길', 0.7492039203643799),
 ('조용히', 0.742107093334198),
 ('접근성', 0.741938591003418)]

In [58]:
# Skip-gram variant: 100-dimensional vectors, context window of 1
# (only the immediately adjacent words are used as context).
word2vec_model_skipgram = Word2Vec(
    sentences=tokenDf,
    size=100,
    window=1,
    min_count=5,
    workers=4,
    sg=1,
)

In [31]:
# Words most similar to '수국' (hydrangea) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('수국')

[('튤립', 0.7738156318664551),
 ('동백꽃', 0.7635685205459595),
 ('꽃밭', 0.7510963678359985),
 ('유채', 0.7509618997573853),
 ('유채꽃', 0.7378582954406738),
 ('동백', 0.7374162673950195),
 ('벚꽃', 0.732453465461731),
 ('청보리', 0.7310733795166016),
 ('꽃들이', 0.7246082425117493),
 ('코스모스', 0.7151890993118286)]

In [32]:
# Words most similar to '화사한' (bright, radiant) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('화사한')

[('드러지게', 0.968941867351532),
 ('화사하게', 0.9653894901275635),
 ('형형색색', 0.9631203413009644),
 ('천지', 0.9590035676956177),
 ('화려하게', 0.9565664529800415),
 ('오색', 0.9557088017463684),
 ('성하게', 0.9528377652168274),
 ('콜라보', 0.952121376991272),
 ('울긋불긋', 0.9506763815879822),
 ('빨갛게', 0.950185239315033)]

In [59]:
# Words most similar to '산책' (walk, stroll) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('산책')

[('걷기', 0.8588511347770691),
 ('천천히', 0.8297776579856873),
 ('둘러보기', 0.8247727155685425),
 ('접근성', 0.8156402707099915),
 ('가볍게', 0.8035635948181152),
 ('찾기', 0.8008365631103516),
 ('바퀴', 0.7875552773475647),
 ('마로', 0.7825027704238892),
 ('운동', 0.7696425914764404),
 ('산책로', 0.7630506157875061)]

In [60]:
# Skip-gram variant: window kept at 1, vector dimension raised to 300.
word2vec_model_skipgram = Word2Vec(
    sentences=tokenDf,
    size=300,
    window=1,
    min_count=5,
    workers=4,
    sg=1,
)

In [34]:
# Words most similar to '수국' (hydrangea) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('수국')

[('동백꽃', 0.7745952010154724),
 ('튤립', 0.772774875164032),
 ('벚꽃', 0.7565107345581055),
 ('유채', 0.7555995583534241),
 ('동백', 0.7477849721908569),
 ('꽃밭', 0.7424488663673401),
 ('유채꽃', 0.7341821193695068),
 ('청보리', 0.7325412631034851),
 ('꽃들이', 0.7162615656852722),
 ('봄꽃', 0.7143723964691162)]

In [35]:
# Words most similar to '화사한' (bright, radiant) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('화사한')

[('드러지게', 0.9706772565841675),
 ('화사하게', 0.9667786359786987),
 ('형형색색', 0.9663385152816772),
 ('천지', 0.9630934000015259),
 ('화려하게', 0.9599868655204773),
 ('오색', 0.9584367275238037),
 ('성하게', 0.9574066400527954),
 ('콜라보', 0.9549382925033569),
 ('알록달록한', 0.9548102021217346),
 ('울긋불긋', 0.9540122151374817)]

In [61]:
# Words most similar to '산책' (walk, stroll) in the current skip-gram model.
word2vec_model_skipgram.wv.most_similar('산책')

[('걷기', 0.8593374490737915),
 ('천천히', 0.8280429840087891),
 ('둘러보기', 0.816077470779419),
 ('접근성', 0.8089202642440796),
 ('가볍게', 0.8063005805015564),
 ('찾기', 0.7998758554458618),
 ('바퀴', 0.7874067425727844),
 ('마로', 0.7863445281982422),
 ('드라이브', 0.7744019627571106),
 ('산책로', 0.7647382020950317)]

In [62]:
# Skip-gram variant: window of 1, vector dimension raised to 500.
word2vec_model_skipgram = Word2Vec(
    sentences=tokenDf,
    size=500,
    window=1,
    min_count=5,
    workers=4,
    sg=1,
)
# Print the nearest neighbours of the three probe words, one line per word.
for probe in ("수국", "화사한", "산책"):
    print(f'{probe} : {word2vec_model_skipgram.wv.most_similar(probe)}')

수국 : [('동백꽃', 0.7765324115753174), ('튤립', 0.7724248170852661), ('벚꽃', 0.7570624351501465), ('동백', 0.7569960355758667), ('유채', 0.7500444650650024), ('꽃밭', 0.7352426648139954), ('유채꽃', 0.7308177947998047), ('청보리', 0.729303777217865), ('봄꽃', 0.7167365550994873), ('꽃들이', 0.71485435962677)]
화사한 : [('드러지게', 0.9727051258087158), ('화사하게', 0.9696004986763), ('형형색색', 0.9681439399719238), ('천지', 0.9625675082206726), ('화려하게', 0.9612523913383484), ('오색', 0.9602988958358765), ('성하게', 0.9595808982849121), ('콜라보', 0.9594618082046509), ('알록달록한', 0.9590699672698975), ('그야말로', 0.9590541124343872)]
산책 : [('걷기', 0.8698867559432983), ('천천히', 0.8309049010276794), ('둘러보기', 0.8188114762306213), ('접근성', 0.8164956569671631), ('가볍게', 0.8095284104347229), ('찾기', 0.8062119483947754), ('바퀴', 0.7922258973121643), ('마로', 0.7885223627090454), ('드라이브', 0.7867914438247681), ('단거리', 0.7674313187599182)]


### CBOW

In [63]:
# CBOW baseline (sg=0): 100-dimensional vectors, context window of 5,
# otherwise the same settings as the skip-gram baseline.
word2vec_model_CBOW = Word2Vec(
    sentences=tokenDf,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

In [37]:
# Shape of the CBOW embedding matrix: (vocabulary size, vector dimension).
word2vec_model_CBOW.wv.vectors.shape

(11146, 100)

In [38]:
# Words most similar to '수국' (hydrangea) in the current CBOW model.
word2vec_model_CBOW.wv.most_similar('수국')

[('유채꽃', 0.9000601768493652),
 ('벚꽃', 0.8913075923919678),
 ('활짝', 0.8514318466186523),
 ('노오', 0.844367504119873),
 ('유채', 0.834045946598053),
 ('만발', 0.8297327756881714),
 ('피어', 0.8182817101478577),
 ('튤립', 0.7992562055587769),
 ('꽃길', 0.7991666793823242),
 ('스팟', 0.7983413934707642)]

In [39]:
# Words most similar to '화사한' (bright, radiant) in the current CBOW model.
word2vec_model_CBOW.wv.most_similar('화사한')

[('이지', 0.9758896231651306),
 ('아름답게', 0.9749916791915894),
 ('초봄', 0.9659621715545654),
 ('푸릇푸릇', 0.962666392326355),
 ('초여름', 0.9591946005821228),
 ('감탄사', 0.9565547704696655),
 ('눈부시게', 0.9535303115844727),
 ('예뻐요', 0.9500758647918701),
 ('있죠', 0.9453238844871521),
 ('화려하게', 0.9434857964515686)]

In [64]:
# Words most similar to '산책' (walk, stroll) in the current CBOW model.
word2vec_model_CBOW.wv.most_similar('산책')

[('걷기', 0.8823663592338562),
 ('천천히', 0.8139923810958862),
 ('좋다', 0.8027373552322388),
 ('가볍게', 0.7844594717025757),
 ('좋고', 0.7794161438941956),
 ('장거리', 0.7769668102264404),
 ('좋은', 0.7768592834472656),
 ('단거리', 0.775915265083313),
 ('여유', 0.7715490460395813),
 ('산책로', 0.760635256767273)]

In [65]:
# CBOW variant: 100-dimensional vectors, context window narrowed to 3.
word2vec_model_CBOW = Word2Vec(
    sentences=tokenDf,
    size=100,
    window=3,
    min_count=5,
    workers=4,
    sg=0,
)
# Print the nearest neighbours of the three probe words, one line per word.
for probe in ("수국", "화사한", "산책"):
    print(f'{probe} : {word2vec_model_CBOW.wv.most_similar(probe)}')

수국 : [('벚꽃', 0.913415253162384), ('유채꽃', 0.8545932769775391), ('꽃길', 0.8221601843833923), ('활짝', 0.8120996952056885), ('만발', 0.8041905164718628), ('노오', 0.8012207746505737), ('동백꽃', 0.800965428352356), ('유채', 0.8002742528915405), ('종', 0.7988065481185913), ('아직', 0.7898870706558228)]
화사한 : [('아름답게', 0.9705498218536377), ('하늘하늘', 0.9660200476646423), ('이지', 0.9641664028167725), ('화려하게', 0.9641305208206177), ('콜라보', 0.9638370275497437), ('향연', 0.9582933187484741), ('있죠', 0.9582161903381348), ('자연도', 0.9565824270248413), ('백정원', 0.956558346748352), ('발해', 0.9557619094848633)]
산책 : [('걷기', 0.916069746017456), ('천천히', 0.8504164814949036), ('단거리', 0.845700204372406), ('등산', 0.8344854116439819), ('장거리', 0.8322576880455017), ('있게', 0.8258826732635498), ('바퀴', 0.8211677670478821), ('가볍게', 0.8197581768035889), ('좋다', 0.8155641555786133), ('여유', 0.799348771572113)]


In [66]:
# CBOW variant: 100-dimensional vectors, context window of 1.
word2vec_model_CBOW = Word2Vec(
    sentences=tokenDf,
    size=100,
    window=1,
    min_count=5,
    workers=4,
    sg=0,
)
# Print the nearest neighbours of the three probe words, one line per word.
for probe in ("수국", "화사한", "산책"):
    print(f'{probe} : {word2vec_model_CBOW.wv.most_similar(probe)}')

수국 : [('동백꽃', 0.8639358282089233), ('유채꽃', 0.8637495040893555), ('종', 0.8392490148544312), ('유채', 0.8294531106948853), ('튤립', 0.8285108804702759), ('동백', 0.828164279460907), ('뮬리', 0.8104469180107117), ('벚꽃', 0.8103405237197876), ('녹', 0.8055287599563599), ('리가', 0.7955368757247925)]
화사한 : [('하늘색', 0.9805043935775757), ('아름답게', 0.9789599180221558), ('성한', 0.9785846471786499), ('보라', 0.9782240390777588), ('드러지게', 0.9756938815116882), ('화려하게', 0.9747105240821838), ('붉게', 0.9743005037307739), ('울긋불긋', 0.9725830554962158), ('화사하게', 0.9724209308624268), ('한가득', 0.9716953039169312)]
산책 : [('걷기', 0.8941426277160645), ('바퀴', 0.8780102729797363), ('마로', 0.8771822452545166), ('가볍게', 0.8674518465995789), ('넣기', 0.8584790229797363), ('드라이브', 0.8521929383277893), ('천천히', 0.846778929233551), ('단거리', 0.8382279872894287), ('산책길', 0.8227189779281616), ('장거리', 0.8222993016242981)]


#### 결과: 
- CBOW와 skip-gram을 비교하였을 때, 유사한 단어의 목록을 통해서 skip-gram이 더 좋은 성능을 보인다. 


- skip-gram의 파라미터
  - window(컨텍스트 윈도우 크기): 1, 3, 5를 비교하였다.
    - '화사한', '산책'이라는 단어를 기준으로는 큰 차이가 보이지 않았지만 3 ~ 5가 더 좋은 결과를 보였다.
    - '수국'이라는 단어를 기준으로는 3 ~ 5의 경우 의미 없는 단어가 포함되어 있었지만, 1의 경우에는 포함되지 않았다.
  - size(워드 벡터의 차원): window = 1로 고정하고 100, 300, 500을 비교하였다.
    - 세 경우의 결과는 큰 차이를 보이지 않았으며, 100의 경우가 의미 없는 단어의 수가 가장 적었다.


#### 최적의 skip-gram 파라미터는 총 데이터 양에 따라 달라질 수 있지만, 현재 데이터 양을 기준으로는 `word2vec_model_skipgram = Word2Vec(sentences = tokenDf, size = 100, window = 1, min_count = 5, workers = 4, sg = 1)`이 적절한 설정으로 판단된다.
