# Stanza 형태소 분석기로 model 및 문장 벡터 생성

In [1]:
import sys
import gc
import os
import re
import time
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from datetime import datetime
# import chromedriver_autoinstaller

from matplotlib import font_manager, rc
# 차트에서 한글 출력을 위한 설정

import matplotlib.pyplot as plt
import platform
# Select a font family per operating system so Korean text can be drawn in charts.
your_os = platform.system()
if your_os == 'Windows':
    # Look up the family name from the malgun.ttf font file.
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()

    rc('font', family=font_name)
elif your_os == 'Darwin':
    rc('font', family='AppleGothic')
elif your_os == 'Linux':
    rc('font', family='NanumGothic')
# Turn off the Unicode minus sign on axes.
rc('axes', unicode_minus=False)

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common. by import By

from bs4 import BeautifulSoup 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import konlpy
from konlpy.tag import Okt
from konlp.kma.klt2023 import klt2023
import nltk
import stanza
from gensim.models import Word2Vec, word2vec
from gensim.models import FastText

from wordcloud import WordCloud
from numpy import dot
from numpy.linalg import norm
import itertools

# 1) Word2Vec, FastText Model 생성

### 전처리

In [3]:
# Load the book data, then drop duplicate rows and renumber the index from 0.
df = pd.read_csv('book_all.csv')
df = df.drop_duplicates().reset_index(drop=True)

In [4]:
# The book descriptions contain many emoticons and punctuation marks, so every
# character that is not an ASCII letter, a digit, or a Hangul syllable is
# replaced with a space to obtain cleaner text.
# The whole column is assigned at once: the previous per-row form
# df['책소개'][i] = ... is a chained assignment, which pandas does not guarantee
# to write back into df.
df['책소개'] = df['책소개'].str.replace(r"[^A-Za-z0-9가-힣]", " ", regex=True)

### 불용어 처리  
- https://deep.chulgil.me/hangugeo-bulyongeo-riseuteu/

In [3]:
# Read the Korean stop-word file and keep its '불용어' column as a plain list.
stopword_column = pd.read_csv('한국어 불용어.csv')['불용어']
ko_Stopword = stopword_column.tolist()

### 형태소 분석기 정의

In [4]:
# Download the Korean Stanza models, then build the pipeline used for analysis.
stanza.download(lang='ko')
nlp = stanza.Pipeline(lang='ko')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-19 02:39:35 INFO: Downloading default packages for language: ko (Korean) ...
2023-04-19 02:39:35 INFO: File exists: C:\Users\rudtj\stanza_resources\ko\default.zip
2023-04-19 02:39:36 INFO: Finished downloading models and saved to C:\Users\rudtj\stanza_resources.
2023-04-19 02:39:36 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-04-19 02:39:37 INFO: Loading these models for language: ko (Korean):
| Processor | Package |
-----------------------
| tokenize  | kaist   |
| pos       | kaist   |
| lemma     | kaist   |
| depparse  | kaist   |

2023-04-19 02:39:39 INFO: Using device: cuda
2023-04-19 02:39:39 INFO: Loading: tokenize
2023-04-19 02:39:43 INFO: Loading: pos
2023-04-19 02:39:43 INFO: Loading: lemma
2023-04-19 02:39:44 INFO: Loading: depparse
2023-04-19 02:39:44 INFO: Done loading processors!


In [7]:
def extract_nouns_stanza(text):
    """Yield the noun morphemes of *text* as analysed by the global ``nlp`` pipeline.

    Each word's ``lemma`` and ``xpos`` are split on '+' and paired up
    position by position; a lemma part is yielded when its paired tag
    starts with 'n'.
    """
    parsed = nlp(text)
    for sent in parsed.sentences:
        for token in sent.words:
            # zip() stops at the shorter side if the two splits differ in length.
            pairs = zip(token.lemma.split('+'), token.xpos.split('+'))
            yield from (morph for morph, tag in pairs if tag.startswith('n'))

In [8]:
# Vectorizer that tokenizes with the Stanza noun extractor and drops Korean stop words.
cv = CountVectorizer(
    tokenizer=extract_nouns_stanza,
    stop_words=ko_Stopword,
)

해당 텍스트 형태소 분석 - 명사 추출  

In [9]:
# Extract the noun vocabulary of every row of df and time the whole pass.
start = datetime.now()
all_nouns = []
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when present and fall back only on old versions.
use_names_out = hasattr(cv, 'get_feature_names_out')
for i in range(len(df)):
    # Every cell of row i is passed to the vectorizer as one document.
    # NOTE(review): this covers all columns of the row, not only '책소개' - confirm intended.
    cv.fit(df.iloc[i, :])
    if use_names_out:
        # Convert the returned array to a plain list, as before.
        all_nouns.append(cv.get_feature_names_out().tolist())
    else:
        all_nouns.append(cv.get_feature_names())
end = datetime.now()
print('소요시간 :', end-start)

소요시간 : 0:22:29.184364


In [10]:
import itertools
# Total number of extracted nouns across all rows.
sum(len(nouns) for nouns in all_nouns)

302814

In [11]:
# Drop stop words from every noun list (order and duplicates are kept).
# A set is built once so each membership test is O(1) instead of scanning the
# stop-word list for every noun; the comprehension also avoids rebinding `_`.
stopword_set = set(ko_Stopword)
total_nouns = [
    [noun for noun in nouns if noun not in stopword_set]
    for nouns in all_nouns
]

In [12]:
# Total number of nouns left after stop-word filtering.
sum(len(nouns) for nouns in total_nouns)

302814

### word2vec/fasttext model 생성  

Word2Vec Model

In [13]:
# Create the output folder up front so the save below cannot fail on a
# missing directory after training has already been done.
os.makedirs('Word2Vec_model', exist_ok=True)
# Train Word2Vec on the per-book noun lists (400-dim vectors, window 5, keep every word).
model_wv = word2vec.Word2Vec(sentences=total_nouns, vector_size=400, window=5, min_count=1)
model_wv.save('Word2Vec_model/word2vec_book_stanza.model')

FastText Model

In [14]:
# Create the output folder up front so the save below cannot fail on a
# missing directory after training has already been done.
os.makedirs('FastText_model', exist_ok=True)
# Train FastText on the per-book noun lists (400-dim vectors, window 5,
# ignoring words that occur fewer than 2 times).
model_ft = FastText(total_nouns, vector_size=400, window=5, min_count=2)
model_ft.save('FastText_model/fasttext_book_stanza.model')

# 2) 책 소개 벡터 생성하기

In [15]:
# Attach the extracted noun lists as a new column (all_nouns holds one list per row of df).
df['책소개_명사'] = all_nouns

불용어 처리 및 중복 제거
(문장 벡터를 만드는 과정에서 같은 벡터가 여러 번 더해지지 않도록 중복 제거)

In [16]:
# Remove stop words and duplicate nouns from every noun list.
# dict.fromkeys() dedupes while keeping first-seen order, so the word order
# (and therefore the floating-point summation order used later) is the same on
# every run; set() ordering of strings can differ between interpreter runs.
stopword_set = set(ko_Stopword)
total_word = [
    [noun for noun in dict.fromkeys(nouns) if noun not in stopword_set]
    for nouns in all_nouns
]

In [17]:
# Noun counts before and after stop-word removal / deduplication.
sum(map(len, all_nouns)), sum(map(len, total_word))

(302814, 302814)

책 소개별로 추출된 명사의 벡터를 모두 더하여 책 소개 벡터 생성

In [18]:
# Build one vector per book by summing the FastText vectors of its nouns.
# The dimension is read from the model instead of repeating the literal 400,
# so the accumulator always matches the vectors being added.
vector_dim = model_ft.wv.vector_size
vec = []
for words in total_word:
    doc_vector = np.zeros(vector_dim)
    for word in words:
        doc_vector += model_ft.wv.get_vector(word)
    # A book with no remaining nouns keeps the all-zero vector.
    vec.append(doc_vector)
print(len(vec))

4788


In [19]:
# Store each book's summed description vector in a new column (vec has one array per book).
df['책소개_벡터'] = vec

In [20]:
# Write the DataFrame to CSV without the index column, encoded as utf-8-sig.
# NOTE(review): the list and array columns are presumably written as their text
# representation - confirm that downstream readers can parse them back.
df.to_csv('books_vector_stanza.csv',index=False, encoding="utf-8-sig")