# KoNLTK klt2023 형태소 분석기로 model 및 문장벡터 생성

In [1]:
import sys
import gc
import os
import re
import time
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from datetime import datetime
# import chromedriver_autoinstaller

from matplotlib import font_manager, rc
# 차트에서 한글 출력을 위한 설정

import matplotlib.pyplot as plt
import platform
# Choose a matplotlib font family that can render Korean text, based on the
# operating system name reported by platform.system().
your_os = platform.system()
if your_os == 'Windows':
    # On Windows the family name is read from the Malgun Gothic font file.
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
else:
    # Linux and macOS use a fixed family name; any other platform keeps
    # matplotlib's current font setting.
    fixed_family = {'Linux': 'NanumGothic', 'Darwin': 'AppleGothic'}.get(your_os)
    if fixed_family is not None:
        rc('font', family=fixed_family)
# Turn off the Unicode minus sign for axes.
rc('axes', unicode_minus=False)

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common. by import By

from bs4 import BeautifulSoup 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import konlpy
from konlpy.tag import Okt
from konlp.kma.klt2023 import klt2023
import nltk
import stanza
from gensim.models import Word2Vec, word2vec
from gensim.models import FastText

from wordcloud import WordCloud
from numpy import dot
from numpy.linalg import norm
import itertools

# 1) Word2Vec, FastText Model 생성

### 전처리

In [3]:
# Load the book data, then drop duplicate rows and renumber the index from 0.
df = pd.read_csv('book_all.csv')
df = df.drop_duplicates().reset_index(drop=True)

In [4]:
# The book descriptions contain many emoticons and punctuation marks, so every
# character that is not an ASCII letter, a digit, or a Hangul syllable is
# replaced with a single space to get cleaner text.
#
# The whole column is rewritten in one vectorised assignment instead of
# df['책소개'][i] = ... inside a loop. That chained form writes through an
# intermediate Series; with pandas Copy-on-Write it no longer updates df, and
# the warning about it is hidden by the warnings filter set at the top of the
# notebook.
# Missing descriptions (NaN) are left as NaN by .str.replace.
df['책소개'] = df['책소개'].str.replace(r"[^A-Za-z0-9가-힣]", " ", regex=True)

### 형태소 분석기 정의

In [5]:
# Instantiate the klt2023 morphological analyzer; later cells call k.nouns() on it.
k=klt2023()

In [6]:
# Build one text per book by joining title, author, description and category
# with single spaces, in that order.
all_word = [
    ' '.join(fields)
    for fields in zip(df['제목'], df['저자'], df['책소개'], df['카테고리'])
]

해당 텍스트 형태소 분석 - 명사 추출  

In [7]:
# Extract the nouns of every combined book text with the analyzer and print
# how long the extraction took.
start = datetime.now()
all_nouns = [k.nouns(text) for text in all_word]
end = datetime.now()
print('소요시간 :', end-start)

소요시간 : 0:00:31.883494


In [8]:
# Total number of extracted nouns across all books.
sum(1 for noun in itertools.chain.from_iterable(all_nouns))

453196

### 불용어 처리  
- https://deep.chulgil.me/hangugeo-bulyongeo-riseuteu/

In [9]:
# Read the Korean stop-word list from the '불용어' column of the CSV file.
stopword_column = pd.read_csv('한국어 불용어.csv')['불용어']
ko_Stopword = stopword_column.to_list()

In [10]:
# Remove stop words from each book's noun list, keeping the remaining nouns
# in their original order (duplicates are kept for model training).
# The stop-word list is turned into a set once so that each membership test
# is a hash lookup rather than a scan of the whole list for every noun.
stopword_set = set(ko_Stopword)
total_nouns = [
    [noun for noun in nouns if noun not in stopword_set]
    for nouns in all_nouns
]

In [11]:
# Number of nouns left after stop-word removal.
sum(1 for noun in itertools.chain.from_iterable(total_nouns))

381862

### word2vec/fasttext model 생성  

Word2Vec Model

In [12]:
# Make sure the output directory exists before training: saving into a
# missing directory would fail only after the whole training run is done.
os.makedirs('Word2Vec_model', exist_ok=True)
# Train Word2Vec on the stop-word-filtered noun lists (vector_size=400,
# window=5, min_count=1) and persist the model to disk.
model_wv = word2vec.Word2Vec(sentences=total_nouns, vector_size=400, window=5, min_count=1)
model_wv.save('Word2Vec_model/word2vec_book_nltk.model')

FastText Model

In [13]:
# Make sure the output directory exists before training: saving into a
# missing directory would fail only after the whole training run is done.
os.makedirs('FastText_model', exist_ok=True)
# Train FastText on the stop-word-filtered noun lists (vector_size=400,
# window=5, min_count=2) and persist the model to disk.
model_ft = FastText(total_nouns, vector_size=400, window=5, min_count=2)
model_ft.save('FastText_model/fasttext_book_nltk.model')

# 2) 책 소개 벡터 생성하기

In [14]:
# Pull the book-description column out of the DataFrame as a plain list.
content = df['책소개'].to_list()

각 책소개마다 명사 추출

In [15]:
# Extract the nouns of each book description with the analyzer.
content_word = [k.nouns(description) for description in content]

In [16]:
# Attach each description's extracted noun list to the DataFrame as a new column.
df['책소개_명사'] = content_word

불용어 처리 및 중복제거
(문장 벡터를 만드는 과정에서 같은 벡터가 여러번 더해지지 않도록 중복 제거)

In [17]:
# For each description, drop stop words and duplicate nouns so that the same
# word vector is not added more than once when the description vector is
# summed later.
# dict.fromkeys() removes duplicates while keeping first-occurrence order, so
# the result is the same on every run; list(set(...)) orders strings
# differently from one interpreter session to the next.
# The stop-word list is turned into a set once so each membership test is a
# hash lookup rather than a scan of the whole list.
stopword_set = set(ko_Stopword)
total_word = [
    [noun for noun in dict.fromkeys(nouns) if noun not in stopword_set]
    for nouns in content_word
]

In [18]:
# Noun counts before and after stop-word removal and de-duplication.
(
    sum(1 for noun in itertools.chain.from_iterable(content_word)),
    sum(1 for noun in itertools.chain.from_iterable(total_word)),
)

(427324, 260647)

책 소개별 추출된 명사를 합하여 책소개 벡터 생성

In [19]:
# Build one vector per book description by summing the FastText vectors of
# its de-duplicated, stop-word-free nouns. A description with no nouns left
# yields an all-zero vector.
#
# The inner loop variable is deliberately not called `k`: that name holds the
# morphological analyzer, and rebinding it to a string here would break every
# later k.nouns(...) call (for example when a cell is re-run).
# The accumulator size is taken from the model instead of a hard-coded 400 so
# it always matches the vectors being added.
vector_dim = model_ft.wv.vector_size
vec = []
for words in total_word:
    v = np.zeros(vector_dim)
    for word in words:
        v += model_ft.wv.get_vector(word)
    vec.append(v)
print(len(vec))

4788


In [20]:
# Store each book's summed description vector in a new DataFrame column.
df['책소개_벡터'] = vec

In [21]:
# Write the enriched table to CSV without the index, encoded as UTF-8 with BOM.
# NOTE(review): the '책소개_벡터' column holds NumPy arrays, which a CSV can only
# carry as text — confirm that downstream readers parse that text back into
# vectors with acceptable precision.
df.to_csv('books_vector_nltk.csv',index=False, encoding="utf-8-sig")