In [1]:
from etl import *
import pandas as pd
import numpy as np
from keybert import KeyBERT
import pymysql
import ast
import re
from gensim.models import Word2Vec,KeyedVectors
from konlpy.tag import Okt

# KeyBERT instance built on the multilingual MiniLM checkpoint named below.
keyBertModel = KeyBERT("paraphrase-multilingual-MiniLM-L12-v2")

# Connection to the local `dash_test` MySQL database. The DictCursor makes
# fetch calls return each row as a dict keyed by column name.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='',
    db='dash_test',
)
cursor = conn.cursor(pymysql.cursors.DictCursor)

## Extract, Transform, and Load 

## Extract

In [2]:

#-- extract updated books of each library --#
rawbookinfo = extractAllLibBooksMultiThread('2022-08-12')
# keep a single row per ISBN
df = rawbookinfo.drop_duplicates(subset="ISBN")


#-- load the ISBNs that are already stored in the DB --#
cursor.execute(' SELECT ISBN FROM backend_dodomoabookinfo')
result = cursor.fetchall()
# Name the column explicitly so that an empty table still produces an 'ISBN'
# column instead of a column-less frame that breaks the lookup below.
libinfo = pd.DataFrame(list(result), columns=['ISBN'])


#-- compare the extracted books with the stored ones and keep only new books --#
# BM[i] is True when the i-th row of df is already present in the DB.
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
# NOTE(review): assumes both sides hold ISBN with the same dtype; an int/str
# mismatch would make every book look new -- confirm against the table schema.
BM = df['ISBN'].isin(libinfo['ISBN']).to_numpy()

# rows of df that are not in the DB yet
newdf = df[~BM]

# fetch the text of each new book by crawling the Kyobo Books site
ISBNs = newdf['ISBN']
docs = kyoboSaveMultiThread(ISBNs, thread_num = 10)

# pair every crawled document with its ISBN (read from index 1 of the document)
orderlist = [i[1] for i in docs]
k = pd.DataFrame({'ISBN': orderlist, 'keywords': docs})

# attach the crawled text to the new books' info; the default inner join drops
# books for which no document came back
newdf = newdf.merge(k, on='ISBN')


111003 완료 | 428 개 추출
111011 완료 | 599 개 추출
111007 완료 | 1267 개 추출
111022 완료 | 0 개 추출111008 완료 | 0 개 추출

111004 완료 | 0 개 추출
111009 완료 | 0 개 추출
111006 완료 | 0 개 추출
111010 완료 | 0 개 추출
111005 완료 | 0 개 추출
111012 완료 | 590 개 추출
111021 완료 | 388 개 추출
111013 완료 | 996 개 추출
111018 완료 | 574 개 추출
111030 완료 | 531 개 추출
111014 완료 | 0 개 추출
111016 완료 | 411 개 추출
111019 완료 | 0 개 추출111015 완료 | 0 개 추출

111020 완료 | 0 개 추출
Skip : 9791164262090
Skip : 9791164262106
kyoboBooks 추출완료


## Transform 

### Extract each book's keywords using KeyBERT

In [9]:
# raw crawled text of every new book: the input for keyword extraction
docs = newdf['keywords']

# the same rows without the raw-text column
df = newdf.drop(columns=['keywords'])

# transform() (presumably provided by the etl module) receives the
# (book info, raw text) pair together with the KeyBERT model
dfWithKeywords = transform((df,docs),keyBertModel)

### Update BookInfo for training Word2Vec

In [87]:
# turn a stringified list back into a list
def changeStringToList(strList) :
    """Return the list encoded by ``strList``.

    Parameters
    ----------
    strList : str | list | tuple
        Usually the text form of a Python literal (for example a list that was
        written to CSV and read back as a string). A value that is already a
        list or tuple is accepted as well, so the function works both on
        freshly crawled rows and on rows reloaded from disk.

    Returns
    -------
    The literal parsed by ``ast.literal_eval`` for string input, or a new list
    with the same items for list/tuple input.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        Python literal.
    """
    # Already parsed: literal_eval would reject it with "malformed node or string".
    if isinstance(strList, (list, tuple)):
        return list(strList)
    return ast.literal_eval(strList)


#-- parse the keyword entries and give the columns positional names --#
a = [changeStringToList(rawKeywords) for rawKeywords in newdf['keywords']]
newBookInfo = pd.DataFrame(a)
newBookInfo.columns = [f'col{i}' for i in range(newBookInfo.shape[1])]


#-- append to the previously saved rows --#
# previous file (a backup copy is kept as bookinfo12.csv in case this file gets corrupted)
bookInfo = pd.read_csv('./data/newbookinfo12.csv',index_col=0)


#-- cast the ISBN column (col1) to int, then keep one row per ISBN --#
concatNewBookInfo = (
    pd.concat([bookInfo,newBookInfo])
    .astype({'col1': int})
    .drop_duplicates(subset='col1')
)


#-- overwrite the file that was read above --#
concatNewBookInfo.to_csv('./data/newbookinfo12.csv')

5281

In [None]:
# Okt analyzer from konlpy, used below for noun extraction
okt = Okt()

# read the saved CSV back so the rows have the same form as what is stored on disk
bookinfo = pd.read_csv('./data/newbookinfo12.csv',index_col=0)

# turn one row into a single space-separated string
def mergeListToString(item:pd.Series) -> str:
    """Join the non-missing cells of a row into one digit-free string.

    Parameters
    ----------
    item : pd.Series
        One row of the book-info table.

    Returns
    -------
    str
        The cells converted to text, with every digit removed, cells that end
        up empty dropped, and the rest joined by single spaces. An empty row
        gives ``''``.
    """
    # Drop missing cells first: astype(str) would turn them into the literal
    # token 'nan', which then survives the empty-string filter below.
    wordList = item.dropna().astype(str).tolist()
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    str_list = [re.sub(r'\d', '', word) for word in wordList]
    # Cells that consisted only of digits are now empty; leave them out.
    return ' '.join(filter(None, str_list))

# one cleaned string per row of bookinfo
wordsList = [mergeListToString(row) for _, row in bookinfo.iterrows()]
print('Complete wordsList Load!!')

# noun extraction with Okt; the message printed below says this step takes
# about 9 minutes on average
print('konelpy 실행 중... 평균 9분 소요')
konlpyWords = [okt.nouns(sentence) for sentence in wordsList]
print('Complete konlpyWords')

# train Word2Vec on the noun lists (sg=1 selects skip-gram in gensim)
embedding_model = Word2Vec(
    sentences=konlpyWords,
    window=2,
    min_count=50,
    workers=7,
    sg=1,
)

# store the word vectors in word2vec format under the file name 'w2v'
embedding_model.wv.save_word2vec_format('w2v')



In [11]:
# Load the saved word vectors and look up the 10 words (topn=10) most similar
# to the query word; the last line displays the (word, similarity) pairs.
loaded_model = KeyedVectors.load_word2vec_format("w2v") 
keywordsWord2Vec = loaded_model.most_similar(positive=['도커'],topn=10)
keywordsWord2Vec

[('컴포즈', 0.7917843461036682),
 ('컨테이너', 0.7492018342018127),
 ('컨테이너화', 0.7179580926895142),
 ('쿠버네티스', 0.7150908708572388),
 ('스웜', 0.7100293636322021),
 ('오케스트레이션', 0.6814238429069519),
 ('메소스', 0.6800644993782043),
 ('앤서블', 0.6697981357574463),
 ('레지스트리', 0.6672812700271606),
 ('젠킨스', 0.6596615314483643)]

## Load

In [5]:
import pymysql
from sqlalchemy import create_engine

# SQLAlchemy connection to the same local `dash_test` database (pymysql driver)
db_connection_str = 'mysql+pymysql://root@localhost:3306/dash_test'
db_connection = create_engine(db_connection_str)
# NOTE: this rebinds `conn`, which the first cell bound to the raw pymysql
# connection. The to_sql calls below pass the engine (`db_connection`), not `conn`.
conn = db_connection.connect()


### backend_dodomoalibinfo

* Libinfo has the `ISBN` and `지역` columns.

* This table is used to search the books of the libraries that the user selects.
* ISBN is the book identifier, so it can be used to look up a book's information in the other tables.

In [None]:
# ISBN / 지역 pairs taken from the raw extraction (before de-duplication by ISBN)
booklist = rawbookinfo.loc[:, ['ISBN', '지역']]


# append the pairs to the backend_dodomoalibinfo table
booklist.to_sql(
    name='backend_dodomoalibinfo',
    con=db_connection,
    if_exists='append',
    index=False,
)

### backend_dodomoabookinfo
* Bookinfo has the `도서명`, `저자`, `출판사`, `ISBN`, `주제분류번호`, `등록일자`, and `이미지주소` columns.

* This has books of all libraries.

In [None]:
# Alternative input, kept for reference: reload newdf from a saved CSV.
# newdf = pd.read_csv('./data/tempdata.csv',index_col=0)

# Take an explicit copy of the selected columns: they are modified below, and
# assigning into a selection of newdf triggers pandas' SettingWithCopyWarning.
backend_dodomoabookinfo = newdf[['도서명','저자','출판사','ISBN','주제분류번호','등록일자','이미지주소']].copy()

# a missing publisher (출판사) is stored as '-'
backend_dodomoabookinfo['출판사'] = backend_dodomoabookinfo['출판사'].fillna('-')
# prefix the subject classification number (주제분류번호) with '00'
# NOTE(review): a missing value would be stored as '00nan' and a float-typed
# value as '00<x>.0' -- confirm the dtype of this column.
backend_dodomoabookinfo['주제분류번호'] = '00'+ backend_dodomoabookinfo['주제분류번호'].astype(str)

# append the new books to the backend_dodomoabookinfo table
backend_dodomoabookinfo.to_sql(name='backend_dodomoabookinfo', con=db_connection, if_exists='append',index=False)

### backend_dodomoakeyword2
* Keyword2 has the `ISBN` and `keyword` columns.

* All keywords of a book are joined into a single space-separated string.
* The `keyword` column has a full-text index.
* This table returns the ISBNs that match the user's search keywords.

In [None]:
# Take an explicit copy of the two columns: the keywords column is rewritten
# below, and assigning into a selection of dfWithKeywords triggers pandas'
# SettingWithCopyWarning.
backend_dodomoakeyword2 = dfWithKeywords[['ISBN','keywords']].copy()

# each cell holds a sequence of keyword strings; store it as one
# space-separated string
backend_dodomoakeyword2['keywords'] = backend_dodomoakeyword2['keywords'].map(' '.join)

# the table's column is named `keyword` (singular)
backend_dodomoakeyword2.columns = ['ISBN','keyword']

# append to the backend_dodomoakeyword2 table
backend_dodomoakeyword2.to_sql(name='backend_dodomoakeyword2', con=db_connection, if_exists='append',index=False)