In [None]:
import sys
sys.path.append('../')

from transformers import ElectraModel, ElectraTokenizerFast
from modeling import keywordExtractor
import pandas as pd

model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-base-v3-discriminator")

# load data
raw_data = pd.read_parquet("../../data/book_scraping.parquet")

key = keywordExtractor(model, tokenizer)

##### 테스트용

In [None]:
# raw_data = pd.read_csv("../../data/raw_book_info_list.csv",index_col=0)
# from ast import literal_eval
# book_info = raw_data.iloc[3]

# book_info['book_toc'] = literal_eval(book_info['book_toc'])
# book_info['book_intro'] = literal_eval(book_info['book_intro'])
# book_info['publisher'] = literal_eval(book_info['publisher'])
# book_info['isbn13'] = 1

# book_info.index = ['title','toc','intro','publisher','isbn13']
# book_info = pd.DataFrame(book_info).T

In [None]:
# book_info = raw_data.iloc[:1]
# keywords = map(lambda x : key.extract_keywords(x[1]),book_info.iterrows())
# tokenized_keywords = map(key.tokenize_keywords,keywords)
# keywords_embedding = list(map(key.create_keyword_embedding,tokenized_keywords))

In [None]:
# # book_info = raw_data.iloc[:10]
# stringfied_doc = map(lambda x : convert_series_to_str(x[1]),book_info.iterrows())
# tokenized_doc = map(key.tokenize_keywords,stringfied_doc)
# doc_embedding = list(map(lambda x : sbert(**x)["sentence_embedding"],tokenized_doc))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from itertools import islice
import numpy as np


def _calc_cosine_similarity(doc_embedding:torch.Tensor, keywords_embedding:torch.Tensor) -> np.array:
    doc_embedding = doc_embedding.unsqueeze(0).detach()
    keywords_embedding = keywords_embedding.detach()

    doc_score = list(map(lambda x: cosine_similarity(x, keywords_embedding), doc_embedding))

    max_pooling = np.max(doc_score, axis=0)  # Max
    return max_pooling


def _filter_top_n_keywords(keyword_list:list,co_sim_score:np.array,rank:int=20) -> list :
    keywords = dict(zip(keyword_list,co_sim_score))
    sorted_keywords = sorted(keywords.items(), key=lambda k: k[1], reverse=True)
    return list(dict(islice(sorted_keywords,rank)).keys())


In [None]:
data = raw_data.iloc[:2]

def extract_keywords(docs:pd.DataFrame) -> dict:
    keywords_embedding = map(lambda x : key.create_keyword_embedding(x[1]),docs.iterrows())
    doc_embedding = map(lambda x : key.create_doc_embeddings(x[1]),docs.iterrows())
    keyword_list = map(lambda x : key._extract_keywords(x[1]),docs.iterrows())

    co_sim_score = map(lambda x : _calc_cosine_similarity(*x).flatten(),zip(doc_embedding,keywords_embedding))
    top_n_keywords = list(map(lambda x : _filter_top_n_keywords(*x),zip(keyword_list,co_sim_score)))

    return dict(zip(docs['isbn13'].values,top_n_keywords))

extract_keywords(data)

In [None]:
from gensim.models import keyedvectors

model = keyedvectors.load_word2vec_format('w2v_old') 

model.most_similar(positive=['파이선'],topn=20)

In [1]:
import pandas as pd
from w2v import W2VTrainer
from kiwipiepy import Kiwi

k = Kiwi()
w2v = W2VTrainer()

df = pd.read_parquet('../../data/book_scraping.parquet')
y = w2v.create_w2v_data(df)

In [2]:
from gensim.models import Word2Vec
from itertools import chain
embedding_model = Word2Vec(sentences=y, window=2, min_count=30, workers=7, sg=1)

In [13]:
# idx
embedding_model.wv.has_index_for('mysql')

embedding_model.wv.most_similar(positive=['bert'],topn=20,)

The slowest run took 7.19 times longer than the fastest. This could mean that an intermediate result is being cached.
844 µs ± 305 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [4]:
embedding_model.wv.most_similar(positive=['파이선'],topn=20)

KeyError: "Key '파이선' not present in vocabulary"

### 시작

In [None]:
# from collections import Counter

# Counter(
#     list(
#         chain(
#             *map(
#                 lambda x: x.split(" "),
#                 (
#                     chain(
#                         df.query('isbn13 == "9791127459321"')
#                         .iloc[0]
#                         .drop(["title", "isbn13"])
#                         .values[0]
#                     )
#                 ),
#             )
#         )
#     )
# )


In [None]:
series = df.iloc[0]
# series = df.query('isbn13 == "9791127459321"').iloc[0]
extract_keyword_list(series)

In [None]:
from collections import Counter
from typing import Iterable,List

noun_extractor = Kiwi(model_type="knlm")
dir = '../../data/preprocess/eng_han.csv'
def extract_keyword_list(doc: pd.Series, min_count: int = 3, min_length: int = 2) -> list:
    raw_data = _convert_series_to_keyword_list(doc)
    keyword_list = _extract_keywords(raw_data)
    translated_keyword_list = _map_english_to_hangeul(keyword_list)
    refined_keyword_list = _eliminate_min_words(translated_keyword_list,min_count)
    return list(filter(lambda x: len(x) >= min_length, refined_keyword_list))

def convert_series_to_str( series: pd.Series) -> str:
    book_title = series["title"]
    series = series.drop(["title", "isbn13"])
    return book_title + " " + " ".join(list(chain(*series.values)))


def _convert_series_to_keyword_list(series: pd.Series) -> List[List[str]]:
    book_title = series["title"]
    series = series.drop(["title", "isbn13"])

    raw_data = [book_title] + list(chain(*series.values))
    return list(chain(*map(lambda x: x.split(), raw_data)))


def _extract_keywords(words: List[str]) -> List[List[str]]:
    tokenized_words = noun_extractor.tokenize(" ".join(words)) 
    return [word.form for word in tokenized_words if word.tag in ("NNG", "NNP","SL")]


def _map_english_to_hangeul(word_list: list[str]) -> list[str]:
    eng_han_df = pd.read_csv(dir).dropna()
    eng_han_dict = dict(eng_han_df.values)

    def map_eng_to_han(word: str, eng_han_dict: dict) -> str:
        han_word = eng_han_dict.get(word)
        return han_word if han_word else word

    return list(map(lambda x: map_eng_to_han(x.lower(), eng_han_dict), word_list))

def _eliminate_min_words(candidate_keyword, min_count:int =3) :
    refined_han_words = filter(lambda x: x[1] >= min_count, Counter(candidate_keyword).items())
    return list(map(lambda x : x[0],refined_han_words))



In [None]:
import pandas as pd
df = pd.read_parquet('../../data/book_scraping.parquet')
series = df.iloc[:2]
print(create_w2v_data(series))

In [None]:
from kiwipiepy import Kiwi
from typing import List, Iterable
from itertools import chain

noun_extractor = Kiwi(model_type="knlm")
dir = "../../data/preprocess/eng_han.csv"


def create_w2v_data(df: pd.DataFrame, min_length: int = 2) -> list[list]:
    keyword_list = list(map(lambda x: _convert_series_to_str(x[1]), df.iterrows()))
    noun_only_data = _extract_w2v_data(keyword_list)
    w2v_data = list(map(_map_english_to_hangeul, noun_only_data))
    refined_w2v_data = list(map(lambda x: _eliminate_min_length_word(x, min_length), w2v_data))
    return refined_w2v_data


def _convert_series_to_str(series: pd.Series) -> str:
    book_title = series["title"]
    series = series.drop(["title", "isbn13"])
    return book_title + " " + " ".join(list(chain(*series.values)))


def _extract_w2v_data(words: Iterable[str]) -> List[List[str]]:
    tokenized_words = noun_extractor.tokenize(words)
    result = []
    for lst in tokenized_words:
        words = [word.form for word in lst if word.tag in ("NNG", "NNP", "SL")]
        result.append(words)
    return result


def _map_english_to_hangeul(word_list: List[str]) -> list[str]:
    eng_han_df = pd.read_csv(dir).dropna()
    eng_han_dict = dict(eng_han_df.values)

    def _map_eng_to_han(word: str, eng_han_dict: dict) -> str:
        han_word = eng_han_dict.get(word)
        return han_word if han_word else word

    return list(map(lambda x: _map_eng_to_han(x.lower(), eng_han_dict), word_list))


def _eliminate_min_length_word(w2v_data, min_length: int = 2) -> List[str]:
    return list(filter(lambda x: len(x) >= min_length, w2v_data))


In [None]:
from kiwipiepy import Kiwi
from typing import List, Iterable
from itertools import chain

noun_extractor = Kiwi(model_type="knlm")
dir = "../../data/preprocess/eng_han.csv"


def create_w2v_data(self, df: pd.DataFrame,min_length:int=2) -> list[list]:
    keyword_list = list(map(lambda x: _convert_series_to_str(x[1]), df.iterrows()))
    noun_only_data = _extract_w2v_data(keyword_list)
    w2v_data = list(map(_map_english_to_hangeul, noun_only_data))
    return list(filter(lambda x: len(x) >= min_length, w2v_data))


def _convert_series_to_str(self, series: pd.Series) -> str:
    book_title = series["title"]
    series = series.drop(["title", "isbn13"])
    return book_title + " " + " ".join(list(chain(*series.values)))


def _extract_w2v_data(self, words: Iterable[str]) -> List[List[str]]:
    tokenized_words = self.noun_extractor.tokenize(words)
    result = []
    for lst in tokenized_words:
        words = [word.form for word in lst if word.tag in ("NNG", "NNP", "SL")]
        result.append(words)
    return result


def _map_english_to_hangeul(self, word_list: List[str]) -> list[str]:
    eng_han_df = pd.read_csv(self.dir).dropna()
    eng_han_dict = dict(eng_han_df.values)

    def _map_eng_to_han(word: str, eng_han_dict: dict) -> str:
        han_word = eng_han_dict.get(word)
        return han_word if han_word else word

    return list(map(lambda x: _map_eng_to_han(x.lower(), eng_han_dict), word_list))
