In [4]:
from transformers import ElectraModel, ElectraTokenizer
from konlpy.tag import Hannanum
import pandas as pd
import ast
import re

# model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
# tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [5]:
# Load the raw book metadata; the first CSV column is used as the row index.
raw_data = pd.read_csv('data/raw_book_info_list.csv',index_col=0)

# Preview the first two rows to check the column layout.
raw_data.head(2)

Unnamed: 0,book_title,book_toc,book_intro,publisher
0,한 권으로 끝내는 메타버스 크리에이터,"['메타버스란', '왜 메타버스인가', '메타버스의 유형을 알아보자', '메타버스 ...",[],[]
1,Do it! 점프 투 파이썬: 라이브러리 예제 편,"['', '텍스트 다루기', '문자열을 줄여 표시하려면 textwrap shorte...",['이 책은 Do it 점프 투 파이썬 의 박응용 저자가 그동안 수많은 독자에게 받...,['실무에서 자주 쓰는 파이썬 라이브러리는 다 있다 필수 파이썬 라이브러리 개 엄선...


## [데이터 전처리] 단어 키워드 추출에 맞는 Input 데이터 생산

### 1. 도서 정보를 하나의 string으로 넣기

In [6]:
def Merge_Series_to_str(series:pd.Series) -> str:

    '''
    Flatten one row of book information into a single punctuation-free string.

    List-valued columns are written to CSV as their string representation
    (e.g. "['a', 'b']"), so every cell that starts with '[' is parsed back
    into a list with ast.literal_eval and its elements are merged in.
    A cell that merely starts with '[' but is not a valid list literal
    (for example a title such as "[Revised] ...") is kept as plain text.

    Missing cells (None / NaN) and empty strings are skipped; any other
    non-string value is converted with str().

    Parameters
    ----------
    series : pd.Series
        One row of the book table (title, table of contents, intro, ...).

    Returns
    -------
    str
        All text pieces joined by single spaces, with every character that
        is neither a word character nor whitespace removed. Returns '' when
        the row holds no text.
    '''
    pieces = []
    for item in series.values:
        if isinstance(item, str):
            if item.startswith('['):
                try:
                    parsed = ast.literal_eval(item)
                except (ValueError, SyntaxError):
                    # Not a serialized list after all; fall through and keep the raw text.
                    parsed = None
                if isinstance(parsed, list):
                    pieces.extend(parsed)
                    continue
            pieces.append(item)
        elif isinstance(item, (list, tuple)):
            # The cell already holds a real sequence (row not read back from CSV).
            pieces.extend(item)
        elif pd.api.types.is_scalar(item) and pd.isna(item):
            # Missing value: nothing to merge.
            continue
        else:
            pieces.append(item)

    # Drop empty entries and make sure everything is a string before joining.
    texts = [str(piece) for piece in pieces if piece is not None and str(piece) != '']

    # Show the first merged piece (guarded so an empty row does not crash).
    print('변환한 도서정보 : ', texts[0] if texts else '')

    return re.sub(r'[^\w\s]', '', ' '.join(texts))
# Sample books noted while experimenting (presumably row positions for raw_data.iloc, then the title in English):
#   1100  PyTorch Deep Learning Project Collection
#   132   Data Analysis with Excel
#   1     Do it! Jump to Python: Library Examples Edition
#   3     Docker & Kubernetes Learned Through Pictures and Practice
# Row 1 is the one merged into a single string here.
book_info :str = Merge_Series_to_str(raw_data.iloc[1])

변환한 도서정보 :  Do it! 점프 투 파이썬: 라이브러리 예제 편


### 2. 일부 영단어를 한글로 변환하기

In [7]:
def trans_engwords_to_hanwords(words: str, dict_path: str = "data/englist.csv") -> list:
    """Replace known English words in ``words`` with their Korean spelling.

    The text is split on whitespace and each token is lower-cased and looked
    up in an English-to-Korean dictionary CSV that has the columns ``eng``
    and ``kor``. Matching tokens are replaced by the ``kor`` value; all other
    tokens are returned unchanged (original casing preserved).

    Parameters
    ----------
    words : str
        Whitespace-separated text to translate.
    dict_path : str, optional
        Path of the dictionary CSV. Defaults to "data/englist.csv".

    Returns
    -------
    list
        The tokens of ``words`` with dictionary hits replaced.
    """
    eng_to_kor_table = pd.read_csv(dict_path)

    # Build the lookup once instead of rescanning the table for every token.
    # keep='first' preserves the original behaviour for duplicated keys,
    # where the first matching row was used.
    first_rows = eng_to_kor_table.drop_duplicates(subset='eng', keep='first')
    lookup = dict(zip(first_rows['eng'], first_rows['kor']))

    result = words.split()
    for i, token in enumerate(result):
        lower_case = token.lower()
        if lower_case in lookup:
            eng_to_kor = lookup[lower_case]
            print('변환 :', lower_case,' => ',eng_to_kor)
            result[i] = eng_to_kor
    return result


# Swap dictionary-listed English words in the merged book text for their Korean spelling.
book_info_trans = trans_engwords_to_hanwords(book_info)



변환 : date  =>  날짜
변환 : json  =>  제이슨
변환 : json  =>  제이슨
변환 : server  =>  서버
변환 : cafe  =>  카페
변환 : naver  =>  네이버
변환 : github  =>  깃허브


### 3. 문장 내 영단어 제거 및 영단어 모아두기

In [8]:
# 한글 문자 리스트 추출
def find_han_words(text: str) -> list:
    """Return every maximal run of Hangul characters in ``text``, in order.

    A run consists of Hangul Compatibility Jamo (U+3130-U+318F) and/or
    precomposed Hangul syllables (U+AC00-U+D7A3); anything else acts as a
    separator.
    """
    hangul_run = "[\u3130-\u318F\uAC00-\uD7A3]+"
    return [match.group(0) for match in re.finditer(hangul_run, text)]

def find_eng_words(text: str) -> list:
    """Return every maximal run of ASCII letters (a-z, A-Z) in ``text``, in order.

    Digits, punctuation, whitespace and non-ASCII characters all act as
    separators.
    """
    ascii_letter_run = "[a-zA-Z]+"
    return [match.group(0) for match in re.finditer(ascii_letter_run, text)]

# Re-join the translated tokens, then collect the Hangul runs and the ASCII-letter runs separately.
book_info_han = find_han_words(' '.join(book_info_trans))
book_info_eng = find_eng_words(' '.join(book_info_trans))



### KoNLPy로 명사만 추출하기

In [9]:
from konlpy.tag import Hannanum

han = Hannanum()

# Fail loudly when the Hangul word list is not a list. Only printing a message
# here would leave han_nouns undefined and surface later as a NameError.
if not isinstance(book_info_han, list):
    raise TypeError('book_info type must be list.')

# NOTE(review): the guard inspects book_info_han, but the nouns are extracted
# from book_info_trans, which still contains the English tokens. Confirm
# which of the two is the intended input.
han_nouns = han.nouns(' '.join(book_info_trans))

### 문장을 대표할 후보 단어 고르기

In [10]:
# 문장 내 단어에 대한 value_counts
# Minimum number of occurrences a noun needs in order to become a keyword candidate.
MIN_CANDIDATE_COUNT = 3

# Count how often each extracted noun occurs. A Series is built directly so an
# empty noun list yields an empty count table; pd.DataFrame(han_nouns)[0]
# raises KeyError in that case because the frame has no column 0.
candidates = pd.Series(han_nouns, dtype=object).value_counts()

# Keep only the nouns that occur at least MIN_CANDIDATE_COUNT times.
candidate_words = candidates[candidates >= MIN_CANDIDATE_COUNT].index.tolist()

### Sentence Transformer로 유사도 검색

In [2]:
from sentence_transformers import SentenceTransformer

# NOTE: per the load log printed below, no sentence-transformers model exists under this name,
# so the library wraps the checkpoint in a new model with MEAN pooling.
model = SentenceTransformer('monologg/koelectra-base-v3-discriminator')

No sentence-transformers model found with name /Users/yangwoolee/.cache/torch/sentence_transformers/monologg_koelectra-base-v3-discriminator. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/yangwoolee/.cache/torch/sentence_transformers/monologg_koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mo

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

# Embed the whole Hangul-only document as one text, and each candidate word on its own.
doc_embedding = model.encode([' '.join(book_info_han)])
candidate_embeddings = model.encode(candidate_words)

# Cosine similarity between the single document embedding and every candidate embedding.
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# Show the 20 candidates most similar to the document, one row per candidate word.
(
    pd.DataFrame(distances.T, index=candidate_words)
    .sort_values(by=0, ascending=False)
    .head(20)
)


Unnamed: 0,0
공식,0.894798
시간,0.842275
개발자,0.795263
깃허브,0.789461
바이너리,0.773035
사용,0.75171
파이썬,0.74949
저장,0.722581
병렬,0.71543
때,0.69734


### 고민
tokenizing하지 않은 원본 문장을 docs로 두어야 할 것인지,

아니면 tokenizing한 문장을 docs로 두어야 할 것인지?

In [507]:
# # max_token_length : 512
# tokenizing_docs_original = tokenizer.tokenize(' '.join(book_info_han))
# tokenizing_docs_konlpy = tokenizer.tokenize(' '.join(han_nouns))
# tokenizing_candidates = tokenizer.tokenize(' '.join(candidate_words))

# encoding_original = tokenizer.encode(tokenizing_docs_original,return_tensors="pt")[:1,:512]
# encoding_konlpy = tokenizer.encode(tokenizing_docs_konlpy,return_tensors="pt")[:1,:512]
# encoding_candidates = tokenizer.encode(tokenizing_candidates,return_tensors="pt")[:1,:512]



# electra_original = model(encoding_original)[0].squeeze(0).detach().numpy() # torch.Size([ 512, 768])
# electra_konlpy = model(encoding_konlpy)[0].squeeze(0).detach().numpy() # torch.Size([ 512, 768])
# electra_candidates = model(encoding_candidates)[0].squeeze(0).detach().numpy() 


### Cosine 유사도 실험 중

In [509]:
# from sklearn.metrics.pairwise import cosine_similarity

# distance_original = cosine_similarity(electra_original,electra_candidates)
# distance_konlpy = cosine_similarity(electra_konlpy,electra_candidates)

In [510]:
# decoding_candidates = tokenizer.decode(encoding_candidates[0]).split()

In [542]:
# pd.DataFrame(distance_konlpy.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,0.732819,0.248315,0.251682,0.253381,0.266203,0.291155,0.268032,0.262364,0.262900,0.271540,...,0.305247,0.297330,0.260703,0.282919,0.281639,0.292454,0.257638,0.271567,0.256121,0.732819
1,0.407831,0.751741,0.672615,0.785368,0.742085,0.445156,0.558662,0.822101,0.807550,0.772749,...,0.834117,0.844884,0.726873,0.668414,0.776618,0.894141,0.830699,0.863877,0.801337,0.407831
2,0.422736,0.836587,0.722945,0.765517,0.875395,0.592998,0.666369,0.884214,0.889834,0.827050,...,0.834637,0.768015,0.768665,0.741496,0.795688,0.850251,0.864719,0.846887,0.841673,0.422736
3,0.456230,0.787892,0.832194,0.762701,0.793401,0.909545,0.821563,0.750312,0.726505,0.735523,...,0.670517,0.608304,0.683988,0.670515,0.730371,0.673097,0.704872,0.716426,0.704853,0.456230
4,0.465781,0.792797,0.821126,0.874424,0.803995,0.754827,0.897694,0.815088,0.799946,0.821343,...,0.751693,0.686573,0.716089,0.710826,0.782929,0.763783,0.775757,0.782943,0.786079,0.465781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.459001,0.731031,0.742773,0.810245,0.757958,0.662624,0.741804,0.796028,0.787598,0.789990,...,0.760539,0.752832,0.719729,0.654001,0.785719,0.803183,0.795835,0.831230,0.778022,0.459001
57,0.386041,0.718125,0.682605,0.752835,0.704080,0.460929,0.552803,0.779339,0.767621,0.712863,...,0.784960,0.814703,0.704042,0.636094,0.724336,0.820354,0.799081,0.828633,0.753436,0.386041
58,0.391132,0.736671,0.657305,0.740680,0.742986,0.451781,0.552811,0.800931,0.793491,0.725612,...,0.836927,0.839299,0.746563,0.660160,0.765709,0.857146,0.844671,0.828408,0.769040,0.391132
59,0.423016,0.721379,0.662757,0.757192,0.738776,0.461578,0.556925,0.805338,0.792330,0.758737,...,0.835353,0.867001,0.745533,0.661098,0.778861,0.891005,0.829942,0.860951,0.784750,0.423016


In [511]:
# keywords_konlpy = [encoding_candidates[0][index].item() for index in distance_konlpy.argsort()[0]]
# keywords_original = [encoding_candidates[0][index].item() for index in distance_original.argsort()[0]]



# print(tokenizer.decode(keywords_original))
# print(tokenizer.decode(keywords_konlpy))

[SEP] [CLS] 비명 때문 분 것 수 이미지 설치치 구현 들 프로젝트 결론 다양 실전 이용 파이 클래스 러닝 진행 실험 책 설명 모델 실습한토 소개 관심 등 활용 문제 학습닝 인공지능 설계 머신 이론 딥 데이터 파트원닝 분류 전처 기초 내용 이해 코드론리링 국민 퍼러러셉트 크청롤
비명 분치 들 이미지토 것 프로젝트 수셉트 때문 파이 다양 딥 설치 머신 책청롤링 러닝 실습 설명 활용 진행 이용러 퍼 전처 구현 실험닝 국민 크러 등한 데이터 인공지능 기초닝 실전 결론 클래스 모델 이해원 코드론 소개 학습 문제 설계 분류 파트 관심리 이론 내용 [SEP] [CLS]


### Sentence Transformers 원리 이해하기