### 기본 세팅

In [1]:
import os
import torch
import numpy as np
import pandas as pd
import re
import ast

from tqdm.notebook import tqdm
from kss import split_sentences

from sentence_transformers import SentenceTransformer, util

[Korean Sentence Splitter]: Initializing Kss...


In [2]:
# Enumerate GPUs in PCI bus order and expose only device index 2 to this process.
# These variables must be set before any CUDA context is created.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

### 데이터 로드

In [3]:
# Load the five CSV datasets. Judging by the file names these are the original
# case texts plus Korean-tag / English-tag variants applied at 5% and 10%.
df_original    = pd.read_csv('./dataset/사건원문.csv')
df_kor_ratio5  = pd.read_csv('./dataset/한글태그_5%적용.csv')
df_kor_ratio10 = pd.read_csv('./dataset/한글태그_10%적용.csv')
df_eng_ratio5  = pd.read_csv('./dataset/영어태그_5%적용.csv')
df_eng_ratio10 = pd.read_csv('./dataset/영어태그_10%적용.csv')

# Dataset keys. Later cells resolve the module-level names `df_<key>` through
# globals(), so every key here must match one of the dataframe names above.
datanames = ['original', 'kor_ratio5', 'kor_ratio10', 'eng_ratio5', 'eng_ratio10']

In [4]:
# Convert stringified Python literals back into real objects
def str2list(textdata, attribute):
    """Parse every entry of ``textdata[attribute]`` with ``ast.literal_eval``.

    Args:
        textdata: mapping-like object (e.g. a DataFrame) indexable by ``attribute``.
        attribute: key/column whose values are strings holding Python literals.

    Returns:
        A list with the evaluated literal for each entry, in iteration order.
    """
    return list(map(ast.literal_eval, textdata[attribute]))


# Parse the '내용' column of each loaded dataframe and publish the result
# as the module-level name `<dataname>_content`.
for dataname in datanames:
    globals()[f'{dataname}_content'] = str2list(globals()[f'df_{dataname}'], '내용')

### ngram 세팅

In [5]:
# Convert the input content into lists of sentence n-grams
def content2ngram(content, n):
    """Join every run of ``n`` consecutive sentences of each case into one string.

    Args:
        content: iterable of cases; each case is a list of sentence strings.
        n: number of consecutive sentences per n-gram (must be >= 1).

    Returns:
        A list with one entry per case. Each entry lists the n-grams of that
        case in order, the sentences of an n-gram being joined by one space.
        A case with fewer than ``n`` sentences yields an empty list.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f'n must be >= 1, got {n}')

    ngram_list = []     # n-grams for the whole corpus
    for case in content:
        # A case of k sentences has k - n + 1 windows of size n. Using k - n
        # would silently drop the last n-gram of every case.
        window_count = len(case) - n + 1
        ngram_list.append([' '.join(case[i:i + n]) for i in range(window_count)])
    return ngram_list

In [6]:
# Build the 2-, 3- and 4-gram versions of every dataset and publish each one
# as the module-level name `<dataname>_content_<n>gram`.
for dataname in datanames:
    for n in (2, 3, 4):
        globals()[f'{dataname}_content_{n}gram'] = content2ngram(globals()[f'{dataname}_content'], n)

### sentence transformer 임베딩

In [7]:
class SentenceTagger:
    """Embed sentences case by case with a SentenceTransformer model.

    Usage: create an instance, call ``set_model()`` once, then call
    ``sentence_embedding(content)``.
    """

    def __init__(self):
        # Prefer the GPU, but fall back to the CPU so the class still works
        # on machines where CUDA is not available.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.embedding_model = None     # populated by set_model()

    # Load the SentenceTransformer model
    def set_model(self):
        """Load the SentenceTransformer checkpoint onto ``self.device``."""
        self.embedding_model = SentenceTransformer('./KoSentenceBERT_SKTBERT/output/training_con',
                                                   device=self.device)

    # Embed the whole corpus, case by case
    def sentence_embedding(self, content):
        """Encode every sentence of every case.

        Args:
            content: iterable of cases; each case is an iterable of sentences
                that are passed one at a time to the model's ``encode``.

        Returns:
            An object-dtype NumPy array built from the per-case arrays, each
            of which is an object-dtype array of that case's sentence
            embeddings.

        Raises:
            RuntimeError: if ``set_model()`` has not been called yet.
        """
        if self.embedding_model is None:
            raise RuntimeError('embedding model is not loaded; call set_model() first')

        content_embedding = []     # embeddings for the whole corpus
        for case in tqdm(content):
            case_embedding = []     # embeddings for one case
            for sent in case:
                sent_embedding = self.embedding_model.encode(sent, device=self.device)     # one sentence
                case_embedding.append(np.array(sent_embedding, dtype=object))
            content_embedding.append(np.array(case_embedding, dtype=object))
        # dtype=object lets cases with different sentence counts share one array.
        return np.array(content_embedding, dtype=object)

In [8]:
# Create the sentence-embedding helper and load its model
# (set_model() must run before sentence_embedding() is used).
sent_tagger = SentenceTagger()
sent_tagger.set_model()

using cached model
using cached model
using cached model
Load Model


In [9]:
# Embed every dataset's 2-, 3- and 4-gram content and publish each result
# as the module-level name `<dataname>_<n>gram_embedding`.
for dataname in datanames:
    for n in (2, 3, 4):
        globals()[f'{dataname}_{n}gram_embedding'] = sent_tagger.sentence_embedding(
            globals()[f'{dataname}_content_{n}gram']
        )
        print(f' {dataname} {n}gram embedding finished')

  0%|          | 0/341 [00:00<?, ?it/s]

 original 2gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 original 3gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 original 4gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 kor_ratio5 2gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 kor_ratio5 3gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 kor_ratio5 4gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 kor_ratio10 2gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 kor_ratio10 3gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 kor_ratio10 4gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 eng_ratio5 2gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 eng_ratio5 3gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 eng_ratio5 4gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 eng_ratio10 2gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 eng_ratio10 3gram embedding finished


  0%|          | 0/341 [00:00<?, ?it/s]

 eng_ratio10 4gram embedding finished


In [10]:
# Save every dataset's n-gram embeddings as .npy files under ./save_embeddings.
# Create the output directory first: without it np.save raises and the
# embeddings computed above would have to be produced again.
os.makedirs('./save_embeddings', exist_ok=True)
for dataname in datanames:
    for n in [2, 3, 4]:
        cur_ngram_embedding = globals()[f'{dataname}_{n}gram_embedding']
        # Object-dtype arrays are stored via pickle, so loading them later
        # requires np.load(..., allow_pickle=True).
        np.save(f'./save_embeddings/{dataname}_{n}gram_embedding.npy', np.array(cur_ngram_embedding, dtype=object))
        print(f' {dataname} {n}gram embedding saved')

 original 2gram embedding saved
 original 3gram embedding saved
 original 4gram embedding saved
 kor_ratio5 2gram embedding saved
 kor_ratio5 3gram embedding saved
 kor_ratio5 4gram embedding saved
 kor_ratio10 2gram embedding saved
 kor_ratio10 3gram embedding saved
 kor_ratio10 4gram embedding saved
 eng_ratio5 2gram embedding saved
 eng_ratio5 3gram embedding saved
 eng_ratio5 4gram embedding saved
 eng_ratio10 2gram embedding saved
 eng_ratio10 3gram embedding saved
 eng_ratio10 4gram embedding saved
