In [47]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle as pkl

In [3]:
from sklearn.decomposition import TruncatedSVD
from soynlp.vectorizer import sent_to_word_contexts_matrix

### 1. 문장 토큰화된 상태의 코퍼스

In [4]:
# Load the sentence-level case corpus; the first CSV column becomes the index.
case_fraud = pd.read_csv(
    filepath_or_buffer='../1_Data/case_fraud_sent.csv',
    index_col=0,
)
case_fraud.head(n=1)

Unnamed: 0,판례일련번호,판례내용_전처리2
0,238021,상고이유를 판단한다. 불가벌적 사후행위에 대한 판단유사수신행위의 규제에 관한 법...


In [29]:
# Materialise the preprocessed text column as a plain Python list of documents.
corpus = list(case_fraud['판례내용_전처리2'])

In [31]:
# Build the (word, context) co-occurrence matrix and its index -> word vocabulary
# from the raw text corpus.
input_matrix, index2voca = sent_to_word_contexts_matrix(
    corpus,
    windows=3,
    min_tf=10,
    dynamic_weight=True,
    verbose=True,
)

# TruncatedSVD uses a randomized solver by default, so without a fixed seed the
# topics printed below change from run to run. Pin the seed for reproducibility.
svd = TruncatedSVD(n_components=10, random_state=42)
vectors = svd.fit_transform(input_matrix)

Create (word, contexts) matrix
  - counting word frequency from 2252 sents, mem=0.762 Gb
  - scanning (word, context) pairs from 2252 sents, mem=0.895 Gb
  - (word, context) matrix was constructed. shape = (20036, 20036)                    
  - done


In [32]:
svd.components_.shape  # (number of topics, i.e. rows of V^T) x (number of words)

(10, 20036)

In [33]:
terms = index2voca # Vocabulary (index -> word) returned with the (word, context) matrix; the recorded run has 20,036 words, not 1,000.

def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: 2-D array of topic loadings, iterated row by row with one
            row per topic (e.g. ``svd.components_``). Each row must provide
            ``argsort``.
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to print per topic.

    Returns:
        None. One ``Topic k: [(term, weight), ...]`` line per topic is written
        to stdout, with weights rounded to five decimals.
    """
    for idx, topic in enumerate(components):
        # argsort is ascending, so reverse it and keep the first n indices.
        top_indices = topic.argsort()[::-1][:n]
        # Convert to built-in floats so the printed list stays readable on
        # NumPy >= 2, where NumPy scalars repr as ``np.float64(...)``.
        top_terms = [(feature_names[i], round(float(topic[i]), 5)) for i in top_indices]
        print("Topic %d:" % (idx + 1), top_terms)
        
# Show the top terms of each SVD component for the raw-text corpus.
get_topics(svd.components_, feature_names=terms)

Topic 1: [('피고인', 0.61875), ('공소외', 0.54379), ('이', 0.25441), ('의', 0.2212), ('은', 0.1356)]
Topic 2: [('공소외', 0.70189), ('회사', 0.12233), ('피해자', 0.10141), ('회사의', 0.0977), ('주식회사', 0.07046)]
Topic 3: [('사건', 0.6609), ('부분', 0.18352), ('피고인', 0.10697), ('공소외', 0.08931), ('사건에서', 0.02088)]
Topic 4: [('사건', 0.643), ('이', 0.63671), ('부분', 0.18834), ('공소사실', 0.06281), ('각', 0.05634)]
Topic 5: [('선고', 0.4751), ('도', 0.45807), ('.', 0.4457), ('판결', 0.41546), ('대법원', 0.20574)]
Topic 6: [('수', 0.49074), ('할', 0.30054), ('없다.', 0.25364), ('있는', 0.181), ('볼', 0.1748)]
Topic 7: [('억', 0.46432), ('만', 0.46096), ('원을', 0.39954), ('원', 0.35655), ('합계', 0.16737)]
Topic 8: [('대한', 0.36371), ('에', 0.29885), ('각', 0.25692), ('같은', 0.2262), ('위와', 0.1891)]
Topic 9: [('공소외', 0.29356), ('피고인', 0.27298), ('대한', 0.16527), ('형사소송법', 0.07693), ('선고', 0.07563)]
Topic 10: [('할', 0.32058), ('없다.', 0.29568), ('있는', 0.25747), ('볼', 0.21004), ('인정할', 0.20351)]


### 2. 단어 토큰화 진행 후 코퍼스

In [34]:
# Load the fully preprocessed case table; the first CSV column becomes the index.
case_fraud = pd.read_csv(
    filepath_or_buffer='../1_Data/case_fraud_prep.csv',
    index_col=0,
)
case_fraud.head(n=1)

Unnamed: 0_level_0,판례일련번호,사건명,사건번호,선고일자,법원명,사건종류명,사건종류코드,판결유형,선고,판례상세링크,판시사항,판결요지,참조조문,참조판례,판례내용,판례내용_전처리,판례내용_전처리_한자,판례내용_전처리_글자수,판례내용_토큰화_코모란,판례내용_토큰화_꼬꼬마
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
20,238021,특정경제범죄가중처벌등에관한법률위반(사기)·사기,2023도12424,2023.11.16,대법원,형사,400102.0,판결,선고,/DRF/lawService.do?OC=yeorii&target=prec&ID=23...,"유사수신행위를 금지·처벌하는 유사수신행위의 규제에 관한 법률 제6조 제1항, 제3...","유사수신행위의 규제에 관한 법률(이하 ‘유사수신행위법’이라 한다) 제6조 제1항, ...","유사수신행위의 규제에 관한 법률 제3조, 제6조 제1항, 형법 제37조, 제347조...",내용없음,【피 고 인】 피고인【상 고 인】 피고인【변 호 인】 변호사 장용배【배상신청인】 배...,상고이유를 판단한다 불가벌적 사후행위에 대한 판단유사수신행위의 규제에 관한 ...,상고이유를 판단한다. 1. 불가벌적 사후행위에 대한 판단「유사수신행위의 규제에...,1066,"['상고', '이유', '를', '판단', '하', 'ㄴ다', '불가벌적 사후행위'...","['상고', '이유', '를', '판단', '하', 'ㄴ다', '불가', '벌', ..."


In [37]:
# Kkma tagger from KoNLPy; kkma.pos() supplies the (text, tag) pairs consumed
# by the noun-extraction loop that follows.
from konlpy.tag import Kkma
kkma = Kkma()

In [40]:
# Reduce every preprocessed document to its 'NNG'-tagged tokens
# (presumably Kkma's common-noun tag -- confirm against the Kkma tag set).
kkmaSent = []

for sent in tqdm(case_fraud.판례내용_전처리):
    nng_tokens = [token for token, tag in kkma.pos(sent) if tag == 'NNG']
    # Every kept token is prefixed with a space, so a non-empty result starts with ' '.
    kkmaSent.append(''.join(' ' + token for token in nng_tokens))

100%|██████████| 2253/2253 [37:28<00:00,  1.00it/s]  


In [48]:
# Persist the noun-only corpus so the slow tagging pass does not have to be repeated.
corpus_path = "../2_Preprocessing/vocab_list/kkmaCorpus.pkl"
with open(corpus_path, mode="wb") as pickle_file:
    pkl.dump(kkmaSent, pickle_file)

In [42]:
# Build the (word, context) co-occurrence matrix and its index -> word vocabulary
# from the noun-only corpus.
input_matrix, index2voca = sent_to_word_contexts_matrix(
    kkmaSent,
    windows=5,
    min_tf=5,
    dynamic_weight=True,
    verbose=True,
)

# TruncatedSVD uses a randomized solver by default, so without a fixed seed the
# topics printed below change from run to run. Pin the seed for reproducibility.
svd = TruncatedSVD(n_components=10, random_state=42)
vectors = svd.fit_transform(input_matrix)

Create (word, contexts) matrix
  - counting word frequency from 2252 sents, mem=0.129 Gb
  - scanning (word, context) pairs from 2252 sents, mem=0.433 Gb
  - (word, context) matrix was constructed. shape = (9500, 9500)                    
  - done


In [43]:
svd.components_.shape  # (number of topics, i.e. rows of V^T) x (number of words)

(10, 9500)

In [45]:
terms = index2voca # Vocabulary (index -> word) returned with the (word, context) matrix; the recorded run has 9,500 words, not 1,000.

def get_topics(components, feature_names, n=15):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: 2-D array of topic loadings, iterated row by row with one
            row per topic (e.g. ``svd.components_``). Each row must provide
            ``argsort``.
        feature_names: Sequence mapping a column index to its term.
        n: Number of top terms to print per topic.

    Returns:
        None. One ``Topic k: [(term, weight), ...]`` line per topic is written
        to stdout, with weights rounded to five decimals.
    """
    for idx, topic in enumerate(components):
        # argsort is ascending, so reverse it and keep the first n indices.
        top_indices = topic.argsort()[::-1][:n]
        # Convert to built-in floats so the printed list stays readable on
        # NumPy >= 2, where NumPy scalars repr as ``np.float64(...)``.
        top_terms = [(feature_names[i], round(float(topic[i]), 5)) for i in top_indices]
        print("Topic %d:" % (idx + 1), top_terms)
        
# Show the top terms of each SVD component for the noun-only corpus.
get_topics(svd.components_, feature_names=terms)

Topic 1: [('피고인', 0.69541), ('공소', 0.36575), ('사실', 0.22215), ('위', 0.21398), ('원심', 0.15981), ('의', 0.13526), ('회사', 0.12585), ('공', 0.11784), ('소외', 0.11331), ('사건', 0.1094), ('이유', 0.10619), ('점', 0.10228), ('인정', 0.10212), ('판결', 0.09263), ('부분', 0.09197)]
Topic 2: [('공소', 0.53087), ('소외', 0.30785), ('회사', 0.30555), ('공', 0.29688), ('외', 0.16781), ('주식회사', 0.15263), ('피해자', 0.05135), ('명의', 0.04826), ('주식', 0.03982), ('대', 0.03485), ('자금', 0.03254), ('은행', 0.02902), ('로', 0.02782), ('이사', 0.02698), ('경', 0.0263)]
Topic 3: [('제조', 0.64139), ('제항', 0.45687), ('형법', 0.26359), ('공소', 0.2528), ('제호', 0.22529), ('회사', 0.09359), ('사실', 0.08974), ('소송법', 0.08834), ('법률', 0.08709), ('형사', 0.07704), ('외', 0.07318), ('범죄', 0.07036), ('전단', 0.06551), ('법', 0.05972), ('가중', 0.05865)]
Topic 4: [('피고인', 0.38245), ('공', 0.24589), ('소외', 0.24578), ('제조', 0.23793), ('제항', 0.15985), ('형법', 0.10956), ('의', 0.10438), ('주식회사', 0.08546), ('은', 0.08192), ('제호', 0.07678), ('경', 0.04986), ('대판', 0.04721), (