## 크롤링 함수

In [None]:
#!pip install -r /content/drive/MyDrive/requirements.txt
# !pip install folium==0.2.1
# !pip install urllib3==1.26.6
!pip install selenium
!apt-get update 
!apt install chromium-chromedriver

In [None]:
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
import os
import re


class GoogleSearcher:
    """Scrape Google result pages through a headless Chrome driver.

    A fresh driver is started for every query: the original author notes
    that without a reset per query the traffic gets the client blocked as a
    bot.
    """

    def __init__(self, implicitly_wait_time=10):
        """Prepare the Chrome options; no browser is started yet.

        Args:
            implicitly_wait_time: Value passed to the driver's
                ``implicitly_wait`` every time a driver is created.
        """
        self.implicitly_wait_time = implicitly_wait_time
        # Must exist before reset_driver()/__del__ read it. Previously it was
        # only assigned inside reset_driver(), so the very first call raised
        # AttributeError.
        self.driver = None

        self.chrome_options = webdriver.ChromeOptions()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")

    def __del__(self):
        self._quit_driver()

    def _quit_driver(self):
        """Quit the current driver, if there is one, and forget it."""
        # getattr: the instance may be torn down before __init__ finished.
        driver = getattr(self, "driver", None)
        if driver:
            self.driver = None
            driver.quit()

    def reset_driver(self):
        """Replace the current driver with a newly started headless Chrome."""
        # If the driver were meant to be reused, it must not be closed here.
        self._quit_driver()

        chrome_driver_path = "chromedriver"

        self.driver = webdriver.Chrome(chrome_driver_path, options=self.chrome_options)
        self.driver.implicitly_wait(self.implicitly_wait_time)

    def get_page_soup(self, word):
        """Load the Google results page for ``word`` and return it parsed.

        Requires a live driver (see ``reset_driver``).
        """
        url = f"https://www.google.com/search?q={quote_plus(word)}"

        self.driver.get(url)

        page_html = self.driver.page_source
        page_soup = BeautifulSoup(page_html, "html.parser")
        return page_soup

    def _load_results_page(self, word):
        """Fetch the results page, following the ``a.gL9Hy`` link if present.

        ``a.gL9Hy`` is presumably Google's suggested-query link (confirm
        against the live markup). When it is present, its text replaces the
        query and the page is fetched again.

        Returns:
            ``(page_soup, word, suggestion)`` where ``word`` is the query that
            was finally used and ``suggestion`` is the link text or ``None``.
        """
        self.reset_driver()  # a fresh driver per query, see class docstring
        page_soup = self.get_page_soup(word)

        suggestion = None
        link = page_soup.select_one('a.gL9Hy')
        if link:
            suggestion = link.text
            word = suggestion
            page_soup = self.get_page_soup(word)

        return page_soup, word, suggestion

    def search(self, word, recursion=False):
        """Collect result titles and snippets for ``word``.

        Args:
            word: Query string.
            recursion: Internal flag; ``True`` on the single retry that is
                made when the first attempt yields no ``.g`` result element.

        Returns:
            ``(texts, word)``: the title and snippet texts in page order, and
            the query that was actually used. ``texts`` is empty when the
            retry also finds nothing (the query is printed in that case).
        """
        page_soup, word, _ = self._load_results_page(word)

        data = page_soup.select(".g")
        if not data:
            if recursion:
                print(word)
            else:
                self._quit_driver()
                sleep(3)
                return self.search(word, True)

        result = []
        for g in data:
            title = g.select_one(".LC20lb")
            if title:
                result.append(title.text)  # result title
            if g.find("div", attrs={"class": "VwiC3b"}):
                result.append(g.select_one(".VwiC3b").text)  # result snippet

        return result, word

    def search_highlighted(self, word, recursion=False):
        """Collect the words Google emphasises (``<em>``) for ``word``.

        Args:
            word: Query string.
            recursion: Internal flag; ``True`` on the single retry that is
                made when the first attempt yields no ``<em>`` element.

        Returns:
            ``(words, suggestion)``: the whitespace-split text of every
            ``<em>`` element, and the ``a.gL9Hy`` link text or ``None`` when
            the page had no such link.
        """
        page_soup, word, modifier = self._load_results_page(word)

        data = page_soup.find_all("em")
        if not data:
            if recursion:
                print(word)
            else:
                self._quit_driver()
                sleep(3)
                return self.search_highlighted(word, True)

        result = []
        for element in data:
            result.extend(element.text.split())

        return result, modifier

In [None]:
# Shared searcher instance; later cells and tokenize() use this global.
google_searcher = GoogleSearcher()
# Demo: titles and snippets returned for an unspaced query.
google_searcher.search('도어밸트끼임수리')

(['안전벨트가 풀리지 않습니다. 어떻게 해야 하나요? DIY 안전 ...',
  '11 okt. 2019 — 짧은 Phillips 드라이버를 사용하여 중간 도어 기둥의 바닥판에 있는 나사 4개를 풉니다. 그런 다음 덮개를 살짝 위로 살짝 당겨 제거합니다. 수리의 다음\xa0...',
  '안전벨트가 풀리지 않습니다. 어떻게 해야 하나요? DIY 안전 ...',
  '11 okt. 2019 — 짧은 Phillips 드라이버를 사용하여 중간 도어 기둥의 바닥판에 있는 나사 4개를 풉니다. 그런 다음 덮개를 살짝 위로 살짝 당겨 제거합니다. 수리의 다음\xa0...',
  '다양한 수리 솔루션! 관성 벨트 오작동의 주요 원인',
  '29 sep. 2019 — 그러나 자가 수리 안전 벨트는 가계 예산에서 상당한 돈을 절약하는 데 도움이 될 것 ... 도어 필러 (중간)의 하단 트림에서 4 개의 볼트가 풀립니다.',
  '수리 정비시 끼임 사고를 예방합시다 - THIS4',
  '10 mrt. 2021 — ○ 기인물별로는 벨트컨베이어, 천장크레인, 지게차 순으로 사망재해가 많이 발생했으며, 방호설비 설치대상 132건 중 미설치로 인한 사망건수가 115건(\xa0...',
  'K5(DL3) 흡습시 아웃핸들 내부 간극이 좁아져 도어 오픈시 리턴 ...',
  '14 okt. 2020 — 자동차 리콜 정보(기아) · K5(DL3) 흡습시 아웃핸들 내부 간극이 좁아져 도어 오픈시 리턴이 안되고 끼임 발생 가능 무상수리 · 댓글 · 자동차 리콜 정보(..',
  '카니발 yp 슬라이딩도어 리콜 - 카카오프렌즈 색칠공부',
  '24 mei 2018 — ... 제작하여 판매한 카니발(YP) 224,615대는 파워 슬라이딩 도어 내 끼임 ... 서비스센터에서 무상으로 수리(소프트웨어 업데이트)를 받을 수 있다.',
  "경 고 [한국소비자원'비충돌사고(Non-Crash Incident) 저감'경고 ...",
  '28 apr. 2021 — 도어/트렁크/유리창을 열고 닫을 때 

In [None]:
# Demo: only the <em>-highlighted words of the result page.
google_searcher.search_highlighted('전파넬보수')

(['전판넬보수',
  '전판넬',
  '판넬보수',
  '보수',
  '보수',
  '전',
  '판넬',
  '보수',
  '판넬',
  '전',
  '판넬',
  '보수',
  '전',
  '판넬',
  '판넬',
  '보수',
  '보수',
  '전',
  '판넬',
  '판넬',
  '보수',
  '판넬',
  '전',
  '전',
  '판넬보수',
  '판넬보수',
  '판넬보수',
  '판넬',
  '전',
  '보수'],
 None)

## 기능 구현

### 제목과 문장 텍스트를 보는 경우

In [None]:
def create_ngram_dict(word):
    """List, for every character position, the substrings that start there.

    ``word`` is split on whitespace. Positions are counted over the
    concatenation of the pieces, and a substring never crosses the boundary
    between two pieces.

    Returns:
        ``(ngram_dict, combo)``: ``ngram_dict[pos]`` holds the substrings
        starting at ``pos`` from shortest to longest, and ``combo`` is the
        input with all whitespace removed.
    """
    pieces = word.split()
    ngram_dict = {}

    offset = 0
    for piece in pieces:
        size = len(piece)
        for begin in range(size):
            ngram_dict[offset + begin] = [
                piece[begin:end] for end in range(begin + 1, size + 1)
            ]
        offset += size

    return ngram_dict, ''.join(pieces)

In [None]:
def create_max_combos(combo, ngram_dict, score_multipe=2):
    """Find the segmentations of ``combo`` with the highest summed score.

    Args:
        combo: The unspaced string being segmented.
        ngram_dict: Maps each start index to a list of ``(token, score)``
            candidates beginning at that index.
        score_multipe: Threshold a candidate's total score must reach to be
            considered; totals equal to the current best are kept as ties.

    Returns:
        ``(best_score, best_sequences, memo)`` where ``memo[i]`` is the
        ``(score, sequences)`` pair computed for the suffix starting at ``i``
        (or ``(-1, None)`` if that suffix was never visited). When no
        candidate reaches the threshold, the score is 0 and the whole suffix
        is kept as a single token.
    """
    total_len = len(combo)
    # memo[i] = (best score, token sequences achieving it); -1 = not computed
    memo = [(-1, None)] * total_len

    def best_from(idx=0):
        if memo[idx][0] != -1:
            return memo[idx][0]

        best = score_multipe
        winners = []
        for token, base in ngram_dict[idx]:
            nxt = idx + len(token)
            assert(nxt <= total_len)

            total = base
            if nxt < total_len:
                total = base + best_from(nxt)

            if total == best:
                winners.append(token)
            elif total > best:
                winners = [token]
                best = total

        if winners:
            sequences = []
            for token in winners:
                nxt = idx + len(token)
                if nxt >= total_len:
                    sequences.append([token])
                else:
                    sequences.extend([token] + tail for tail in memo[nxt][1])
        else:
            # Nothing reached the threshold: keep the rest as one token.
            best = 0
            sequences = [[combo[idx:]]]

        memo[idx] = (best, sequences)
        return memo[idx][0]

    top_score = best_from()
    return top_score, memo[0][1], memo

In [None]:
import re

def tokenize_combo(searcher, combo, multiple=5, m_penalty=1, threshold=None):
    """Segment ``combo`` using word frequencies from search titles/snippets.

    Args:
        searcher: Object whose ``search(combo)`` returns ``(texts, query)``.
        combo: Unspaced string to segment.
        multiple: Score given per occurrence of a token in the search text;
            also passed on as the threshold of ``create_max_combos``.
        m_penalty: Penalty subtracted for a token that never occurs in the
            search text.
        threshold: Intended as a minimum occurrence count above which a token
            is split off even without the top score; not used by the body.

    Returns:
        ``(tokens, score)``: the best-scoring segmentation with the fewest
        tokens (the first one among equals) and its score.
    """
    snippets, combo = searcher.search(combo)
    hangul_only = re.sub(r'[^가-힣]', ' ', ' '.join(snippets))
    words = hangul_only.split()

    ngram_dict, combo = create_ngram_dict(combo)
    for pos in ngram_dict:
        scored = [(tok, words.count(tok) * multiple) for tok in ngram_dict[pos]]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ngram_dict[pos] = [
            (tok, cnt) if cnt > 0 else (tok, cnt - m_penalty)
            for tok, cnt in scored
        ]

    score, max_combo, _ = create_max_combos(combo, ngram_dict, multiple)
    fewest_tokens = min(max_combo, key=len)
    return fewest_tokens, score

In [None]:
# Demo: segment one query with the title/snippet based scorer.
tokenize_combo(google_searcher, '전파넬보수')

(['전', '파넬', '보수'], 70)

### 유틸

In [None]:
!pip install jamo
from jamo import h2j, j2hcj


def calc_distance(a, b):
    """Return the Levenshtein distance between ``a`` and ``b`` at jamo level.

    Identical inputs return 0 immediately. Otherwise both strings are
    decomposed with ``j2hcj(h2j(...))`` and the edit distance (insert, delete
    and substitute each cost 1) is computed on the decomposed strings.
    """
    if a == b:
        return 0

    a, b = j2hcj(h2j(a)), j2hcj(h2j(b))  # decompose into jamo
    if a == "":
        return len(b)
    if b == "":
        return len(a)

    # Only the previous row of the DP table is needed to build the next one.
    previous = list(range(len(b) + 1))
    for row, ac in enumerate(a, start=1):
        current = [row]
        for col, bc in enumerate(b, start=1):
            substitution = previous[col - 1] + (0 if ac == bc else 1)
            current.append(min(previous[col] + 1, current[col - 1] + 1, substitution))
        previous = current

    return previous[-1]

Collecting jamo
  Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Installing collected packages: jamo
Successfully installed jamo-0.4.1


In [None]:
def is_left_more_similar(trg, left, right):
    """Return True when ``left`` is strictly closer to ``trg`` than ``right``.

    Closeness is measured with ``calc_distance``; a tie returns False.
    """
    left_distance = calc_distance(trg, left)
    right_distance = calc_distance(trg, right)
    return left_distance < right_distance

### 구글 자체 강조 단어만을 보는 경우

In [None]:
def combine_max_combo(combo, likely_dict):
    """Pick the token sequences covering ``combo`` with the highest score sum.

    Args:
        combo: The unspaced string to segment.
        likely_dict: Maps a start index in ``combo`` to a list of
            ``(token, score)`` candidates placed at that index. It is read
            with ``likely_dict[idx]`` for every index that gets visited, so
            it has to tolerate missing keys (e.g. a ``defaultdict(list)``).

    Returns:
        The best token sequences sorted by ascending number of tokens, or
        ``None`` (implicitly) when the best total score from index 0 is 0.
    """
    len_combo = len(combo)
    max_combos_list = [(-1, None)] * len_combo  # (best score, token sequences achieving it); -1 = not computed

    def find_max_combos(idx=0):
        # Memoised: each index is solved at most once.
        if max_combos_list[idx][0] == -1:
            tokens = []
            max_score = 0
            # Best candidate that ends exactly at the end of combo.
            single_score, single_token = 0, None
            for token, score in likely_dict[idx]:
                if idx + len(token) > len_combo:
                    # Candidate would run past the end of combo.
                    continue
                elif idx + len(token) < len_combo:
                    score += find_max_combos(idx + len(token))
                    if score <= 0:
                        continue
                else:
                    # Candidate ends exactly at the end; on a score tie keep
                    # the one closer to the remaining text.
                    if score == single_score:
                        if is_left_more_similar(combo[idx:], token, single_token):
                            single_score, single_token = score, token
                    elif score > single_score:
                        single_score, single_token = score, token

                if max_score == score:
                    tokens.append(token)
                elif max_score < score:
                    tokens = [token]
                    max_score = score

            # Expand each winning token with the sequences stored for the
            # position right after it.
            max_combos = []
            for token in tokens:
                if idx + len(token) == len_combo:
                    max_combos.append([token])
                else:
                    for max_combo in max_combos_list[idx + len(token)][1]:
                        max_combos.append([token] + max_combo)

            # Also offer the single closing token when it scored below the best.
            if single_token and single_score < max_score:
                max_combos.append([single_token])
            max_combos_list[idx] = (max_score, max_combos)
        
        return max_combos_list[idx][0]

    if find_max_combos() != 0:
        return sorted(max_combos_list[0][1], key=lambda x: len(x))

In [None]:
from collections import defaultdict

def create_likely_dict(combo, words):
    """Place every distinct word at the position(s) of ``combo`` it fits best.

    For each distinct word, the distance (``calc_distance``) to the
    equally long slice of ``combo`` is measured at every start index; the
    word is registered at all indices sharing the smallest distance.
    Distances above 99 are never accepted.

    Returns:
        A ``defaultdict(list)`` mapping start index to ``(word, count)``
        pairs, ``count`` being the number of occurrences in ``words``.
    """
    likely = defaultdict(list)

    for token in set(words):
        width = len(token)
        # TODO: near the end of combo, try comparing against a truncated token
        distances = [
            calc_distance(token, combo[pos:pos + width])
            for pos in range(len(combo))
        ]
        closest = min(distances, default=None)
        if closest is None or closest > 99:
            continue

        occurrences = words.count(token)
        for pos, distance in enumerate(distances):
            if distance == closest:
                likely[pos].append((token, occurrences))

    return likely

In [None]:
def tokenize_multiple_combo(searcher, combo):
    """Segment ``combo`` using only the words Google highlights for it.

    Returns whatever ``combine_max_combo`` returns for the placement of the
    highlighted words computed by ``create_likely_dict``.
    """
    highlighted, _suggestion = searcher.search_highlighted(combo)
    placements = create_likely_dict(combo, highlighted)
    return combine_max_combo(combo, placements)

In [None]:
# Demo: segment one query with the highlighted-word based scorer.
tokenize_multiple_combo(google_searcher, '공기압보충후')

[['공기압', '보충후'], ['공기압', '보충', '후']]

### 동사등이 섞인 문장에서 추출하는 경우

In [None]:
!pip install sentencepiece
!pip install konlpy
!pip install customized_konlpy

In [None]:
import sentencepiece as spm
from konlpy.tag import Kkma, Komoran
from ckonlpy.tag import Twitter

# Module-level POS taggers used by the functions below.
kkma = Kkma()

okt = Twitter()
# Reset the customized tagger's private pos->words table.
# NOTE(review): presumably this discards the bundled user dictionary — confirm
# against customized_konlpy.
okt.dictionary._pos2words = {}


def add_except_konlpy(okt):
    """Load the custom word lists into ``okt`` and return the Kkma noun list.

    Resets ``okt.dictionary._pos2words``, registers the whitespace-separated
    words of ``okt_nouns.txt`` as nouns plus a few fixed verbs/adverbs, and
    reads ``kkma_nouns.txt``.

    Args:
        okt: The customized tagger to extend; it is modified in place.

    Returns:
        The whitespace-separated words read from ``kkma_nouns.txt``.
    """
    okt.dictionary._pos2words = {}
    # The word lists hold Korean text: pin the encoding instead of relying on
    # the locale default, which is not UTF-8 on every machine.
    with open('/content/drive/MyDrive/customized_konlpy/okt_nouns.txt', 'r', encoding='utf-8') as f:
        nouns_okt = f.read().split()

    with open('/content/drive/MyDrive/customized_konlpy/kkma_nouns.txt', 'r', encoding='utf-8') as f:
        nouns_kkma = f.read().split()

    okt.add_dictionary(nouns_okt, 'Noun')
    okt.add_dictionary(['하니'], 'Verb')
    okt.add_dictionary(['너무', '아예'], 'Adverb')

    return nouns_kkma


# Words from kkma_nouns.txt; split_only_noun_combo() keeps any combo containing one of them.
kkma_filter = add_except_konlpy(okt)

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [None]:
# Demo: compare how the two taggers split the same unspaced string.
s = '전파넬교환'
print(kkma.pos(s))
print(okt.pos(s))

[('전', 'NNG'), ('파넬', 'NNG'), ('교환', 'NNG')]
[('전파', 'Noun'), ('넬', 'Noun'), ('교환', 'Noun')]


In [None]:
# Demo input reused by the following cells: `a` = highlighted words, `m` = suggested query (or None).
s = '정지에서출발할때떨림발생건'
a, m = google_searcher.search_highlighted(s)
print(s)
a

정지에서출발할때떨림발생건


['떨림',
 '에서',
 '떨림',
 '출발',
 '할',
 '때',
 '떨림',
 '에',
 '에서',
 '정지',
 '에',
 '에서',
 '할',
 '출발',
 '할때',
 '정지',
 '에서',
 '떨림',
 '정지',
 '출발',
 '정지할때',
 '할때',
 '할때',
 '떨리는',
 '출발',
 '에서',
 '에',
 '떨림',
 '출발',
 '때',
 '에',
 '할',
 '때는',
 '출발',
 '정지',
 '정지',
 '출발',
 '에',
 '할',
 '때']

In [None]:
from collections import defaultdict

def create_continuous_likely_dict(combo, words):
    """Place search keywords on ``combo`` and fill the uncovered gaps.

    Works in three stages:
      1. every distinct word is registered at the start index/indices where
         its jamo edit distance to the equally long slice of ``combo`` is
         smallest;
      2. a corrected string ``new_combo`` is assembled character by
         character from the chosen keyword at each start index;
      3. keywords that occur in ``new_combo`` are kept as candidates, and
         each remaining uncovered stretch becomes one candidate taken from
         the original ``combo`` with count 0.

    Args:
        combo: The unspaced string to segment.
        words: Keywords (with repetitions) collected from the search.

    Returns:
        ``(result, new_combo)``: ``result`` is a plain dict mapping start
        index to a list of ``(token, count)``; ``new_combo`` is the assembled
        string with the gaps copied from ``combo``.
    """
    tokens_list = defaultdict(list)
    len_combo = len(combo)

    # Stage 1: locate each keyword (shortest keywords first).
    for token in sorted(list(set(words)), key=lambda x: len(x)):
        len_token = len(token)
        min_score = 99
        starts = []
        for i in range(len_combo):
            score = calc_distance(token, combo[i:i+len_token])
            if score == min_score:
                starts.append(i)
            elif score < min_score:
                starts = [i]
                min_score = score
        
        for i in starts:
            # (token, length, occurrences in words, best distance)
            tokens_list[i].append((token, len_token, words.count(token), min_score))

    # Stage 2: build the keyword combination with the smallest jamo edit distance.
    new_combo = [' '] * len_combo

    for s in sorted(tokens_list.keys()):
        # Current pick at this start: (token, length, count, distance).
        mt, ml, mc, ms = ' ', 1, 'except_13', 99

        tokens_list[s].sort(key=lambda x: x[1])
        for token, len_token, count, score in tokens_list[s]:
            if ml == len_token:
                if score < ms:
                    mt, ml, mc, ms = token, len_token, count, score
                # Unhandled case: a different token of the same length with
                # an equal edit distance keeps the earlier pick.
            else: # ml < len_token
                if token[:ml] != mt:
                    # The longer token does not extend the current pick:
                    # compare only its prefix against the same span.
                    local_score = calc_distance(token[:ml], combo[s:s+ml])
                    if local_score < ms:
                        mt, ml, mc, ms = token, len_token, count, score
                    # Unhandled case: an equal local distance keeps the
                    # earlier pick.
                else:
                    # The longer token extends the current pick: take it.
                    mt, ml, mc, ms = token, len_token, count, score

        # Write the pick into new_combo; stop at the first position where an
        # already written character is closer to the original one.
        for i, char in enumerate(mt[:len_combo-s], start=s):
            if new_combo[i] != ' ' and char != new_combo[i]:
                if calc_distance(new_combo[i], combo[i]) < calc_distance(char, combo[i]):
                    break
            new_combo[i] = char

    new_combo = ''.join(new_combo)

    # Stage 3: keep the keywords found in new_combo, then fill the empty
    # stretches (the fillers get a count of 0).
    check_list = [False] * len_combo
    result = dict()  # plain dict rather than defaultdict(list)

    for i, tokens in tokens_list.items():
        temp = []
        max_len = 0
        for token, len_token, count, _ in tokens:
            if token in new_combo:
                temp.append((token, count))
                max_len = max(max_len, len_token)

        if len(temp) > 0:
            result[i] = temp
            # Mark the span of the longest kept keyword as covered.
            for j in range(i, min(i+max_len, len_combo)):
                check_list[j] = True

    for i, b in enumerate(check_list):
        if b == False:
            # Extend the gap up to the next covered position.
            e = i + 1
            for j in range(e, len_combo):
                if check_list[j]:
                    break
                e += 1
                check_list[j] = True
            # Restore the original characters inside the gap.
            new_combo = new_combo[:i] + combo[i:e] + new_combo[e:]
            assert(i not in result.keys())
            result[i] = [(combo[i:e], 0)]

    return result, new_combo

In [None]:
# Demo: place the highlighted words on the query; note that `s` is rebound to the assembled string.
b, s = create_continuous_likely_dict(s, a)
print(s)
b

정지에서출발할때떨림발생건


{0: [('정지', 5)],
 2: [('에', 5), ('에서', 5)],
 4: [('출발', 7)],
 6: [('할', 4), ('할때', 3)],
 7: [('때', 3)],
 8: [('떨림', 5)],
 10: [('발생건', 0)]}

In [None]:
# Okt POS tags treated as non-noun by fill_tokens, extract_nouns and split_only_noun_combo.
delete_tags = ['Adjective', 'Adverb', 'Conjunction', 'Eomi', 'Josa', 'Verb', 'VerbPrefix']

def fill_tokens(combo, likely_dict, pos_tags):
    """Repair ``likely_dict`` so that candidate tokens chain without gaps.

    Each pass looks at every start index. A start that no accepted token
    ends at, and a candidate whose end is not itself a start, are either
    bridged with a filler token (score 0) cut from ``combo`` or removed,
    depending on whether the neighbouring POS tags are in ``delete_tags``.
    Passes repeat until one changes nothing.

    Args:
        combo: The string being segmented.
        likely_dict: Start index -> list of ``(token, score)``; modified in
            place.
        pos_tags: ``(text, tag)`` pairs; their texts are walked by length to
            recover character offsets in ``combo``.

    Returns:
        The same ``likely_dict`` object after the repairs.

    Raises:
        Exception: if changes are still being made after 10 passes.
    """
    for _ in range(10): # bounded loop instead of `while`, so a bad case cannot loop forever
        add_list = []
        remove_list = []

        starts = set(likely_dict.keys())
        starts.add(len(combo))
        ends = {0,}

        for s in sorted(likely_dict.keys()):
            if s not in ends:   # author's example: 완료입니다 -> (완료x)입니다
                before = None
                is_before_noun = False
                is_after_noun = False
                ts, te, tl = 0, 0, 0
                for i, (t, p) in enumerate(pos_tags):
                    te = ts + len(t)
                    if te == s: # preceding tag (the one ending at this index)
                        # Walk backwards until a known start is reached.
                        ls = te
                        for t, p in pos_tags[:i+1][::-1]:
                            if not p in delete_tags:
                                is_before_noun = True
                            ls -= len(t)
                            if ls in starts:
                                before = (ls, combo[ls:te])
                                break
                    elif ts == s: # following tag (the one starting at this index)
                        # Walk forwards until a kept tag or a known start.
                        le = ts
                        for t, p in pos_tags[i:]:
                            if not p in delete_tags:
                                is_after_noun = True
                                break
                            le += len(t)
                            if le in starts:
                                break
                        break
                    ts = te

                if before and (is_before_noun == False or is_after_noun == False):
                    add_list.append(before) # (start key, token)
                else:
                    remove_list.append((s, None)) # (start key, marker meaning "remove everything")
                    starts.remove(s)
                    # NOTE(review): the loop below still runs for this `s`
                    # and may queue (s, j) as well; the removal step would
                    # then index a deleted key — confirm.
                        
            for j, (token, _) in enumerate(likely_dict[s]):
                e = len(token) + s
                
                if e not in starts: # author's example: 수동모드에서는 -> 수동/모드에서는(에서는x)
                    after = None
                    is_after_noun = False
                    ts, te, tl = 0, 0, 0
                    for i, (t, p) in enumerate(pos_tags):
                        if ts == e: # following tag (the one starting at this index)
                            le = ts
                            for t, p in pos_tags[i:]:
                                if not p in delete_tags:
                                    is_after_noun = True
                                    break
                                le += len(t)
                                if le in starts:
                                    break
                            if is_after_noun == False:
                                after = (e, combo[e:le])
                            break
                        ts += len(t)

                    if after:
                        add_list.append(after) # (start key, token)
                        ends.add(e)
                    else:
                        remove_list.append((s, j)) # (start key, index in the candidate list)
                else:
                    ends.add(e)

        # Apply the queued removals, then the queued additions.
        # NOTE(review): several (s, j) entries for the same `s` are popped in
        # ascending order, so later indices look shifted by earlier pops —
        # confirm.
        for s, idx in remove_list:
            if idx == None:
                del(likely_dict[s])
            else:
                likely_dict[s].pop(idx)
                if len(likely_dict[s]) == 0:
                    del(likely_dict[s])

        for s, token in add_list:
            if s in likely_dict.keys():
                likely_dict[s].append((token, 0))
            else:
                likely_dict[s] = [(token, 0)]

        if len(remove_list) == 0 and len(add_list) == 0:
            break
    else:
        # Still changing after 10 passes.
        raise Exception()

    return likely_dict

In [None]:
# Demo: rebuild the candidates, tag the assembled string and repair the candidates.
b, s = create_continuous_likely_dict(s, a)
p = okt.pos(s)
print(b)
fill_tokens(s, b, p)

{6: [('할', 4), ('할때', 3)], 2: [('에', 5), ('에서', 5)], 7: [('때', 3)], 4: [('출발', 7)], 8: [('떨림', 5)], 0: [('정지', 5)], 10: [('발생건', 0)]}


{0: [('정지', 5)],
 2: [('에서', 5)],
 4: [('출발', 7)],
 6: [('할', 4), ('할때', 3)],
 7: [('때', 3)],
 8: [('떨림', 5)],
 10: [('발생건', 0)]}

In [None]:
def create_combo_list(combo, likely_dict):
    """Enumerate every token chain through ``likely_dict`` that covers ``combo``.

    Args:
        combo: The string being segmented.
        likely_dict: Start index -> list of ``(token, score)`` candidates.

    Returns:
        A list of chains sorted by descending number of tokens. Each chain
        is a list of ``(start, token, score)`` where ``score`` is the
        token's own score plus the best score of the remainder after it.
    """
    total_len = len(combo)
    # memo[i] = (best score from i, chains starting at i); None = not computed
    memo = [(None, None)] * total_len

    def best_from(idx=0):
        if memo[idx][0] is not None:
            return memo[idx][0]

        scored = []
        best = -1
        for token, score in likely_dict[idx]:
            nxt = idx + len(token)
            if nxt > total_len:  # runs past the end
                # Assumed to have been removed while likely_dict was built.
                print(f'except 16 : {combo}')
            elif nxt < total_len:
                tail_score = best_from(nxt)
                if tail_score < 0:
                    # Nothing usable follows this token.
                    continue
                score += tail_score

            scored.append((token, score))
            best = max(best, score)

        chains = []
        for token, score in scored:
            nxt = idx + len(token)
            head = (idx, token, score)
            if nxt == total_len or len(memo[nxt][1]) == 0:
                chains.append([head])
            else:
                for tail in memo[nxt][1]:
                    chains.append([head] + tail)

        memo[idx] = (best, chains)
        return memo[idx][0]

    best_from()
    return sorted(memo[0][1], key=lambda chain: -len(chain))

In [None]:
# Demo: full pipeline on the sample query, ending with every covering token chain.
b, s = create_continuous_likely_dict(s, a)
c = fill_tokens(s, b, p)
create_combo_list(s, c)

[[(0, '정지', 29),
  (2, '에서', 24),
  (4, '출발', 19),
  (6, '할', 12),
  (7, '때', 8),
  (8, '떨림', 5),
  (10, '발생건', 0)],
 [(0, '정지', 29),
  (2, '에서', 24),
  (4, '출발', 19),
  (6, '할때', 8),
  (8, '떨림', 5),
  (10, '발생건', 0)]]

In [None]:
def tokenize_all_case(searcher, combo):
    """Segment ``combo`` in every way supported by the search keywords.

    TODO: the Google search step must change. The suggested query can be a
    wrong one (시동불가상태 -> 이동불가상태). Search with the original query
    first and fall back to the suggested query only when gaps remain.

    Args:
        searcher: Object whose ``search_highlighted(combo)`` returns
            ``(words, suggestion_or_None)``.
        combo: The unspaced string to segment.

    Returns:
        A 4-tuple whose shape depends on the outcome:
          * success: ``(new_combo, chains, pos_tags, starts)``;
          * single character: ``(combo, [[(0, combo, -1)]], None, 'one_char')``;
          * failure: ``(new_combo_or_None, tag, None, None)`` with ``tag`` one
            of ``'search_failed'``, ``'mod_except'``, ``'try_except'``.
    """
    if len(combo) == 1:
        return combo, [[(0, combo, -1)]], None, 'one_char'

    # `except Exception` rather than a bare `except`, so Ctrl-C
    # (KeyboardInterrupt) and SystemExit still stop a long crawl.
    try:
        words, modifier = searcher.search_highlighted(combo)
    except Exception:
        return None, 'search_failed', None, None
    if not words:   # the search returned nothing
        return None, 'search_failed', None, None

    combo_search = re.sub(r'\s+', '', modifier) if modifier else combo

    # Drop keywords containing Latin letters.
    words = [w for w in words if not re.search(r'[a-zA-Z]', w)]
    likely_dict, new_combo = create_continuous_likely_dict(combo_search, words)

    if modifier:
        if combo_search != new_combo:
            print(f'mod_except : {combo} -> {new_combo} != {modifier}')
            return new_combo, 'mod_except', None, None
        pos_tags = okt.pos(modifier)
    else:
        pos_tags = okt.pos(new_combo)

    # TODO: for length 7-8 or more, when both konlpy (at a non-noun) and the
    # search split the string in the middle, try splitting the sentence,
    # e.g. 차량지상으로견인조치 -> 차량지상으로 / 견인조치, and tokenize each part.

    try:
        likely_dict = fill_tokens(new_combo, likely_dict, pos_tags)
        result = create_combo_list(new_combo, likely_dict)
        starts = sorted(likely_dict.keys())
    except Exception:
        return new_combo, 'try_except', None, None

    return new_combo, result, pos_tags, starts

In [None]:
# Demo: (assembled string, token chains, POS tags, start indices) for the sample query.
tokenize_all_case(google_searcher, '정지에서출발할때떨림발생건')

('정지에서출발할때떨림발생건',
 [[(0, '정지', 29),
   (2, '에서', 24),
   (4, '출발', 19),
   (6, '할', 12),
   (7, '때', 8),
   (8, '떨림', 5),
   (10, '발생건', 0)],
  [(0, '정지', 29),
   (2, '에서', 24),
   (4, '출발', 19),
   (6, '할때', 8),
   (8, '떨림', 5),
   (10, '발생건', 0)]],
 [('정지', 'Noun'),
  ('에서', 'Josa'),
  ('출발', 'Noun'),
  ('할', 'Verb'),
  ('때', 'Noun'),
  ('떨림', 'Verb'),
  ('발생', 'Noun'),
  ('건', 'Noun')],
 [0, 2, 4, 6, 7, 8, 10])

In [None]:
def extract_nouns(searcher, combo):
    """Tokenize ``combo`` and blank out the tokens that are not nouns.

    Runs ``tokenize_all_case`` and then, in every token chain, replaces each
    token overlapping a non-noun POS span by ``(start, None, None)``. The
    chains are modified in place.

    Returns:
        ``(new_combo, tokens_list)`` on success or for a single character;
        ``(None, failure_tag)`` when ``tokenize_all_case`` reported a failure
        (its second element is then a string).
    """
    new_combo, tokens_list, pos_tags, starts = tokenize_all_case(searcher, combo)

    if isinstance(tokens_list, str):
        # Failure: the second element carries the failure tag.
        return None, tokens_list
    elif starts == 'one_char':
        return new_combo, tokens_list

    # (start, end, text) of POS tokens that should be dropped.
    probable_info = []

    # pi/ps: index into pos_tags and the character offset of that tag; both
    # carry over between start positions.
    pi, ps = 0, 0
    for s in starts:
        find_start = True
        for t, p in pos_tags[pi:]:
            pe = ps + len(t)
            if find_start:
                # Advance until the tag that begins exactly at `s`.
                if s == ps:
                    find_start = False
                elif s < ps:
                    break
                pi += 1
            else:
                # Look at the tags after the one starting at `s`: stop at
                # the first kept tag; record a dropped tag that begins at a
                # known start.
                if p not in delete_tags:
                    break
                if ps in starts:
                    probable_info.append((ps, pe, t))
                    break
            ps = pe

    for tokens in tokens_list:
        pi = 0
        for i, (s, token, _) in enumerate(tokens):
            is_delete = False
            for ps, pe, pt in probable_info[pi:]:
                if s == ps:
                    is_delete = True
                    break
                elif s < ps:
                    # Token starts before the span: drop it only if it
                    # reaches into the span.
                    if s + len(token) > ps:
                        is_delete = True
                    break
                elif s > ps:
                    # Token starts inside the span.
                    if s < pe:
                        is_delete = True
                        break
                pi += 1
            if is_delete:
                tokens[i] = (s, None, None)

    return new_combo, tokens_list

In [None]:
# Demo: noun extraction on a compound place name.
extract_nouns(google_searcher, '거제시외버스터미널')

('거제시외버스터미널',
 [[(0, '거제', 20), (2, '시외', 15), (4, '버스', 10), (6, '터미널', 6)],
  [(0, '거제', 20), (2, '시외', 15), (4, '버스터미널', 4)],
  [(0, '거제', 20), (2, '시외버스', 7), (6, '터미널', 6)],
  [(0, '거제', 20), (2, '시외버스터미널', 2)],
  [(0, '거제시외버스터미널', 2)]])

## 토큰화 및 명사추출

### 파일열기 및 유틸 기능

In [None]:
# Load the whitespace-separated strings to process.
with open('/content/drive/MyDrive/split_list.txt', 'r') as f:
    split_list = f.read().split()

print(len(split_list))

7955


In [None]:
# First letters of Kkma tags treated as non-noun by split_only_noun_combo.
delete_tags_kkma = ['V', 'J', 'M', 'E']

def split_only_noun_combo(data):
    """Separate the strings in which either tagger finds no kept tag.

    A string is kept when both Okt and Kkma produce at least one tag outside
    their respective delete lists, or when it contains any word of
    ``kkma_filter``.

    Returns:
        ``(combo_list, only_non_noun_combo_list)``: the kept strings, and
        ``(string, kkma_tags)`` pairs for the rejected ones.
    """
    kept = []
    rejected = []

    for combo in data:
        tokens_okt = okt.pos(combo, norm=True)
        okt_has_noun = any(tag not in delete_tags for _, tag in tokens_okt)

        tokens_kkma = kkma.pos(combo)
        kkma_has_noun = any(tag[0] not in delete_tags_kkma for _, tag in tokens_kkma)

        if okt_has_noun and kkma_has_noun:
            kept.append(combo)
        elif any(word in combo for word in kkma_filter):
            # Rescued by the custom Kkma noun list.
            kept.append(combo)
        else:
            rejected.append((combo, tokens_kkma))

    return kept, rejected

In [None]:
# Filter the loaded strings; `combo_list` is also read as a global by save().
combo_list, only_non_noun_combo_list = split_only_noun_combo(split_list)
print(len(combo_list))
print(len(only_non_noun_combo_list))

7101
854


In [None]:
def clean_nouns_list(nouns_list):
    """Group the noun tokens of each entry into contiguous noun stretches.

    For every ``(src, trg, tokens_list)`` entry, the first chain
    ``tokens_list[0]`` is cut at the tokens whose text is ``None``. For each
    resulting stretch the matching sub-chains of the other chains are
    collected as alternatives, skipping those whose start indices equal an
    alternative already collected.

    Args:
        nouns_list: Iterable of ``(src, trg, tokens_list)`` where
            ``tokens_list`` is a list of chains of ``(start, token, score)``.

    Returns:
        A list of ``(src, group)`` where ``group`` holds
        ``(src[s:e], trg[s:e], alternatives)`` for every non-empty stretch.
        NOTE(review): ``src`` is sliced with offsets taken from ``trg`` —
        presumably both have the same length; confirm.
    """
    result = []

    for src, trg, tokens_list in nouns_list:
        local_group = []
        si = 0
        reset = True
        group = []
        for ei, (e, token, _) in enumerate(tokens_list[0]):
            if reset:
                # Open a new stretch at this token.
                s = e
                si = ei
                reset = False
            if token != None:
                if ei == len(tokens_list[0]) - 1:
                    # Last token: close the stretch at the end of the string.
                    ei += 1
                    e = len(trg)
                else:
                    continue

            # Reached a blanked token (or the end): close the stretch [s, e).
            reset = True        
            local_group = [tokens_list[0][si:ei]]

            for tokens in tokens_list[1:]:
                lsi = None
                for lei, (ls, t, _) in enumerate(tokens):
                    if ls == s:
                        if t == None:
                            break
                        lsi = lei
                    if ls > e:
                        break
                    elif ls < e:
                        if lei == len(tokens) - 1:
                            lei += 1
                        else:
                            continue

                    if lsi == None:
                        # This chain has no token starting at `s`.
                        break

                    local_tokens = tokens[lsi:lei]
                    
                    # Compare against the alternatives collected so far by
                    # start index only.
                    is_unique = True
                    for lt in local_group:
                        if lt == None:
                            is_unique =False
                            break
                        if len(lt) != len(local_tokens):
                            continue
                        for a, b in zip(lt, local_tokens):
                            if a[0] != b[0]:
                                break
                        else:   # loop finished without a mismatch: an identical alternative already exists
                            is_unique = False
                            break
                    
                    if is_unique and len(local_tokens) > 0:
                        local_group.append(local_tokens)

            if len(local_group[0]) > 0:
                group.append((src[s:e], trg[s:e], local_group))
        result.append((src, group))

    return result

In [None]:
from tqdm.auto import tqdm


def tokenize(combo_list, s, e):
    """Run noun extraction over ``combo_list[s:e]`` and collect the outcomes.

    Uses the module-level ``google_searcher``. Every time the running index
    is a multiple of 1000 (and not ``s``) the partial results are written
    with ``save``. Two crawl failures on consecutive indices stop the run
    early; the second failing string is not recorded, and the returned end
    index is its index.

    Args:
        combo_list: Strings to process.
        s: Index of the first string to process.
        e: Index one past the last string to process.

    Returns:
        ``(data, s, e)`` where ``data`` has the keys ``'nouns_list'``
        (``(combo, new_combo, chains)`` triples), ``'zero_noun_list'`` and
        ``'except_combo_list'``, and ``e`` is the index at which processing
        stopped.
    """
    nouns_list = []
    zero_noun_list = []
    except_combo_list = []
    # Index of the most recent crawl failure; None until one happens. This
    # used to start at 0, so a first failure at index 1 was mistaken for two
    # failures in a row and aborted the run.
    before_except_idx = None

    def snapshot():
        return {'nouns_list':nouns_list,
                'zero_noun_list':zero_noun_list,
                'except_combo_list':except_combo_list}

    for i, combo in enumerate(tqdm(combo_list[s:e]), start=s):
        new_combo, tokenized_result = extract_nouns(google_searcher, combo)
        if new_combo:
            nouns_list.append((combo, new_combo, tokenized_result))
        elif tokenized_result == 'zero_noun': # no noun at all
            zero_noun_list.append(combo)
        elif tokenized_result == 'no_minimal_element': # the finest tokenization does not cover the others
            except_combo_list.append(combo)
        elif tokenized_result == 'search_failed': # crawling failed
            if before_except_idx is not None and before_except_idx + 1 == i:
                print(f'close : {combo}, end index : {i}')
                e = i
                break
            before_except_idx = i
            except_combo_list.append(combo)
        else:
            # Any other failure tag (e.g. 'mod_except', 'try_except') was
            # silently dropped before; record it so no input goes missing.
            except_combo_list.append(combo)

        if i%1000 == 0 and i != s:
            save(snapshot(), s, i)

    print(f'nouns_list length: {len(nouns_list)}')
    print(f'zero_noun_list length: {len(zero_noun_list)}')
    print(f'except_combo_list length: {len(except_combo_list)}')

    return snapshot(), s, e

In [None]:
import pickle

def save(data, s, e, path='/content/drive/MyDrive/save/extracted'):
    """Pickle the raw results and their cleaned noun groups.

    Two files are written, both named after ``path``, the length of the
    global ``combo_list`` and the range ``s``..``e``: the raw ``data`` and
    the output of ``clean_nouns_list(data['nouns_list'])``.

    Returns:
        The cleaned noun list.
    """
    total = len(combo_list)

    raw_file = f'{path}_{total}_{s}_{e}.pickle'
    with open(raw_file, 'wb') as out:
        pickle.dump(data, out)

    cleaned = clean_nouns_list(data['nouns_list'])

    nouns_file = f'{path}_nouns_{total}_{s}_{e}.pickle'
    with open(nouns_file, 'wb') as out:
        pickle.dump(cleaned, out)

    return cleaned

### 작업

In [None]:
# Process one slice of combo_list and persist it; `e` comes back smaller if the run stopped early.
s = 0
e = 100
data, s, e = tokenize(combo_list, s, e)
nouns_list = save(data, s, e)
print(s)
print(e)

  0%|          | 0/100 [00:00<?, ?it/s]

nouns_list length: 98
zero_noun_list length: 0
except_combo_list length: 2
0
100


In [None]:
# Display the cleaned noun groups returned by save().
nouns_list

[('천안정비사업소', [('천안정비사업소', '천안정비사업소', [[(0, '천안정비사업소', 11)]])]),
 ('포켓', [('포켓', '포켓', [[(0, '포켓', 7)]])]),
 ('블랙박스카드', [('블랙박스카드', '블랙박스카드', [[(0, '블랙박스', 30), (4, '카드', 15)]])]),
 ('차액', [('차액', '차액', [[(0, '차액', 6)]])]),
 ('단말기신호', [('단말기신호', '단말기신호', [[(0, '단말기', 21), (3, '신호', 13)]])]),
 ('사제', [('사제', '사제', [[(0, '사제', 9)]])]),
 ('담배냄새', [('담배냄새', '담배냄새', [[(0, '담배', 23), (2, '냄새', 12)]])]),
 ('진입', [('진입', '진입', [[(0, '진입', 15)]])]),
 ('범버', [('범버', '범버', [[(0, '범버', 12)]])]),
 ('카수리', [('카수리', '카수리', [[(0, '카수리', 7)]])]),
 ('연결선', [('연결선', '연결선', [[(0, '연결선', 16)]])]),
 ('시동확인완료',
  [('시동확인완료', '시동확인완료', [[(0, '시동', 35), (2, '확인', 23), (4, '완료', 12)]])]),
 ('퓨즈교환하니', [('퓨즈교환', '퓨즈교환', [[(0, '퓨즈', 25), (2, '교환', 12)]])]),
 ('주차완료',
  [('주차완료', '주차완료', [[(0, '주차', 13), (2, '완료', 6)], [(0, '주차완료', 1)]])]),
 ('앞유리에', [('앞유리', '앞유리', [[(0, '앞', 13), (1, '유리', 10)], [(0, '앞유리', 9)]])]),
 ('테스트주행시', [('테스트주행시', '테스트주행시', [[(0, '테스트', 9), (3, '주행시', 1)]])]),
 ('시도', [('시도', '시도', [[(0, 

In [None]:
import glob 

def merge_file_and_save(condition_path):
    """Concatenate every ``{condition_path}_*`` file into ``{condition_path}.txt``.

    Each part is preceded by a newline. Parts are merged in sorted path
    order so the result is reproducible (``glob`` itself returns matches in
    arbitrary order).

    Args:
        condition_path: Common path prefix of the part files. Glob
            metacharacters in it are matched literally.
    """
    parts = []
    pattern = f'{glob.escape(condition_path)}_*'
    for filepath in sorted(glob.glob(pattern)):
        # Pin UTF-8: the parts hold Korean text and the locale default
        # encoding differs between machines.
        with open(filepath, 'r', encoding='utf-8') as f:
            parts.append('\n' + f.read())

    with open(f'{condition_path}.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

In [None]:
# merge_file_and_save('/content/drive/MyDrive/extracted_nouns_7251')
# merge_file_and_save('/content/drive/MyDrive/extracted_infos_7251')
# merge_file_and_save('/content/drive/MyDrive/extracted_except_7251')