In [1]:
import spacy
import pandas as pd
import numpy as np
import re

from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.language import Language
from spacy.tokens.span import Span

In [2]:
nlp = spacy.load("en_core_web_lg") # large model

In [3]:
# Load the train/test CSVs without their 'ID' column, plus the sample submission as-is.
train_df = pd.read_csv('../Data/train.csv').drop(columns='ID')
test_df = pd.read_csv('../Data/test.csv').drop(columns='ID')
submission_df = pd.read_csv('../Data/sample_submission.csv')

In [4]:
@Language.component("no_possesive")  # register the function below as the component "no_possesive"
def no_possesive(doc):
    """Pipeline component that rewrites ``doc.ents`` without possessive endings.

    The replacement entities come from ``_no_possesive_generator``; the same
    ``doc`` object is updated and returned.
    """
    stripped_entities = _no_possesive_generator(doc)
    doc.ents = stripped_entities
    return doc

def _no_possesive_generator(doc):
    """Yields non possessive versions of the given document's entities."""
    for ent in doc.ents:
        if ent.text.endswith("'s") or ent.text.endswith("’s"): # 's, ’s로 끝나는 경우
            yield Span(doc, ent.start, ent.end-1, label=ent.label) # 끝에 있는 's, ’s를 제거
        else:
            yield ent

# Append the component to the spaCy pipeline.
# The membership check keeps this cell re-runnable: add_pipe raises when a
# component with the same name is already part of the pipeline.
if "no_possesive" not in nlp.pipe_names:
    nlp.add_pipe("no_possesive")

<function __main__.no_possesive(doc)>

In [5]:
def remove_words_from_span(span, indexes):
    """
    Return a new spaCy document built from ``span`` minus the tokens at ``indexes``.

    The texts of the remaining tokens are joined with single spaces and the
    resulting string is run through the module-level ``nlp`` pipeline.
    """
    kept_tokens = np.delete(np.array(span), indexes).tolist()  # drop the tokens at the given positions
    remaining_text = " ".join(token.text for token in kept_tokens)
    return nlp(remaining_text)

    
def extract_dots_pos_nlp(nlp_words, search_pattern=r"\."):
    """
    Return the positions of the tokens whose text matches ``search_pattern``.

    With the default pattern this finds tokens containing a period (e.g.
    "St.", "Mr."), which the caller removes because they are hard to match as
    part of an entity.

    Parameters
    ----------
    nlp_words : iterable of objects exposing a ``text`` attribute
        Tokens to inspect, in order.
    search_pattern : str
        Regular expression searched anywhere inside each token's text.
        The default is a raw string so the backslash is not treated as an
        (invalid) string escape.

    Returns
    -------
    list of int
        Zero-based indexes of the matching tokens, in ascending order.
    """
    pattern = re.compile(search_pattern)
    return [i for i, word in enumerate(nlp_words) if pattern.search(word.text)]

def merge_phrases(matches, doc_len, trailing_pad=1):
    """
    Merge overlapping or touching matches into contiguous token ranges.

    Parameters
    ----------
    matches : iterable of (match_id, start, end)
        Match tuples; the caller in this file passes the result of a
        ``PhraseMatcher`` call.
    doc_len : int
        Number of tokens in the document the matches refer to.
    trailing_pad : int, default 1
        Number of positions added to each match's ``end`` when marking tokens
        (``start`` .. ``end + trailing_pad - 1`` are marked). The default keeps
        the historical behaviour.
        NOTE(review): spaCy documents ``end`` as exclusive, so the default of 1
        marks one token *after* each matched phrase; ``trailing_pad=0`` marks
        exactly the matched tokens. Before changing the default, check
        ``replace_entities``, which treats the returned end index as inclusive.

    Returns
    -------
    list of [int, int, int]
        ``[run_number, first_token, last_token]`` for every run of consecutive
        marked tokens (``last_token`` inclusive). An empty list is returned
        when nothing is marked.
    """
    covered = np.zeros(doc_len, dtype=bool)  # one flag per token
    for _match_id, start, end in matches:
        covered[start:end + trailing_pad] = True  # slicing clips at the document end

    marked = np.flatnonzero(covered)
    if marked.size == 0:
        # No matches (or only empty ones): nothing to merge.
        return []

    # Split the marked indexes wherever two neighbours are not consecutive.
    run_breaks = np.flatnonzero(np.diff(marked) != 1) + 1
    runs = np.split(marked, run_breaks)
    return [[run_no, int(run[0]), int(run[-1])] for run_no, run in enumerate(runs)]

def extract_named_entities(doc, entity_types=["PERSON", "ORG", "GPE"]):
    """
    Extract the distinct named entities of the requested types from ``doc``.

    Parameters
    ----------
    doc : spaCy document
        Its ``ents`` are inspected.
    entity_types : sequence of str
        Entity labels to keep. (Previously this argument was ignored and the
        default list was always used.)

    Returns
    -------
    list
        One tokenizer-only document (``nlp.make_doc``) per distinct entity
        text, ordered as ``np.unique`` sorts the texts.
    """
    wanted_labels = set(entity_types)
    texts = [ent.text for ent in doc.ents if ent.label_ in wanted_labels]
    unique_texts = np.unique(texts).tolist()  # de-duplicate (and sort) the entity texts
    return [nlp.make_doc(text) for text in unique_texts]

In [6]:
# constants
titles = ["first_party", "second_party"]

def get_matched_entities(fact, first, second, verbose=False):
    """
    Find where each party is mentioned in ``fact``.

    Steps:
    1. Extract the named entities of ``fact`` (``extract_named_entities``).
    2. Build candidate phrases for each party name: the name without
       "et al.", the name without period-bearing tokens, and every leading
       and trailing token slice of the latter.
    3. Keep the entities whose similarity to any candidate phrase exceeds a
       length-dependent threshold.
    4. Locate the kept entities in ``fact`` with a ``PhraseMatcher`` and merge
       the hits with ``merge_phrases``.

    Parameters
    ----------
    fact : spaCy document
    first, second : str
        Names of the first and second party.
    verbose : bool
        Print intermediate results.

    Returns
    -------
    dict
        ``{"first_party": spans_or_None, "second_party": spans_or_None}`` where
        ``spans`` is the output of ``merge_phrases`` and ``None`` means no
        mention was found.
    """
    doc = fact  # spaCy document

    # 1. Named entities of the document.
    entities = extract_named_entities(doc)
    if verbose:
        print(f"entities:\n{entities}")

    # 2. Candidate phrases per party.
    terms_segmented_list = {t: [] for t in titles}
    for title, term in zip(titles, [first, second]):
        candidates = terms_segmented_list[title]

        new_term = term.replace("et al.", "").replace("et al", "").strip()  # drop "et al"
        new_term = nlp.make_doc(new_term)
        candidates.append(new_term)

        pos_list = extract_dots_pos_nlp(new_term)  # positions of tokens containing a period
        new_term = remove_words_from_span(new_term, pos_list)  # drop "St.", "Mr.", ...
        candidates.append(new_term)

        # Every leading and trailing slice of the cleaned name.
        for i in range(1, len(new_term)):
            candidates.append(new_term[:i])
            candidates.append(new_term[-i:])

        # A name can shrink to nothing (e.g. only period-bearing tokens). An
        # empty phrase carries no information for the similarity check, so it
        # is dropped instead of being compared.
        terms_segmented_list[title] = [c for c in candidates if len(c) > 0]
    if verbose:
        for title in titles:
            print(f"phrases({title}): {terms_segmented_list[title]}")

    # 3. Filter entities by similarity. The threshold depends on phrase length:
    #    single-token phrases are compared strictly, longer ones more loosely.
    thresholds = [0.95, 0.8, 0.6]
    targets = {t: [] for t in titles}

    for cur_ent in entities:
        for title, terms in terms_segmented_list.items():
            for term in terms:
                if len(term) == 1:
                    thres = thresholds[0]
                elif len(term) < 4:
                    thres = thresholds[1]
                else:
                    thres = thresholds[2]
                sim = cur_ent.similarity(term)
                if verbose:
                    print(f"\t{cur_ent} vs {term} : {sim}")
                if sim > thres:
                    # One matching phrase is enough for this party; the entity
                    # is still checked against the other party.
                    targets[title].append(cur_ent)
                    break
    if verbose:
        for title in titles:
            print(f"filtered({title}): {targets[title]}")

    # 4. Locate the kept entities inside the document.
    result = {t: None for t in titles}
    for title, data in targets.items():
        # Without patterns there can be no match, so the matcher is not built.
        matches = []
        if data:
            matcher = PhraseMatcher(nlp.vocab)
            matcher.add("TerminologyList", data)
            matches = matcher(doc)
        if len(matches) == 0:
            if verbose:
                print(f"{title} no match found..")
            continue
        result[title] = merge_phrases(matches, len(doc))

    return result

In [7]:

def replace_entities(fact_nlp, matches_dict:dict, replaces={"first_party": "[FIRST_PARTY] ", "second_party": "[SECOND_PARTY] "}):
    """
    Replace the matched token ranges of ``fact_nlp`` with party placeholders.

    Parameters
    ----------
    fact_nlp : sequence of tokens
        Each token must expose ``text_with_ws``.
    matches_dict : dict
        ``{"first_party": spans, "second_party": spans}`` where ``spans`` is
        ``None`` (or empty) or a list of ``[id, start, end]`` with ``end``
        inclusive.
    replaces : dict
        Placeholder string per party. It is inserted once per span, in place
        of all tokens of that span.

    Returns
    -------
    str
        The rebuilt text, stripped of leading/trailing whitespace.

    Notes
    -----
    Each token is first assigned to at most one span, then the text is emitted
    in a single pass. Compared with a cursor-per-party walk this also handles
    one-token spans (``start == end``) followed by further spans of the same
    party, and spans of the two parties that overlap. On overlapping tokens
    the first party takes precedence.
    """
    total_len = len(fact_nlp)

    # span_of[i] is (party, span_number) for a covered token, otherwise None.
    span_of = [None] * total_len
    # The second party is written first so the first party overrides it on overlap.
    for party in ("second_party", "first_party"):
        for span_no, (_, start, end) in enumerate(matches_dict[party] or []):
            first_tok = max(int(start), 0)
            last_tok = min(int(end), total_len - 1)  # clip spans that run past the document
            for tok_idx in range(first_tok, last_tok + 1):
                span_of[tok_idx] = (party, span_no)

    new_fact = []
    previous = None
    for idx in range(total_len):
        current = span_of[idx]
        if current is None:
            new_fact.append(fact_nlp[idx].text_with_ws)
        elif current != previous:
            # First token of a span: emit the placeholder once; the span's
            # remaining tokens are skipped.
            new_fact.append(replaces[current[0]])
        previous = current

    return "".join(new_fact).strip()

In [8]:
# Final wrapper function, applied to every pandas row.

replaces={"first_party": "[FIRST_PARTY] ", "second_party": "[SECOND_PARTY] "}

def df_row_process(row):
    """Anonymise the parties of one row.

    Reads ``first_party``, ``second_party`` and ``facts_nlp`` from ``row`` and
    returns ``(new_fact, new_first, new_second)``: the fact text with party
    mentions replaced, and for each party either its placeholder (when a
    mention was found) or the original name (when none was found).
    """
    original_names = {
        "first_party": row["first_party"],
        "second_party": row["second_party"],
    }
    doc = row["facts_nlp"]

    match_result = get_matched_entities(
        doc, original_names["first_party"], original_names["second_party"], verbose=False
    )

    def label_for(party):
        # Placeholder (without its trailing space) if the party was located, else the given name.
        if match_result[party] is None:
            return original_names[party]
        return replaces[party].strip()

    new_first = label_for("first_party")
    new_second = label_for("second_party")
    new_fact = replace_entities(doc, match_result, replaces)

    return new_fact, new_first, new_second

In [9]:
# Convert every fact into a spaCy document.
# nlp.pipe processes the texts as a stream in batches, which spaCy recommends
# over calling nlp() once per text.
train_df["facts_nlp"] = list(nlp.pipe(train_df["facts"]))

# NOTE: this overwrites first_party / second_party, which df_row_process reads.
# The cell is therefore not idempotent: reload train_df before running it again,
# otherwise the placeholders are fed back in as party names.
train_df[["new_facts", "first_party", "second_party"]] = train_df.apply(df_row_process, axis=1, result_type="expand")

  sim = cur_ent.similarity(term) # 두 단어의 유사도 계산


In [10]:
# Show the processed frame as the cell's output.
train_df

Unnamed: 0,first_party,second_party,facts,first_party_winner,facts_nlp,new_facts
0,[FIRST_PARTY],[SECOND_PARTY],"On June 27, 1962, Phil St. Amant, a candidate ...",1,"(On, June, 27, ,, 1962, ,, Phil, St., Amant, ,...","On June 27, 1962, [FIRST_PARTY] a candidate fo..."
1,Stephen Duncan,[SECOND_PARTY],Ramon Nelson was riding his bike when he suffe...,0,"(Ramon, Nelson, was, riding, his, bike, when, ...",Ramon Nelson was riding his bike when he suffe...
2,[FIRST_PARTY],"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1,"(An, Alabama, state, court, convicted, Billy, ...",An Alabama state court convicted [FIRST_PARTY]...
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0,"(Victor, Linkletter, was, convicted, in, state...",Victor Linkletter was convicted in state court...
4,[FIRST_PARTY],[SECOND_PARTY],"On April 24, 1953 in Selma, Alabama, an intrud...",1,"(On, April, 24, ,, 1953, in, Selma, ,, Alabama...","On April 24, 1953 in Selma, [SECOND_PARTY] an ..."
...,...,...,...,...,...,...
2473,"HollyFrontier Cheyenne Refining, LLC, et al.",[SECOND_PARTY],Congress amended the Clean Air Act through the...,1,"(Congress, amended, the, Clean, Air, Act, thro...",Congress amended the Clean Air Act through the...
2474,[FIRST_PARTY],[SECOND_PARTY],"Alliance Bond Fund, Inc., an investment fund, ...",1,"(Alliance, Bond, Fund, ,, Inc., ,, an, investm...","[SECOND_PARTY] an investment fund, purchased a..."
2475,[FIRST_PARTY],United States,"In 1992, the District Court sentenced Manuel D...",0,"(In, 1992, ,, the, District, Court, sentenced,...","In 1992, the District Court sentenced Manuel D..."
2476,[FIRST_PARTY],[SECOND_PARTY],"On March 8, 1996, Enrico St. Cyr, a lawful per...",0,"(On, March, 8, ,, 1996, ,, Enrico, St., Cyr, ,...","On March 8, 1996, [SECOND_PARTY] a lawful perm..."


In [15]:
# Re-read the raw training CSV: train_df's first_party / second_party columns
# were overwritten by the processing cell, so the original names are taken from here.
train_df_1 = pd.read_csv('../Data/train.csv').drop(columns='ID')


In [18]:
# Spot-check one row: original party names, the raw fact and the rewritten fact.
idx = 1234
print(
    train_df_1['first_party'][idx],
    train_df_1['second_party'][idx],
    train_df['facts'][idx],
    train_df['new_facts'][idx],
    sep="\n",
)

Carol Anne Bond
United States
Carol Anne Bond was found guilty of trying to poison her husband's mistress, Myrlinda Haynes, with toxic chemicals at least 24 times over the course of several months. A grand jury in the Eastern District of Pennsylvania charged Bond with two counts of possessing and using a chemical weapon, in violation of a criminal statute implementing the treaty obligations of the United States under the 1993 Chemical Weapons Convention. The grand jury also charged Bond with two counts of mail theft. Bond's attorneys argue that the statute was intended to deal with rogue states and terrorists and that their client should have been prosecuted under state law instead. Bond, a laboratory technician, stole the chemical potassium dichromate from the company where she worked. Haynes was not injured. Bond's husband had a child with Haynes while married to Bond. Haynes had contacted police and postal authorities after finding the chemicals at her home. In September 2009, the U