# Full Flow
0. *EDA*
1. *파생변수 생성*  


---

-   수치형 변수 (sen\_len, word\_len, winning\_percent)
-   범주형 변수 (first\_party, second\_party, issued\_area)

---


**2\. 데이터 전처리**

**3\. 모델 적합**  


# 라이브러리 및 데이터 불러오기

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Catboost 모델링
from catboost import CatBoostClassifier, Pool

In [3]:
import nltk # 문장 토크나이저
nltk.download('all')

from nltk.corpus import names #corpus=말뭉치,이름 관련 부분 다루기 위한 객체

from nltk.corpus import stopwords # 영어 불용어 - 불용어 모아 놓은 리스트 다운로드해 제거

from nltk.tokenize import word_tokenize # 토큰화
from nltk.stem.porter import PorterStemmer # 어근 동일화 <-> 이거 말고도 "Lancaster Stemmer"

# 표제어 추출
from nltk.stem import WordNetLemmatizer

# 정규표현 처리
import re

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

In [4]:
# cleaning 함수 적용
# https://dacon.io/competitions/official/236112/codeshare/8463?page=1&dtype=recent
# https://gist.github.com/nealrs/96342d8231b75cf4bb82
contractions = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

In [5]:
# 영어 데이터 전처리 함수
stops = set(stopwords.words('english'))
ps = nltk.stem.porter.PorterStemmer()
all_names=set(names.words())
lem = nltk.stem.wordnet.WordNetLemmatizer()

def cleaning(str):
    replaceAll = str

    # 특수문자 및 기호 등 필요없는 문자 제거
    words = replaceAll.split()
    only_english = ''
    for word in words:
      if word in ['<p1>','<p2>']:
        only_english = only_english + word + ' '
      else:
        only_english += re.sub(r"[^a-zA-Z]", ' ', word)
        only_english += ' '  # 띄어쓰기 추가
    only_english = only_english.strip()
    ## 쌍따옴표 제거
    only_english = re.sub('"', ' ', only_english)
    ## 영문 내의 문자 변환 제거
    only_english = ' '.join([contractions[t] if t in contractions else t for t in only_english.split(" ")]) #약어 정규화
    only_english = re.sub(r"'s\b"," ",only_english) #소유격 제거


    # 대소문자 모두 소문자로 통일
    no_capitals = only_english.lower().split()

    # 이름, 불용어(분석에 필요없는 토큰) 제거
    all_names = set(names.words())
    no_stops = [word for word in no_capitals if not word in all_names|stops]

    # 표제어 : 단어의 원형 형태를 나타내며, 명사의 경우 복수형이나 동사의 경우 시제 등을 고려하여 변환
    lem_text = [lem.lemmatize(word, pos='v') for word in no_stops]

    # 어근 추츨 : 단어의 형태를 보존하는 특징이 있지만 추출된 어근이 실제로는 사전에 존재하지 않은 단어일 수 있음
    stemmer_words = [ps.stem(word) for word in lem_text]

    # back to string from list
    text = " ".join(stemmer_words)

    return text

In [6]:
# 구글 드라이브에서 데이터 불러오기
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
import zipfile

# ZIP 파일 열기
with zipfile.ZipFile('/content/gdrive/MyDrive/0000/dacon_lawwinner/open.zip', 'r') as zip_ref:
    # 파일 목록 가져오기
    file_list = zip_ref.namelist()

    # 파일 목록 출력
    for file in file_list:
        print(file)

    # 모든 파일 압축 해제
    zip_ref.extractall()

sample_submission.csv
test.csv
train.csv


In [47]:
# 전처리 하지 않은 raw 데이터셋
train = pd.read_csv("train.csv")
test = pd.read_csv("/content/test.csv")
sample_submission = pd.read_csv("/content/sample_submission.csv")

# keywords_list 데이터
keywords_list = pd.read_csv('/content/gdrive/MyDrive/0000/dacon_lawwinner/keywords_list.csv')

# 파생변수 생성

### sentence / word Len

##### train

In [48]:
train['sen_len'] = 0 # 문장 토큰화
train['word_len'] = 0 # 단어 토큰화

for i in range(len(train)):
  train.sen_len[i] = len(nltk.sent_tokenize(train.facts[i]))
  train.word_len[i] = len(word_tokenize(train.facts[i]))

train = train[['ID', 'first_party', 'second_party', 'facts', 'sen_len', 'word_len', 'first_party_winner']]
train.head(10)

Unnamed: 0,ID,first_party,second_party,facts,sen_len,word_len,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",7,201,1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,7,219,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,8,191,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,3,59,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",9,200,1
5,TRAIN_0005,"C & A Carbone, Inc., et al.",Town of Clarkstown,"A New York town, Clarkstown, allowed a contrac...",11,210,1
6,TRAIN_0006,"David Jennings, et al.","Alejandro Rodriguez, et al.",Sections of the Immigration and Nationality Ac...,7,264,1
7,TRAIN_0007,"US Airways, Inc.",Barnett,"In 1990, Robert Barnett injured his back while...",6,205,1
8,TRAIN_0008,"Ron Davis, Acting Warden",Hector Ayala,"Hector Ayala, a Hispanic man, was charged with...",12,478,1
9,TRAIN_0009,Paul A. McDaniel,"Selma Cash Paty, et al.","Since its first state Constitution in 1796, Te...",7,144,1


##### test

In [49]:
test['sen_len'] = 0 # 문장 토큰화
test['word_len'] = 0 # 단어 토큰화

for i in range(len(test)):
  test.sen_len[i] = len(nltk.sent_tokenize(test.facts[i]))
  test.word_len[i] = len(word_tokenize(test.facts[i]))

test = test[['ID', 'first_party', 'second_party', 'facts', 'sen_len', 'word_len']]
test.head(10)

Unnamed: 0,ID,first_party,second_party,facts,sen_len,word_len
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...,2,55
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...,7,209
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa...",7,181
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...,6,99
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a...",6,154
5,TEST_0005,Air Line Pilots Association,Miller,"The Air Line Pilots Association (ALPA), a priv...",9,163
6,TEST_0006,Traffic Stream (BVI) Infrastructure Ltd.,JPMorgan Chase Bank,Traffic Stream (BVI) Infrastructure Ltd. is a ...,5,177
7,TEST_0007,NRG Power Marketing LLC et al.,Maine Public Utilities Commission et al.,The Maine Public Utilities Commission along wi...,7,194
8,TEST_0008,United States,Deondery Chambers,Deondery Chambers pled guilty to being a felon...,8,243
9,TEST_0009,United States Catholic Conference,"Abortion Rights Mobilization, Inc.","Abortion Rights Mobilization, Inc. and a colle...",6,225


### issued_area

In [50]:
keywords_list = keywords_list.values.tolist()
train['keywords_list'] = keywords_list

#### 판결 유형 대분류표 생성

In [51]:
# 영어 데이터 전처리 함수
stops = set(stopwords.words('english'))
ps = nltk.stem.porter.PorterStemmer()
all_names=set(names.words())
lem = nltk.stem.wordnet.WordNetLemmatizer()

def cleaning(str):
    replaceAll = str

    # 특수문자 및 기호 등 필요없는 문자 제거
    words = replaceAll.split()
    only_english = ''
    for word in words:
      if word in ['<p1>','<p2>']:
        only_english = only_english + word + ' '
      else:
        only_english += re.sub(r"[^a-zA-Z]", ' ', word)
        only_english += ' '  # 띄어쓰기 추가
    only_english = only_english.strip()

    # 대소문자 모두 소문자로 통일
    no_capitals = only_english.lower().split()

    # 이름, 불용어(분석에 필요없는 토큰) 제거
    all_names=set(names.words())
    no_stops = [word for word in no_capitals if not word in all_names|stops]

    # 표제어 : 단어의 원형 형태를 나타내며, 명사의 경우 복수형이나 동사의 경우 시제 등을 고려하여 변환
    lem_text = [lem.lemmatize(word, pos='v') for word in no_stops]

    # 어근 추츨 : 단어의 형태를 보존하는 특징이 있지만 추출된 어근이 실제로는 사전에 존재하지 않은 단어일 수 있음
    stemmer_words = [ps.stem(word) for word in lem_text]

    # back to string from list
    text = " ".join(stemmer_words)

    return text

In [52]:
## 형사
criminal = ['murder', 'robbery', 'rape', 'theft','larceny', 'criminal','assault',
'drug', 'traffic', 'spy', 'espionage', 'marijuana', 'rape', 'burglarious']

## 민사
civil = ['lawsuits', 'contract violations',
'divorce', 'child custody', 'inheritance',
'labor unions' , 'wage claims', 'terminations',
'defamation','Slander', 'libel', 'reputation','bankruptcy', 'fraud','kidnap'
]

## 헌법
constitutional = ['Constitutional Petitions',
'constitutional disputes',
'individual constitutional rights']

df = pd.DataFrame({
    'main_words':criminal + civil + constitutional,
    'category' : len(criminal)*["criminal"]+len(civil)*["civil"]+len(constitutional)*["constitutional"]
})

df['main_procssing']= df['main_words'].apply(cleaning)
df

Unnamed: 0,main_words,category,main_procssing
0,murder,criminal,murder
1,robbery,criminal,robberi
2,rape,criminal,rape
3,theft,criminal,theft
4,larceny,criminal,larceni
5,criminal,criminal,crimin
6,assault,criminal,assault
7,drug,criminal,drug
8,traffic,criminal,traffic
9,spy,criminal,spi


#### train : 최종 category 변수 생성

In [53]:
def finding_cateory(lst):
    k = 0
    for item in lst:
      if item in df.main_procssing.values:
        k = df.loc[df['main_procssing'] == item, 'category'].values[0]
      # 값이 저장되었다면 for 구문 중단
      if k != 0:
        break
    # k 값이 없다면 기타
    if k == 0:
      k = 'others'
    return k

In [54]:
train['category'] = 0

for i in range(len(train)):
  train['category'][i] = finding_cateory(keywords_list[i])

In [55]:
train = train[['ID', 'first_party', 'second_party', 'facts','category', 'sen_len', 'word_len', 'first_party_winner'  ]]
train = train.rename(columns={'category':'issued_area'})
train

Unnamed: 0,ID,first_party,second_party,facts,issued_area,sen_len,word_len,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",others,7,201,1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,criminal,7,219,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,criminal,8,191,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,others,3,59,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",others,9,200,1
...,...,...,...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,others,5,144,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",others,7,184,1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",criminal,6,195,0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",others,8,194,0


#### test

In [56]:
test['category'] = 0

for i in range(len(test)):
  test['category'][i] = finding_cateory(keywords_list[i])

In [57]:
test = test[['ID', 'first_party', 'second_party', 'facts','category', 'sen_len', 'word_len']]
test = test.rename(columns={'category':'issued_area'})
test

Unnamed: 0,ID,first_party,second_party,facts,issued_area,sen_len,word_len
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...,others,2,55
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...,criminal,7,209
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa...",criminal,7,181
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...,others,6,99
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a...",others,6,154
...,...,...,...,...,...,...,...
1235,TEST_1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...,criminal,5,156
1236,TEST_1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...,criminal,7,221
1237,TEST_1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...,others,12,236
1238,TEST_1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu...",others,3,84


### winning_percent
**first_party_win_percent**
  첫 번째 당사자가 승소할 확률.
- 출현 빈도가 2이상일때만 반영함. 1일때는 0.5
- test에서는 first_party가 train의 first_party와 겹치는 경우에는 해당 사람의 승소할 확률을 반영함. 겹치지 않는 경우는 0.5로 반영함.

##### train

In [58]:
# winner: 해당 사건에서 승소한 사람
# win_percentage: 승소한 사람이 다른 사건에서도 승소할 확률

train['winner']=0
for i in range(2478):
  if train['first_party_winner'][i]==1:
    train['winner'][i] = train['first_party'][i]
  else:
    train['winner'][i] = train['second_party'][i]

train['win_percentage']=0
for i in range(2478):
  winner_exp = len(train.loc[train['first_party'] == train['winner'][i]])
  loose_exp = len(train.loc[train['second_party'] == train['winner'][i]])
  train['win_percentage'][i]= len(train.loc[train['winner'] == train['winner'][i]]) /(winner_exp + loose_exp)

In [59]:
# first_party_frequency: first_party가 나타난 빈도
# first_party_win_percent: first_party가 승소할 확률
train['first_party_win_percent'] = 0
train['first_party_frequency'] = 0

for i in range(2478):
  train['first_party_frequency'][i] = len(train.loc[train['first_party'] == train['first_party'][i]]) + len(train.loc[train['second_party'] == train['first_party'][i]])

  # 출현 빈도가 2이상일때만 반영함. 1일때는 0.5
  if train['first_party_frequency'][i] > 1:
    p = len(train.loc[train['winner'] == train['first_party'][i]]) / train['first_party_frequency'][i]
    train['first_party_win_percent'][i] = p
  else:
    train['first_party_win_percent'][i] = 0.5

##### test

In [60]:
# train 기준 컬럼 생성
train_first_party = train[['first_party','first_party_win_percent']]
train_first_party = train_first_party.drop_duplicates()
train_first_party.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
2470    False
2473    False
2474    False
2475    False
2477    False
Length: 2110, dtype: bool

In [61]:
# 병합
test_percent = pd.merge(test.first_party, train_first_party, how='left')
test_percent = test_percent.fillna(0.5)

# test에 새로운 컬럼 추가
test['first_party_win_percent'] = test_percent['first_party_win_percent']

##### category

In [62]:
def categorize_probability(x):
    if x == 0:
        return 0
    elif 0 < x <= 0.33:
        return 1
    elif 0.33 < x < 0.5:
        return 2
    elif x == 0.5:
        return 3
    elif 0.5 < x < 0.57:
        return 2
    elif 0.57 <= x < 0.75:
        return 4
    elif 0.75 <= x < 1:
        return 5
    elif x == 1:
        return 6
    else:
        return -1

In [63]:
def categorize_probability2(x):
    if 0 <= x < 0.5:
        return 1
    elif x == 0.5:
        return 2
    elif 0.5 < x <= 1:
        return 3

In [64]:
# 함수적용
train['winning_percent'] = train['first_party_win_percent'].apply(categorize_probability2)
test['winning_percent'] = test['first_party_win_percent'].apply(categorize_probability2)

### party 분류
Defining Entity type for each party
- https://github.com/smitp415/CSCI_544_Final_Project

In [65]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [66]:
def data_distribution_graph(df, cols):
    for feature in cols:
        fig, ax = plt.subplots()
        percent = df.groupby(feature).size() / df[feature].count() * 100
        count = df.groupby(feature).size()

        if feature == 'facts_len':
          count = pd.cut(df['facts_len'], [1,50,500,1000,1500,3000,7000], include_lowest=True)
          percent = df.groupby(count).size() / df[feature].count() * 100
          count = df.groupby(count).size()
          ax = percent.plot(kind='barh', figsize=(10,10), xticks = range(0,101,5), fontsize=12)
        elif feature == 'term':
          ax = percent.plot(kind='barh', figsize=(10,20), xticks = range(0,101,5), fontsize=12)
        else:
          ax = percent.plot(kind='barh', figsize=(10,10), xticks = range(0,101,5), fontsize=12)

        # print count and percentage on grapp
        for i, v in enumerate(zip(percent.values, count.values)):
            percent = '{v}%'.format(v = round(v[0],2))
            ax.text(v[0] + 2, i - .25, percent, color='black', fontweight='bold', fontsize=12)
            ax.text(v[0] + 15, i - .25, str(v[1]), color='blue', fontweight='bold', fontsize=12)

        ax.set_title('Data Distribution - {feature}'.format(feature= feature), fontsize=20)
        ax.set_xlabel("Percentage", fontsize=20)
        ax.set_ylabel(feature, fontsize=20)

#### train

In [67]:
for idx, value in train.iterrows():
  first = value['first_party']
  second = value['second_party']
  doc = nlp(first)
  for ent in doc.ents:
    train.loc[idx, 'first_party_ner'] = ent.label_
  doc = nlp(second)
  for ent in doc.ents:
    train.loc[idx, 'second_party_ner'] = ent.label_

#### test

In [68]:
for idx, value in test.iterrows():
  first = value['first_party']
  second = value['second_party']
  doc = nlp(first)
  for ent in doc.ents:
    test.loc[idx, 'first_party_ner'] = ent.label_
  doc = nlp(second)
  for ent in doc.ents:
    test.loc[idx, 'second_party_ner'] = ent.label_

#### 카테고리 통합

person / org / gpe / others ( 나머지 카테고리 포함 + 결측값 )

In [69]:
train.first_party_ner.value_counts(), train.second_party_ner.value_counts(),test.first_party_ner.value_counts(), test.second_party_ner.value_counts()

(PERSON         851
 ORG            759
 GPE            498
 NORP            11
 CARDINAL         9
 LOC              4
 PRODUCT          3
 LAW              2
 MONEY            2
 WORK_OF_ART      2
 LANGUAGE         1
 DATE             1
 FAC              1
 Name: first_party_ner, dtype: int64,
 PERSON         790
 ORG            695
 GPE            658
 NORP            18
 LOC              7
 CARDINAL         6
 WORK_OF_ART      4
 FAC              2
 PRODUCT          2
 DATE             1
 LAW              1
 Name: second_party_ner, dtype: int64,
 PERSON      417
 ORG         366
 GPE         279
 NORP         10
 LAW           3
 CARDINAL      2
 LOC           2
 DATE          1
 Name: first_party_ner, dtype: int64,
 PERSON      417
 ORG         366
 GPE         279
 NORP         10
 LAW           3
 LOC           2
 CARDINAL      2
 DATE          1
 Name: second_party_ner, dtype: int64)

In [70]:
train.isnull().sum()

ID                           0
first_party                  0
second_party                 0
facts                        0
issued_area                  0
sen_len                      0
word_len                     0
first_party_winner           0
winner                       0
win_percentage               0
first_party_win_percent      0
first_party_frequency        0
winning_percent              0
first_party_ner            334
second_party_ner           294
dtype: int64

In [71]:
df33 = train[['ID', 'first_party', 'second_party' , 'first_party_ner',
       'second_party_ner', 'facts', 'issued_area',
       'sen_len', 'word_len', 'win_percentage','first_party_winner']]

df33

Unnamed: 0,ID,first_party,second_party,first_party_ner,second_party_ner,facts,issued_area,sen_len,word_len,win_percentage,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,PERSON,PERSON,"On June 27, 1962, Phil St. Amant, a candidate ...",others,7,201,1.000000,1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,PERSON,PERSON,Ramon Nelson was riding his bike when he suffe...,criminal,7,219,1.000000,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",PERSON,GPE,An Alabama state court convicted Billy Joe Mag...,criminal,8,191,1.000000,1
3,TRAIN_0003,Linkletter,Walker,,,Victor Linkletter was convicted in state court...,others,3,59,0.333333,0
4,TRAIN_0004,William Earl Fikes,Alabama,PERSON,GPE,"On April 24, 1953 in Selma, Alabama, an intrud...",others,9,200,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",ORG,ORG,Congress amended the Clean Air Act through the...,others,5,144,1.000000,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.",PERSON,ORG,"Alliance Bond Fund, Inc., an investment fund, ...",others,7,184,1.000000,1
2475,TRAIN_2475,Peguero,United States,ORG,GPE,"In 1992, the District Court sentenced Manuel D...",criminal,6,195,0.578680,0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,ORG,,"On March 8, 1996, Enrico St. Cyr, a lawful per...",others,8,194,1.000000,0


통합 !

In [72]:
my_list = ['NORP', 'CARDINAL', 'LOC', 'PRODUCT', 'LAW', 'MONEY',
           'WORK_OF_ART', 'LANGUAGE', 'DATE', 'FAC']

# 다른 카테고리 모두 others로 통합
## train
train.loc[train['first_party_ner'].isin(my_list), 'first_party_ner'] = 'others'
train.loc[train['second_party_ner'].isin(my_list), 'second_party_ner'] = 'others'

## test
test.loc[test['first_party_ner'].isin(my_list), 'first_party_ner'] = 'others'
test.loc[test['second_party_ner'].isin(my_list), 'second_party_ner'] = 'others'

In [73]:
# 결측값을 'others'로 치환
train.fillna('others', inplace=True)
test.fillna('others', inplace=True)

In [74]:
train.first_party_ner.value_counts(), train.second_party_ner.value_counts(),test.first_party_ner.value_counts(), test.second_party_ner.value_counts()

(PERSON    851
 ORG       759
 GPE       498
 others    370
 Name: first_party_ner, dtype: int64,
 PERSON    790
 ORG       695
 GPE       658
 others    335
 Name: second_party_ner, dtype: int64,
 PERSON    417
 ORG       366
 GPE       279
 others    178
 Name: first_party_ner, dtype: int64,
 PERSON    417
 ORG       366
 GPE       279
 others    178
 Name: second_party_ner, dtype: int64)

### party name 치환

In [75]:
# 이름 치환함수
def replace_name(name, text, replace_word):
    names = name.split('|')
    words = text.split()
    for word in words:
      for n in names:
        if n in word:
            text = text.replace(word, replace_word)

    return text

# 정규표현식으로 first_party 여러번 중복 연달아 나오는 거 한 번만 나오게 바꾸기
def remove_duplicates(text):
    modified_text1 = re.sub(r'(<p1>\s*)+', '<p1> ', text)
    modified_text2 = re.sub(r'(<p2>\s*)+', '<p2> ', modified_text1)

    return modified_text2

#### train

In [76]:
# 데이터 손실 방지를 위해 값 복사
train['first_party1'] = train['first_party'].str.strip()
train['second_party1'] = train['second_party'].str.strip()
train['facts1'] = train['facts']

# 이름 치환을 위한 base
train['first_party1'] = train['first_party1'].str.replace(" ", "|")
train['second_party1'] = train['second_party1'].str.replace(" ", "|")

In [77]:
# 함수 각각 적용
for i in range(len(train)):
  train['facts1'][i] = replace_name(train['first_party1'][i], train['facts1'][i], '<p1>')
  train['facts1'][i] = replace_name(train['second_party1'][i], train['facts1'][i], '<p2>')
  train['facts1'][i] = remove_duplicates(train['facts1'][i])

#### test

In [79]:
# 데이터 손실 방지를 위해 값 복사
test['first_party1'] = test['first_party'].str.strip()
test['second_party1'] = test['second_party'].str.strip()
test['facts1'] = test['facts']

# 이름 치환을 위한 base
test['first_party1'] = test['first_party1'].str.replace(" ", "|")
test['second_party1'] = test['second_party1'].str.replace(" ", "|")

In [80]:
# 함수 각각 적용
for i in range(len(test)):
  test['facts1'][i] = replace_name(test['first_party1'][i], test['facts1'][i], '<p1>')
  test['facts1'][i] = replace_name(test['second_party1'][i], test['facts1'][i], '<p2>')
  test['facts1'][i] = remove_duplicates(test['facts1'][i])

In [81]:
test

Unnamed: 0,ID,first_party,second_party,facts,issued_area,sen_len,word_len,first_party_win_percent,winning_percent,second_party_ner,first_party_ner,first_party1,second_party1,facts1
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...,others,2,55,0.5,2,GPE,others,Salerno,United|States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...,criminal,7,209,0.5,2,ORG,PERSON,Milberg|Weiss|Bershad|Hynes|and|Lerach,"Lexecon,|Inc.",Lexecon <p2> was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa...",criminal,7,181,0.5,2,ORG,ORG,No.|07-582\t|Title:|\t|Federal|Communications|...,"Fox|Television|Stations,|Inc.,|et|al.","In 2002 and 2003, <p2> Stations broadcast the ..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...,others,6,99,0.5,2,GPE,PERSON,Harold|Kaufman,United|States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a...",others,6,154,0.5,2,others,PERSON,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1235,TEST_1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...,criminal,5,156,0.5,2,ORG,ORG,"Haitian|Centers|Council,|Inc.,|et|al.","Chris|Sale,|Acting|Commissioner,|Immigration|A...",According to Executive Order No. 12807 signed ...
1236,TEST_1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...,criminal,7,221,0.5,2,ORG,others,Whitman,"American|Trucking|Associations,|Inc.",Section 109(a) of the Clean Air Act (CAA) requ...
1237,TEST_1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...,others,12,236,0.5,2,PERSON,PERSON,Linda|A.|Matteo|and|John|J.|Madigan,William|G.|Barr,<p1> created a plan for utilizing $2.6 million...
1238,TEST_1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu...",others,3,84,0.5,2,PERSON,ORG,Washington|State|Apple|Advertising|Commission,Hunt,"In 1972, the North Carolina Board of Agricultu..."


## df 확인

In [82]:
train_df = train[['ID',
                  'first_party_ner', 'second_party_ner',
                  'facts1',
                  'issued_area',
                 'sen_len', 'word_len',
                  'winning_percent',
                  'first_party_winner']]
test_df = test[['ID',
                  'first_party_ner', 'second_party_ner',
                  'facts1',
                  'issued_area',
                 'sen_len', 'word_len',
                  'winning_percent']]

train_df = train_df.rename(columns={"first_party_ner":"first_party",
                                    "second_party_ner":"second_party",
                                    'facts1':"fact",
                                    "first_party_win_percent":"winning_percent"
                                    })
test_df = test_df.rename(columns={"first_party_ner":"first_party",
                                    "second_party_ner":"second_party",
                                    'facts1':"fact",
                                    "first_party_win_percent":"winning_percent"
                                    })

# feature1, feature2를 범주형 변수로 변경
train_df['first_party'] = train_df['first_party'].astype('category')
train_df['second_party'] = train_df['second_party'].astype('category')
train_df['issued_area'] = train_df['issued_area'].astype('category')
train_df['winning_percent'] = train_df['winning_percent'].astype('category')

test_df['first_party'] = test_df['first_party'].astype('category')
test_df['second_party'] = test_df['second_party'].astype('category')
test_df['issued_area'] = test_df['issued_area'].astype('category')

In [89]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1240 entries, 0 to 1239
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   ID               1240 non-null   object  
 1   first_party      1240 non-null   category
 2   second_party     1240 non-null   category
 3   fact             1240 non-null   object  
 4   issued_area      1240 non-null   category
 5   sen_len          1240 non-null   int64   
 6   word_len         1240 non-null   int64   
 7   winning_percent  1240 non-null   int64   
dtypes: category(3), int64(3), object(2)
memory usage: 52.7+ KB


In [83]:
train_df

Unnamed: 0,ID,first_party,second_party,fact,issued_area,sen_len,word_len,winning_percent,first_party_winner
0,TRAIN_0000,PERSON,PERSON,"On June 27, 1962, <p1> a candidate for public ...",others,7,201,2,1
1,TRAIN_0001,PERSON,PERSON,Ramon Nelson was riding his bike when he suffe...,criminal,7,219,2,0
2,TRAIN_0002,PERSON,GPE,An Alabama state court convicted <p1> of murde...,criminal,8,191,2,1
3,TRAIN_0003,others,others,Victor <p1> was convicted in state court on ev...,others,3,59,2,0
4,TRAIN_0004,PERSON,GPE,"On April 24, 1953 in Selma, <p2> an intruder b...",others,9,200,2,1
...,...,...,...,...,...,...,...,...,...
2473,TRAIN_2473,ORG,ORG,Congress amended the Clean Air Act through the...,others,5,144,2,1
2474,TRAIN_2474,PERSON,ORG,"<p2> an investment fund, purchased approximate...",others,7,184,2,1
2475,TRAIN_2475,ORG,GPE,"In 1992, the District Court sentenced Manuel D...",criminal,6,195,2,0
2476,TRAIN_2476,ORG,others,"On March 8, 1996, Enrico <p2> a lawful permane...",others,8,194,3,0


In [84]:
test_df

Unnamed: 0,ID,first_party,second_party,fact,issued_area,sen_len,word_len,winning_percent
0,TEST_0000,others,GPE,The 1984 Bail Reform Act allowed the federal c...,others,2,55,2
1,TEST_0001,PERSON,ORG,Lexecon <p2> was a defendant in a class action...,criminal,7,209,2
2,TEST_0002,ORG,ORG,"In 2002 and 2003, <p2> Stations broadcast the ...",criminal,7,181,2
3,TEST_0003,PERSON,GPE,During his trial for armed robbery of a federa...,others,6,99,2
4,TEST_0004,PERSON,others,"In 1993, a magistrate judge issued a warrant a...",others,6,154,2
...,...,...,...,...,...,...,...,...
1235,TEST_1235,ORG,ORG,According to Executive Order No. 12807 signed ...,criminal,5,156,2
1236,TEST_1236,others,ORG,Section 109(a) of the Clean Air Act (CAA) requ...,criminal,7,221,2
1237,TEST_1237,PERSON,PERSON,<p1> created a plan for utilizing $2.6 million...,others,12,236,2
1238,TEST_1238,ORG,PERSON,"In 1972, the North Carolina Board of Agricultu...",others,3,84,2
