# Full Flow
0. EDA
1. 파생변수 생성
2. 데이터 전처리
   - 특수문자 및 기호 등 필요없는 문자 제거
   - 대소문자 모두 소문자로 통일
   - 이름 | 불용어(분석에 필요없는 토큰) 제거 => TF-IDF 적용시 해결 가능, 이름 제거하는게 낫나 아니면 살리는게 낫나
   - 어근 추출을 통한 텍스트 정규화 작업
   ----------------------------------
   - TF-IDF
   ---------------------------------
   - 벡터화 함수 적용
   ---------------------------------
   - 타겟 변수 불균형 해결
3. 모델 적합
  **CatBoost**
4. 하이퍼파라미터튜닝
  - optuna
5. 모델 앙상블

# CatBoost
Catboost는 이름에서도 유추 가능하듯이 Cat, Category 즉 범주형 변수가 많은 데이터를 학습할 때 성능이 좋음

https://julie-tech.tistory.com/119

# 라이브러리 및 데이터 불러오기

In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Catboost 모델링
from catboost import CatBoostClassifier, Pool

In [None]:
import nltk # 문장 토크나이저
nltk.download('all')

from nltk.corpus import names #corpus=말뭉치,이름 관련 부분 다루기 위한 객체

from nltk.corpus import stopwords # 영어 불용어 - 불용어 모아 놓은 리스트 다운로드해 제거

from nltk.tokenize import word_tokenize # 토큰화
from nltk.stem.porter import PorterStemmer # 어근 동일화 <-> 이거 말고도 "Lancaster Stemmer"

# 표제어 추출
from nltk.stem import WordNetLemmatizer

# 정규표현 처리
import re

In [None]:
# Load the data from Google Drive
from google.colab import drive
# Mount Drive under /content/gdrive; the archive is later read from a path below this mount point
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import zipfile

# Open the ZIP archive stored on the mounted Drive
with zipfile.ZipFile('/content/gdrive/MyDrive/0000/dacon_lawwinner/open.zip', 'r') as zip_ref:
    # Get the list of member names in the archive
    file_list = zip_ref.namelist()

    # Print every member name
    for file in file_list:
        print(file)

    # Extract all members; no target path is given, so they land in the current working directory
    zip_ref.extractall()

sample_submission.csv
test.csv
train.csv


In [None]:
# Load the competition tables.
# The archive is extracted into the current working directory, so all three
# files are read relative to it instead of mixing a relative path with
# hard-coded /content/ paths that only resolve when the cwd is /content.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# 파생변수 생성

### 문장, 단어 길이 컬럼

In [None]:
# Derived features for train: sentence count and word-token count of `facts`.
# Whole columns are assigned at once; element-wise `train.sen_len[i] = ...`
# is chained assignment, which pandas warns about and which does not update
# the frame when Copy-on-Write is enabled.
train['sen_len'] = train['facts'].apply(lambda text: len(nltk.sent_tokenize(text)))  # sentence tokenization
train['word_len'] = train['facts'].apply(lambda text: len(word_tokenize(text)))  # word tokenization

# Reorder the columns so the target `first_party_winner` comes last
train = train[['ID', 'first_party', 'second_party', 'facts', 'sen_len', 'word_len', 'first_party_winner']]
train.head(10)

Unnamed: 0,ID,first_party,second_party,facts,sen_len,word_len,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",7,201,1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,7,219,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,8,191,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,3,59,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",9,200,1
5,TRAIN_0005,"C & A Carbone, Inc., et al.",Town of Clarkstown,"A New York town, Clarkstown, allowed a contrac...",11,210,1
6,TRAIN_0006,"David Jennings, et al.","Alejandro Rodriguez, et al.",Sections of the Immigration and Nationality Ac...,7,264,1
7,TRAIN_0007,"US Airways, Inc.",Barnett,"In 1990, Robert Barnett injured his back while...",6,205,1
8,TRAIN_0008,"Ron Davis, Acting Warden",Hector Ayala,"Hector Ayala, a Hispanic man, was charged with...",12,478,1
9,TRAIN_0009,Paul A. McDaniel,"Selma Cash Paty, et al.","Since its first state Constitution in 1796, Te...",7,144,1


In [None]:
# Derived features for test: sentence count and word-token count of `facts`.
# Whole columns are assigned at once; element-wise `test.sen_len[i] = ...`
# is chained assignment, which pandas warns about and which does not update
# the frame when Copy-on-Write is enabled.
test['sen_len'] = test['facts'].apply(lambda text: len(nltk.sent_tokenize(text)))  # sentence tokenization
test['word_len'] = test['facts'].apply(lambda text: len(word_tokenize(text)))  # word tokenization

# Keep the same column order as train (test has no target column)
test = test[['ID', 'first_party', 'second_party', 'facts', 'sen_len', 'word_len']]
test.head(10)

Unnamed: 0,ID,first_party,second_party,facts,sen_len,word_len
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...,2,55
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...,7,209
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa...",7,181
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...,6,99
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a...",6,154
5,TEST_0005,Air Line Pilots Association,Miller,"The Air Line Pilots Association (ALPA), a priv...",9,163
6,TEST_0006,Traffic Stream (BVI) Infrastructure Ltd.,JPMorgan Chase Bank,Traffic Stream (BVI) Infrastructure Ltd. is a ...,5,177
7,TEST_0007,NRG Power Marketing LLC et al.,Maine Public Utilities Commission et al.,The Maine Public Utilities Commission along wi...,7,194
8,TEST_0008,United States,Deondery Chambers,Deondery Chambers pled guilty to being a felon...,8,243
9,TEST_0009,United States Catholic Conference,"Abortion Rights Mobilization, Inc.","Abortion Rights Mobilization, Inc. and a colle...",6,225


### issued_area

In [None]:
# Category classification: 28 kinds
# NOTE(review): this reads an external crime dataset, not the competition files — confirm it is still needed
df = pd.read_csv('https://s3.ap-northeast-2.amazonaws.com/data10902/messy/crime_clean.csv',encoding='utf-8')
# Show the distinct values of the 소분류 column
df.소분류.unique()

In [None]:
# Kaggle: San Francisco crime category classification
# https://www.kaggle.com/c/sf-crime
# https://www.kaggle.com/code/yannisp/sf-crime-analysis-prediction

- Larceny/Theft
- Non-Criminal
- Assault

### 승률 현황
**first_party_win_percent**
  첫 번째 당사자가 승소할 확률.
- 출현 빈도가 2이상일때만 반영함. 1일때는 0.5
- test에서는 first_party가 train의 first_party와 겹치는 경우에는 해당 사람의 승소할 확률을 반영함. 겹치지 않는 경우는 0.5로 반영함.

In [None]:
# winner: the party that won the case in each row
# win_percentage: share of that party's appearances (as first or second party)
#                 that it won
#
# Computed for every row of `train`. A hard-coded range(2477) silently skips
# any row past index 2476, and element-wise `train['winner'][i] = ...` is
# chained assignment; both are avoided by assigning whole columns.

# first_party where the first party won, otherwise second_party
train['winner'] = train['first_party'].where(train['first_party_winner'] == 1, train['second_party'])

# Number of rows in which each party appears, counting both sides
party_appearances = train['first_party'].value_counts().add(
    train['second_party'].value_counts(), fill_value=0)
# Number of rows each party won
party_wins = train['winner'].value_counts()

train['win_percentage'] = train['winner'].map(party_wins) / train['winner'].map(party_appearances)

In [None]:
# first_party_frequency: number of rows in which the first party appears
#                        (as first or second party)
# first_party_win_percent: share of those rows that the first party won
#
# Computed for every row of `train` with whole-column assignment, so no row is
# skipped by a hard-coded range and no float is written into an int column
# through chained assignment.
# NOTE(review): the ratio includes each row's own outcome — check for target leakage.

# Number of rows in which each party appears, counting both sides
party_appearances = train['first_party'].value_counts().add(
    train['second_party'].value_counts(), fill_value=0)

first_party_frequency = train['first_party'].map(party_appearances).fillna(0).astype(int)
# Parties that never won are absent from the winner counts, hence fillna(0)
first_party_wins = train['first_party'].map(train['winner'].value_counts()).fillna(0)

# Use the ratio only when the party appears at least twice; otherwise the neutral 0.5
train['first_party_win_percent'] = (first_party_wins / first_party_frequency).where(
    first_party_frequency > 1, 0.5)
train['first_party_frequency'] = first_party_frequency

In [None]:
# Remove the intermediate helper columns, then preview the last 30 rows
train = train.drop(columns=['winner', 'win_percentage'])
train.tail(30)

Unnamed: 0,ID,first_party,second_party,facts,sen_len,word_len,first_party_winner,first_party_win_percent,first_party_frequency
2448,TRAIN_2448,Pennsylvania,Delaware Valley Citizens' Council for Clean Air,"In April of 1973, pursuant to the Clean Air Ac...",13,394,1,0.625,8
2449,TRAIN_2449,WesternGeco LLC,ION Geophysical Corporation,The US Supreme Court issued a 2016 order grant...,9,254,1,0.5,1
2450,TRAIN_2450,Helling,McKinney,"William McKinney, a Nevada state prisoner, sue...",4,163,0,0.5,1
2451,TRAIN_2451,"Hana Financial, Inc.","Hana Bank, et al.","In the spring of 1994, Hana Bank, a Korean ent...",10,304,0,0.5,1
2452,TRAIN_2452,Police Department of the City of Chicago,Mosley,Chicago adopted an ordinance prohibiting picke...,3,54,0,0.5,1
2453,TRAIN_2453,City of Oklahoma City,"Rose Marie Tuttle, Individually and as Adminis...","On October 10, 1980, an Oklahoma City police o...",5,180,1,0.5,1
2454,TRAIN_2454,Rotella,Wood,Mark Rotella was admitted to a Brookhaven Psyc...,8,254,0,0.5,1
2455,TRAIN_2455,Gregory Houston Holt,"Ray Hobbs, Director, Arkansas Department of Co...",Gregory Holt (also known as Abdul Maalik Muham...,6,190,1,0.5,1
2456,TRAIN_2456,United States,Lanier,David W. Lanier was convicted under 18 U.S.C. ...,7,223,1,0.57868,394
2457,TRAIN_2457,Crosby,National Foreign Trade Council,"In 1996, the Massachusetts Burma Law, which re...",7,189,0,0.5,1


# Data Preprocessing

In [None]:
# English text preprocessing helpers
stops = set(stopwords.words('english'))
stemmer = nltk.stem.SnowballStemmer('english')
# Built once instead of on every call; entries keep the corpus' own casing
all_names = set(names.words())


def cleaning(str):
    """Normalise one English text into a space-joined string of stemmed tokens.

    Steps: replace every non-letter with a space, drop tokens found in the
    NLTK names corpus, lowercase, drop English stopwords, stem what is left.

    Args:
        str: Raw text. (The name shadows the builtin; it is kept so existing
            callers are unaffected.)

    Returns:
        The cleaned text as a single string.
    """
    # Remove special characters, digits and symbols
    only_english = re.sub('[^a-zA-Z]', ' ', str)

    # Drop personal names BEFORE lowercasing.
    # NOTE(review): the names corpus entries appear to be capitalised, so
    # comparing already-lowercased tokens against it never matched — confirm.
    no_names = [word for word in only_english.split() if word not in all_names]

    # Lowercase, then drop stopwords (`stops` was built but never applied)
    no_stops = [word for word in (token.lower() for token in no_names) if word not in stops]

    # Normalise the remaining tokens by stemming
    return ' '.join(stemmer.stem(word) for word in no_stops)

In [None]:
vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    """Build the dense TF-IDF feature matrix for a frame of cases.

    Args:
        vectorizer: TF-IDF vectorizer; fitted here when ``train_mode`` is true,
            otherwise it must already be fitted.
        df: Frame with ``facts``, ``first_party`` and ``second_party`` columns.
            A ``fact_processing`` column holding the cleaned facts is added to
            it in place.
        train_mode: If true, fit the vectorizer on the cleaned facts before
            transforming them.

    Returns:
        2-D ``np.ndarray`` with the first-party, second-party and facts
        TF-IDF blocks side by side, in that order.
    """
    # Local import so the block does not depend on the notebook's import cell
    from scipy import sparse

    # Clean the case descriptions
    df['fact_processing'] = df['facts'].apply(cleaning)

    # TF-IDF on the cleaned facts (vectorizer parameters are left at their defaults)
    if train_mode:
        X_facts = vectorizer.fit_transform(df['fact_processing'])
    else:
        X_facts = vectorizer.transform(df['fact_processing'])

    # Party names are projected onto the vocabulary learned from the facts.
    # NOTE(review): they are not passed through cleaning() — confirm this is intended.
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    # Stack while still sparse and densify once: this avoids three dense
    # intermediates plus a concatenated copy, and yields an ndarray rather
    # than np.matrix.
    return sparse.hstack([X_party1, X_party2, X_facts]).toarray()

In [None]:
# Apply the vectorization function
# train_mode=True: the vectorizer is fitted on the train facts here
X = get_vector(vectorizer, train, True)
Y = train["first_party_winner"]
X_train = np.asarray(X) # convert to a plain np.ndarray
Y_train = np.asarray(Y)

# train_mode=False: reuse the vectorizer fitted on train
X_test = get_vector(vectorizer, test, False)

# Define Model & Train

## self weight
전체 데이터에서 타겟 클래스 비율이 1:0 ≈ 67:33으로 불균형하다. 이를 완화하기 위해 클래스별 가중치를 계산하고,
소수 클래스(0)에 더 큰 가중치를 주어 학습에 반영한다.

+ 다른 방법으로 해결하고 싶다면
+ +) 타겟 변수 불균형 문제가 있어 클래스별 가중치를 적용해 모델 학습을 진행

In [None]:
# Class weights for the imbalanced target, in class-label order:
# class_weight[0] applies to label 0 and class_weight[1] to label 1.
# Counts are looked up by label rather than by value_counts() frequency order,
# and each class receives the OTHER class's share, so the rarer class gets the
# larger weight.
counts = Y.value_counts().reindex([0, 1], fill_value=0)
total = counts.sum()
class_weight = [float(counts.loc[1] / total), float(counts.loc[0] / total)]
print("weight :", class_weight)

weight : [0.33454398708635996, 0.66545601291364]


## 모델 적합

In [None]:
# Baseline CatBoost classifier: fixed seed, the class weights held in `class_weight`, training log silenced
model = CatBoostClassifier(random_seed=42,class_weights=class_weight, verbose=0)
# Fit on the dense TF-IDF features and the binary target
model.fit(X_train, Y_train)

<catboost.core.CatBoostClassifier at 0x7f7759f6bfa0>

## 파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for CatBoost.
# - `n_estimators` and `max_depth` are CatBoost aliases of `iterations` and `depth`.
# - `max_features` and `criterion` are not CatBoost parameters; with them in
#   the grid every fit failed and scored nan, so they are removed.
# - `300.400` was a typo for the two values 300 and 400.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [4, 5, 6, 7, 8],
}

# Same seed, class weights and silent logging as the baseline model.
# error_score='raise' surfaces an invalid configuration immediately instead of
# recording nan for every fold.
grid = GridSearchCV(
    CatBoostClassifier(random_seed=42, class_weights=class_weight, verbose=0),
    param_grid,
    refit=True,
    verbose=3,
    error_score='raise',
)

# Fit the model for every grid point (5-fold CV by default)
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=  43.7s
[CV 2/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=  44.9s
[CV 3/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=  38.6s
[CV 4/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=  40.3s
[CV 5/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=100;, score=nan total time=  40.8s
[CV 1/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=nan total time=  46.1s
[CV 2/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=nan total time=  39.4s
[CV 3/5] END criterion=gini, max_depth=4, max_features=auto, n_estimators=200;, score=nan total time=  40.9s
[CV 4/5] END criterion=gini, max_depth=4, max_features=auto, n_es

# Inference & Submission

In [None]:
# Load the submission template from the current working directory
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Convert the test features to a plain ndarray and predict with the fitted `model`
X_test = np.asarray(X_test)
Y_pred = model.predict(X_test)

In [None]:
# Write the predictions into the template and save it without the index column
submit['first_party_winner'] = Y_pred
submit.to_csv('./submit_catboost2.csv', index=False)
print('Done')

Done
