In [59]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [61]:
import pandas as pd
import numpy as np

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Catboost 모델링
from catboost import CatBoostClassifier, Pool

In [44]:
import nltk # 문장 토크나이저
#nltk.download('punkt')

# 영어 불용어 - 불용어 모아 놓은 리스트 다운로드해 제거
nltk.download('all')
# nltk.download('wordnet')
# nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize # 토큰화
from nltk.stem.porter import PorterStemmer # 어근 동일화 <-> 이거 말고도 "Lancaster Stemmer"

# 표제어 추출
from nltk.stem import WordNetLemmatizer

# 정규표현 처리
import re

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

In [45]:
# 구글 드라이브에서 데이터 불러오기
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [46]:
import zipfile

# ZIP 파일 열기
with zipfile.ZipFile('/content/gdrive/MyDrive/0000/dacon_lawwinner/open.zip', 'r') as zip_ref:
    # 파일 목록 가져오기
    file_list = zip_ref.namelist()

    # 파일 목록 출력
    for file in file_list:
        print(file)

    # 모든 파일 압축 해제
    zip_ref.extractall()

sample_submission.csv
test.csv
train.csv


In [47]:
train = pd.read_csv("train.csv")
test = pd.read_csv("/content/test.csv")
sample_submission = pd.read_csv("/content/sample_submission.csv")

# Data Preprocessing

In [48]:
# 영어 데이터 전처리 함수
stops = set(stopwords.words('english'))
stemmer = nltk.stem.SnowballStemmer('english')

def cleaning(str):
    replaceAll = str

    # 특수문자 및 기호 등 필요없는 문자 제거
    only_english = re.sub('[^a-zA-Z]', ' ', replaceAll)

    # 대소문자 모두 소문자로 통일
    no_capitals = only_english.lower().split()

    # 불용어(분석에 필요없는 토큰) 제거
#    no_stops = [word for word in no_capitals if not word in stops]

    # 어근 추츨을 통한 텍스트 정규화 작업
#    stemmer_words = [stemmer.stem(word) for word in no_stops]
    stemmer_words = [stemmer.stem(word) for word in no_capitals]
    return ' '.join(stemmer_words)

In [49]:
vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    # 데이터 클리닝
    df['fact_processing'] = df['facts'].apply(cleaning)

    # TF-IDF
    if train_mode:
        X_facts = vectorizer.fit_transform(df['fact_processing'])
    else:
        X_facts = vectorizer.transform(df['fact_processing'])

    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    X = np.concatenate([X_party1.todense(), X_party2.todense(), X_facts.todense()], axis=1)
    return X

In [50]:
# 벡터화 함수 적용
X = get_vector(vectorizer, train, True)
Y = train["first_party_winner"]
X_train = np.asarray(X) # np.array 활용 및 적용
Y_train = np.asarray(Y)


X_test = get_vector(vectorizer, test, False)

# Define Model & Train

## self weight
전체 데이터에서 1:0=66:33 인 비율로, 이를 해결하기 위해 클래스별 가중치를 계산해 이를 고려해 학습 시도.
따라서 y값 데이터의 불균형 해결

In [55]:
counts = list(Y.value_counts())
class_weight = [counts[1]/sum(counts), counts[0]/sum(counts)]
print("weight :", class_weight)

weight : [0.33454398708635996, 0.66545601291364]


## 모델 적합

In [62]:
model = CatBoostClassifier(random_seed=42,class_weights=class_weight, verbose=0)
model.fit(X_train, Y_train)

<catboost.core.CatBoostClassifier at 0x7f730e12bcd0>

# Inference & Submission

In [63]:
submit = pd.read_csv('./sample_submission.csv')

In [64]:
X_test = np.asarray(X_test)
Y_pred = model.predict(X_test)

In [65]:
submit['first_party_winner'] = Y_pred
submit.to_csv('./baseline_submit.csv', index=False)
print('Done')

Done
