In [1]:
import os
import random
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

In [2]:
# Fix Seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(1004)

In [3]:
# Load Data
train_df = pd.read_csv('../Data/train.csv').drop('ID', axis=1)
test_df = pd.read_csv('../Data/test.csv').drop('ID', axis=1)
submission_df = pd.read_csv('../Data/sample_submission.csv')

In [4]:
train_df.head(5)

Unnamed: 0,first_party,second_party,facts,first_party_winner
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [16]:
fp_sample = train_df['first_party'][0]
sp_sample = train_df['second_party'][0]
print(fp_sample, ',', sp_sample)

Phil A. St. Amant , Herman A. Thompson


In [17]:
cols = ['first_party', 'second_party', 'facts']
shortword = re.compile(r'\W*\b\w{1}\b') # 한글자 단어 제거
tokenizer = TreebankWordTokenizer() # Treebank Tokenizer: 단어 단위로 토큰화
stopword = stopwords.words('english') # NLTK 불용어 사전
lemmatizer = WordNetLemmatizer() # NLTK 표제어 추출

In [54]:
def party_list(df, party_column):
    party_list = []
    for party in df[party_column]:
        party = re.sub(shortword, '', party)
        party = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", party)
        token = tokenizer.tokenize(party)
        party_token = [lemmatizer.lemmatize(word) for word in token if word not in stopword]
        # 토큰으로 반환하면 party list로 만들 수 있지 않을까?
        party_list.append(' '.join(party_token))
    return party_list

In [70]:
def word_cleaning(df, column):
    processed_list = []
    for sen in df[column]:
        sen = re.sub(shortword, '', sen)
        sen = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", sen)
        token = tokenizer.tokenize(sen)
        sen_token = [lemmatizer.lemmatize(word) for word in token if word not in stopword]
        processed_list.append(' '.join(sen_token))
    return processed_list

In [71]:
fp_train = word_cleaning(train_df, 'first_party')
sp_train = word_cleaning(train_df, 'second_party')
facts_train = word_cleaning(train_df, 'facts')

fp_test = word_cleaning(test_df, 'first_party')
sp_test = word_cleaning(test_df, 'second_party')
facts_test = word_cleaning(test_df, 'facts')

In [80]:
def fit_vectorizer(fp, sp, facts):
    word_vec = CountVectorizer(ngram_range=(1, 2))
    facts_vec = TfidfVectorizer(ngram_range=(1, 2))
    
    word_vec.fit(fp + sp)
    facts_vec.fit(facts)
    
    return word_vec, facts_vec

def transform_vectorizer(fp, sp, facts, word_vec, facts_vec):
    X1 = word_vec.transform(fp).toarray()
    X2 = word_vec.transform(sp).toarray()
    X3 = facts_vec.transform(facts).toarray()
    
    return np.concatenate((X1, X2, X3), axis=1)

def word_vectorizer(fp_train, sp_train, facts_train, fp_test, sp_test, facts_test):
    word_vec, facts_vec = fit_vectorizer(fp_train, sp_train, facts_train)
    
    X_train = transform_vectorizer(fp_train, sp_train, facts_train, word_vec, facts_vec)
    X_test = transform_vectorizer(fp_test, sp_test, facts_test, word_vec, facts_vec)
    
    return X_train, X_test

In [82]:
X_train, X_test = word_vectorizer(fp_train, sp_train, facts_train, fp_test, sp_test, facts_test)
y_train = train_df['first_party_winner']

In [85]:
print(X_train.shape, y_train.shape)
print(X_test.shape)
print(y_train.value_counts())

(2478, 204261) (2478,)
(1240, 204261)
first_party_winner
1    1649
0     829
Name: count, dtype: int64


In [86]:
# 데이터 불균형 문제 전처리(언더샘플링)
X_nc, y_nc = NeighbourhoodCleaningRule(n_neighbors=3).fit_resample(X_train, y_train)
print('Train Data Shape after UnderSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after UnderSampling')
print(y_nc.value_counts())

Train Data Shape after UnderSampling
(1639, 204261) (1639,)
Train target after UnderSampling
first_party_winner
0    829
1    810
Name: count, dtype: int64


In [87]:
# Train, Validation 분리
Train_X, Val_X, Train_y, Val_y = train_test_split(X_nc, y_nc, test_size=.25, random_state=42, stratify=y_nc)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
print(Train_y.value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(Val_y.value_counts())

Train Data Shape
(1229, 204261) (1229,)
--------------------
Train target
first_party_winner
0    622
1    607
Name: count, dtype: int64
Validation Data Shape
(410, 204261) (410,)
--------------------
Validation target
first_party_winner
0    207
1    203
Name: count, dtype: int64


In [88]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
print(classification_report(Val_y, Logistic.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.68      0.71      0.69       207
           1       0.69      0.65      0.67       203

    accuracy                           0.68       410
   macro avg       0.68      0.68      0.68       410
weighted avg       0.68      0.68      0.68       410



In [91]:
submission_df['first_party_winner'] = Logistic.predict(X_test)
submission_df.to_csv('logi___2.csv', index=False)