### Loading

In [1]:
import os
import random
import numpy as np
import pandas as pd
import re

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/yunho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yunho/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Fix Seed
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(1004)

In [4]:
# Load Data
train_df = pd.read_csv('../Data/train.csv').drop('ID', axis=1)
test_df = pd.read_csv('../Data/test.csv').drop('ID', axis=1)
submission_df = pd.read_csv('../Data/sample_submission.csv')

In [5]:
train_df.head(5)

Unnamed: 0,first_party,second_party,facts,first_party_winner
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [6]:
aug_df = pd.DataFrame({'first_party': train_df['second_party'],
                       'second_party': train_df['first_party'],
                       'facts': train_df['facts'],
                       'first_party_winner': 1-train_df['first_party_winner']})

train_df = pd.concat([train_df, aug_df], ignore_index=True)
train_df

Unnamed: 0,first_party,second_party,facts,first_party_winner
0,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...
4951,"Renewable Fuels Association, et al.","HollyFrontier Cheyenne Refining, LLC, et al.",Congress amended the Clean Air Act through the...,0
4952,"Alliance Bond Fund, Inc.","Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc., an investment fund, ...",0
4953,United States,Peguero,"In 1992, the District Court sentenced Manuel D...",1
4954,St. Cyr,Immigration and Naturalization Service,"On March 8, 1996, Enrico St. Cyr, a lawful per...",1


In [7]:
fp_sample = train_df['first_party'][0]
sp_sample = train_df['second_party'][0]
print(fp_sample, ',', sp_sample)

Phil A. St. Amant , Herman A. Thompson


### Preprocessing

In [8]:
shortword = re.compile(r'\W*\b\w{1}\b') # 한글자 단어 제거
tokenizer = TreebankWordTokenizer() # Treebank Tokenizer: 단어 단위로 토큰화
stopword = stopwords.words('english') # NLTK 불용어 사전
lemmatizer = WordNetLemmatizer() # NLTK 표제어 추출

In [9]:
# 본 함수는 사용하지 않지만, 나중에 party list를 만드는데 유용할 것 같아서 남겨둠
def party_list(df, party_column):
    party_list = []
    for party in df[party_column]:
        party = re.sub(shortword, '', party)
        party = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", party)
        token = tokenizer.tokenize(party)
        party_token = [lemmatizer.lemmatize(word) for word in token if word not in stopword]
        # 토큰으로 반환하면 party list로 만들 수 있지 않을까?
        party_list.append(' '.join(party_token))
    return party_list

In [10]:
def word_cleaning(df, column):
    processed_list = []
    for sen in df[column]:
        sen = re.sub(shortword, '', sen)
        sen = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", sen)
        token = tokenizer.tokenize(sen)
        sen_token = [lemmatizer.lemmatize(word) for word in token if word not in stopword]
        processed_list.append(' '.join(sen_token))
    return processed_list

In [11]:
fp_train = word_cleaning(train_df, 'first_party')
sp_train = word_cleaning(train_df, 'second_party')
facts_train = word_cleaning(train_df, 'facts')

fp_test = word_cleaning(test_df, 'first_party')
sp_test = word_cleaning(test_df, 'second_party')
facts_test = word_cleaning(test_df, 'facts')

In [12]:
def fit_vectorizer(fp, sp, facts):
    word_vec = CountVectorizer(ngram_range=(1, 2))
    facts_vec = TfidfVectorizer(ngram_range=(1, 2))
    
    word_vec.fit(fp + sp)
    facts_vec.fit(facts)
    
    return word_vec, facts_vec

def transform_vectorizer(fp, sp, facts, word_vec, facts_vec):
    X1 = word_vec.transform(fp).toarray()
    X2 = word_vec.transform(sp).toarray()
    X3 = facts_vec.transform(facts).toarray()
    
    return np.concatenate((X1, X2, X3), axis=1)

def word_vectorizer(fp_train, sp_train, facts_train, fp_test, sp_test, facts_test):
    word_vec, facts_vec = fit_vectorizer(fp_train, sp_train, facts_train)
    
    X_train = transform_vectorizer(fp_train, sp_train, facts_train, word_vec, facts_vec)
    X_test = transform_vectorizer(fp_test, sp_test, facts_test, word_vec, facts_vec)
    
    return X_train, X_test

In [13]:
X_train, X_test = word_vectorizer(fp_train, sp_train, facts_train, fp_test, sp_test, facts_test)
y_train = train_df['first_party_winner']

In [14]:
print(X_train.shape, y_train.shape)
print(X_test.shape)
print(y_train.value_counts())

(4956, 207490) (4956,)
(1240, 207490)
first_party_winner
1    2478
0    2478
Name: count, dtype: int64


### Augmentated Outcome

In [15]:
# Train, Validation 분리
Train_X, Val_X, Train_y, Val_y = train_test_split(X_train, y_train, test_size=.25, random_state=1004, stratify=y_train)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
print(Train_y.value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(Val_y.value_counts())

Train Data Shape
(3717, 207490) (3717,)
--------------------
Train target
first_party_winner
1    1859
0    1858
Name: count, dtype: int64
Validation Data Shape
(1239, 207490) (1239,)
--------------------
Validation target
first_party_winner
0    620
1    619
Name: count, dtype: int64


In [16]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
print(classification_report(Val_y, Logistic.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.42      0.42      0.42       620
           1       0.42      0.42      0.42       619

    accuracy                           0.42      1239
   macro avg       0.42      0.42      0.42      1239
weighted avg       0.42      0.42      0.42      1239



In [17]:
submission_df['first_party_winner'] = Logistic.predict(X_test)
submission_df.to_csv('logi___3.csv', index=False)

### Undersampling

In [18]:
# 데이터 불균형 문제 전처리(언더샘플링)
X_nc, y_nc = NeighbourhoodCleaningRule(n_neighbors=3).fit_resample(X_train, y_train)
print('Train Data Shape after UnderSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after UnderSampling')
print(y_nc.value_counts())

Train Data Shape after UnderSampling
(3595, 207490) (3595,)
Train target after UnderSampling
first_party_winner
0    2478
1    1117
Name: count, dtype: int64


In [19]:
# Train, Validation 분리
Train_X, Val_X, Train_y, Val_y = train_test_split(X_nc, y_nc, test_size=.25, random_state=42, stratify=y_nc)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
print(Train_y.value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(Val_y.value_counts())

Train Data Shape
(2696, 207490) (2696,)
--------------------
Train target
first_party_winner
0    1858
1     838
Name: count, dtype: int64
Validation Data Shape
(899, 207490) (899,)
--------------------
Validation target
first_party_winner
0    620
1    279
Name: count, dtype: int64


In [20]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
print(classification_report(Val_y, Logistic.predict(Val_X)))

              precision    recall  f1-score   support

           0       0.73      0.90      0.81       620
           1       0.54      0.28      0.37       279

    accuracy                           0.70       899
   macro avg       0.64      0.59      0.59       899
weighted avg       0.67      0.70      0.67       899



In [21]:
submission_df['first_party_winner'] = Logistic.predict(X_test)
submission_df.to_csv('logi___2.csv', index=False)