# 데이터 전처리

In [0]:
import pandas as pd
import numpy as np
import re
import json

In [0]:
# Directory that holds the input CSV files. The trailing slash matters because
# file names are appended to it with plain string concatenation.
DATA_IN_PATH = r'/data_in/'

# File name of the training CSV inside DATA_IN_PATH.
TRAIN_INPUT_DATA = 'train.csv'

# Load the training set; later cells read its 'question1', 'question2' and
# 'is_duplicate' columns.
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_INPUT_DATA)

# 두 라벨의 개수의 균형 맞추기

In [0]:
# Split the training rows by label.
train_pos_data = train_data.loc[train_data['is_duplicate'] == 1]  # duplicate question pairs
train_neg_data = train_data.loc[train_data['is_duplicate'] == 0]  # non-duplicate question pairs

# Signed size gap between the classes (positive when non-duplicates dominate).
class_difference = len(train_neg_data) - len(train_pos_data)

# Downsample whichever class is larger to the size of the smaller one.
# An explicit row count (n=) is used instead of a float fraction so both
# classes end up with exactly the same number of rows, and both directions are
# handled so the cell no longer fails with a ValueError (sample(frac > 1))
# when the duplicate class happens to be the larger one.
if class_difference >= 0:
    train_neg_data = train_neg_data.sample(n=len(train_pos_data))
else:
    train_pos_data = train_pos_data.sample(n=len(train_neg_data))

In [0]:
# Report the size of each class after balancing (duplicates first, then
# non-duplicates).
print('중복 질문 개수 : {}'.format(len(train_pos_data)))
print('중복이 아닌 질문 개수 : {}'.format(len(train_neg_data)))

중복 질문 개수 : 149263
중복이 아닌 질문 개수 : 149263


In [0]:
# Merge the per-label frames back into a single training frame.
# The rows are not shuffled here: all non-duplicate rows come first, followed
# by all duplicate rows.
train_data = pd.concat([train_neg_data, train_pos_data])

In [0]:
# Punctuation to strip from every question, written as a regex character class.
FILTERS = "([~.,!?\"':;)(])"
change_filter = re.compile(FILTERS)  # compiled once and reused for every question

# Blank question cells are loaded by read_csv as NaN, and str(NaN) would inject
# the literal word "nan" into the text. Treat missing questions as empty
# strings instead.
question1 = [str(s) for s in train_data['question1'].fillna('')]
question2 = [str(s) for s in train_data['question2'].fillna('')]

# Remove the filtered punctuation and lower-case every question.
filtered_question1s = [change_filter.sub("", q).lower() for q in question1]
filtered_question2s = [change_filter.sub("", q).lower() for q in question2]

In [0]:
from nltk.corpus import stopwords

# Cache of stop-word sets keyed by language. It is filled on first use so the
# NLTK corpus is read once instead of once per question.
_STOP_WORD_SETS = {}


def remove_stopword(question, remove_stopwords=False):
    """Collapse whitespace in *question* and optionally drop English stop words.

    Args:
        question: Text to clean; it is split on runs of whitespace.
        remove_stopwords: When True, tokens that appear in NLTK's English
            stop-word list are removed. Matching is exact (case-sensitive).

    Returns:
        The remaining tokens joined by single spaces.
    """
    words = question.split()

    if remove_stopwords:
        stop_words = _STOP_WORD_SETS.get('english')
        if stop_words is None:
            # Build the set only once; this function is called per question.
            stop_words = frozenset(stopwords.words('english'))
            _STOP_WORD_SETS['english'] = stop_words
        words = [w for w in words if w not in stop_words]

    return ' '.join(words)

In [0]:
# Drop English stop words from every cleaned training question, keeping one
# output list per question column.
filtered_question1 = [
    remove_stopword(text, remove_stopwords=True) for text in filtered_question1s
]
filtered_question2 = [
    remove_stopword(text, remove_stopwords=True) for text in filtered_question2s
]

In [0]:
# Preview the first ten stop-word-filtered question1 strings.
filtered_question1[:10]

['dispose e-waste india',
 'study class 11th biology',
 'way hold prenup postnup agreements legal indian court law',
 'makes election democratic',
 'handwriting tell person',
 'important truthfulness',
 'find owner google sheet',
 'deal arrogant coworker',
 'difference high mortality rate low mortality rate',
 'brit know canada visiting first time']

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer

# Fit one tokenizer on both question columns so they share a single vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(filtered_question1 + filtered_question2)
# Vocabulary learned by the tokenizer; it is stored in data_configs later on.
word_vocab = tokenizer.word_index

In [0]:
# Convert each cleaned training question into a sequence of vocabulary indices.
question1_sequence = tokenizer.texts_to_sequences(filtered_question1)
question2_sequence = tokenizer.texts_to_sequences(filtered_question2)

In [0]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Fixed length every question sequence is brought to.
MAX_SEQUENCE_LENGTH = 31

# Bring every sequence to MAX_SEQUENCE_LENGTH; padding='post' puts the padding
# at the end of short sequences.
q1_data = pad_sequences(question1_sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
q2_data = pad_sequences(question2_sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [0]:
# Labels as an integer array, in the same row order as q1_data / q2_data
# (all three are derived from train_data without reordering).
labels = np.array(train_data['is_duplicate'], dtype=int)

# Sanity-check the shapes of the model inputs and the vocabulary size.
print('Shape of question1 data: {}'.format(q1_data.shape))
print('Shape of question2 data: {}'.format(q2_data.shape))
print('Shape of label: {}'.format(labels.shape))
print('Words in index: {}'.format(len(word_vocab)))

Shape of question1 data: (298526, 31)
Shape of question2 data: (298526, 31)
Shape of label: (298526,)
Words in index: 76323


In [0]:
# Bundle the vocabulary with its size for later use.
# NOTE(review): the +1 presumably reserves index 0, which padding uses — confirm.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab) + 1,
}

In [0]:
# Output file names, all written under DATA_IN_PATH.
TRAIN_Q1_DATA = 'q1_train.npy'
TRAIN_Q2_DATA = 'q2_train.npy'
TRAIN_LABEL_DATA = 'label_train.npy'
# NOTE: this file holds JSON despite the .npy extension; the name is kept so
# existing readers of the file keep working.
DATA_CONFIGS = 'data_configs.npy'

# Write through context managers so every handle is flushed and closed as soon
# as its file is written, instead of being left open until garbage collection.
with open(DATA_IN_PATH + TRAIN_Q1_DATA, 'wb') as out_file:
    np.save(out_file, q1_data)
with open(DATA_IN_PATH + TRAIN_Q2_DATA, 'wb') as out_file:
    np.save(out_file, q2_data)
with open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'wb') as out_file:
    np.save(out_file, labels)

with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as out_file:
    json.dump(data_configs, out_file)

# 평가 데이터 전처리

In [0]:
# Load the evaluation (test) set.
test_data = pd.read_csv(DATA_IN_PATH + 'test.csv')
# Disabled clean-up, kept for reference: keep only rows whose test_id is an
# int, drop duplicate rows (drop_duplicates removes duplicate values), then
# count the duplicated ids.
#valid_ids = [type(x) == int for x in test_data['test_id']]
#test_data = test_data[valid_ids].drop_duplicates()
#test_data['test_id'].duplicated().sum()

In [0]:
# Blank question cells are loaded by read_csv as NaN, and str(NaN) would inject
# the literal word "nan" into the text. Treat missing questions as empty
# strings instead.
test_questions1 = [str(s) for s in test_data['question1'].fillna('')]
test_questions2 = [str(s) for s in test_data['question2'].fillna('')]

# Apply the same punctuation filter and lower-casing used for the training data.
filtered_test_questions1s = [change_filter.sub("", q).lower() for q in test_questions1]
filtered_test_questions2s = [change_filter.sub("", q).lower() for q in test_questions2]

In [0]:
# Drop English stop words from every cleaned test question, keeping one output
# list per question column.
filtered_test_question1 = [
    remove_stopword(text, remove_stopwords=True) for text in filtered_test_questions1s
]
filtered_test_question2 = [
    remove_stopword(text, remove_stopwords=True) for text in filtered_test_questions2s
]

In [0]:
# Encode the test questions with the tokenizer that was fitted on the training
# questions, so both sets share the same vocabulary indices.
test_questions1_sequence = tokenizer.texts_to_sequences(filtered_test_question1)
test_questions2_sequence = tokenizer.texts_to_sequences(filtered_test_question2)

# Pad to the same fixed length and with the same padding side as the training data.
test_q1_data = pad_sequences(test_questions1_sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_q2_data = pad_sequences(test_questions2_sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [0]:
# Row identifiers of the test set, in the same order as test_q1_data / test_q2_data.
test_id = np.array(test_data['test_id'])

# Sanity-check that inputs and ids have matching row counts.
print('Shape of question1 data: {}'.format(test_q1_data.shape))
print('Shape of question2 data: {}'.format(test_q2_data.shape))
print('Shape of ids: {}'.format(test_id.shape))

Shape of question1 data: (2345796, 31)
Shape of question2 data: (2345796, 31)
Shape of ids: (2345796,)


In [0]:
# Output file names for the preprocessed test set, written under DATA_IN_PATH.
TEST_Q1_DATA = 'test_q1.npy'
TEST_Q2_DATA = 'test_q2.npy'
TEST_ID_DATA = 'test_id.npy'

# Write through context managers so every handle is flushed and closed as soon
# as its file is written, instead of being left open until garbage collection.
with open(DATA_IN_PATH + TEST_Q1_DATA, 'wb') as out_file:
    np.save(out_file, test_q1_data)
with open(DATA_IN_PATH + TEST_Q2_DATA, 'wb') as out_file:
    np.save(out_file, test_q2_data)
with open(DATA_IN_PATH + TEST_ID_DATA, 'wb') as out_file:
    np.save(out_file, test_id)