In [1]:
import numpy as np
import tensorflow as tf
import pickle
from gensim.models import Word2Vec
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l1, l2
tf.compat.v1.enable_eager_execution()

# Word2Vec 모델 불러오기
word2vec_model = Word2Vec.load("model_skipgram.word2vec")

# 전처리된 데이터에서 'TaggedForms'만 사용하여 전체 텍스트 데이터셋 생성
# 예시에서는 'processed_results'가 사전에 정의되어 있어야 합니다.
with open('processed_results_sample.pickle', 'rb') as handle:
    processed_results = pickle.load(handle)
text_data = [info['TaggedForms'] for _, info in processed_results.items()]

# Keras의 Tokenizer를 사용하여 텍스트 데이터셋 토큰화 및 word_index 생성
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(text_data)

# 모든 시퀀스를 동일한 길이로 패딩
max_len = 500  # 시퀀스의 최대 길이 설정
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# 임베딩 레이어에 설정할 가중치 매트릭스 준비
vocab_size = len(word_index) + 1  # word_index의 크기 + 1
embedding_dim = word2vec_model.vector_size

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_vector = word2vec_model.wv[word]
        embedding_matrix[i] = embedding_vector






In [2]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l1, l2

# Embedding 레이어 생성 및 모델 정의
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(128, activation='relu', kernel_regularizer=l2(0.01)),  # L2 정규화 적용
    Dropout(0.3),
    Dense(3, activation='softmax')
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # 다중 클래스 분류를 위한 손실 함수
              metrics=['accuracy'])

# 모델 요약 출력
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          7938600   
                                                                 
 global_average_pooling1d (  (None, 100)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 128)               12928     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 3)                 387       
                                                                 
Total params: 7951915 (30.33 MB)
Trainable params: 7951915 (30.33 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from mecab import MeCab
mecab = MeCab()

In [4]:
# 데이터 로딩
train_df = pd.read_csv("train.tsv", delimiter='\t')
dev_df = pd.read_csv("dev.tsv", delimiter='\t')


def process_batch(batch):
    """배치 처리 로직을 구현합니다. comments 필드에 대해 형태소 분석 및 품사 태깅을 수행합니다."""
    processed_batch_val = {}
    for idx, info in batch.items():
        processed_info_val = {
            'comments': ' '.join([f'{word}/{tag}' for word, tag in mecab.pos(info['comments'])]),
            'contain_gender_bias': info['contain_gender_bias'],
            'bias': info['bias'],
            'hate': info['hate']
        }
        processed_batch_val[idx] = processed_info_val
    return processed_batch_val

# 전처리 함수 적용
train_data = {idx: row for idx, row in train_df.iterrows()}
dev_data = {idx: row for idx, row in dev_df.iterrows()}
processed_train_data = process_batch(train_data)
processed_dev_data = process_batch(dev_data)

# 전처리된 텍스트 데이터셋 생성
train_texts = [info['comments'] for _, info in processed_train_data.items()]
dev_texts = [info['comments'] for _, info in processed_dev_data.items()]

# Tokenizer를 사용하여 텍스트 데이터셋 토큰화 및 word_index 생성
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
dev_sequences = tokenizer.texts_to_sequences(dev_texts)

# 모든 시퀀스를 동일한 길이로 패딩
train_padded = pad_sequences(train_sequences, maxlen=500, padding='post')
dev_padded = pad_sequences(dev_sequences, maxlen=500, padding='post')




In [5]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# 레이블 인코더 초기화 및 인코딩 수행
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df['hate'])
dev_labels_encoded = label_encoder.transform(dev_df['hate'])

# 원-핫 인코딩으로 변환
train_labels = to_categorical(train_labels_encoded)
dev_labels = to_categorical(dev_labels_encoded)



In [6]:
print(train_padded.shape)
print(dev_padded.shape)

(7896, 500)
(471, 500)


In [10]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ModelCheckpoint

X = train_padded
y = train_labels

# 교차 검증을 위한 KFold 설정
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []  # 각 폴드의 정확도를 저장할 리스트
fold_no = 1

for train, test in kfold.split(X, np.argmax(y, axis=1)):
    model = Sequential([
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        Dropout(0.3),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    print(f'Training for fold {fold_no} ...')
    
    model.fit(X[train], y[train], epochs=50, batch_size=32, verbose=0, validation_data=(X[test], y[test]))
    
    # 모델 평가 및 정확도 저장
    scores = model.evaluate(X[test], y[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%\n')
    accuracies.append(scores[1] * 100)
    
    fold_no += 1

# 평균 정확도와 표준편차 계산 및 출력
print(f'Accuracy over all folds: {np.mean(accuracies):.2f}% (+/- {np.std(accuracies):.2f}%)')



Training for fold 1 ...
Score for fold 1: loss of 1.127647876739502; accuracy of 55.506330728530884%

Training for fold 2 ...
Score for fold 2: loss of 1.0474457740783691; accuracy of 58.70804190635681%

Training for fold 3 ...
Score for fold 3: loss of 1.0565822124481201; accuracy of 57.948070764541626%

Training for fold 4 ...
Score for fold 4: loss of 1.041219711303711; accuracy of 59.65800881385803%

Training for fold 5 ...
Score for fold 5: loss of 1.0490326881408691; accuracy of 59.59467887878418%

Accuracy over all folds: 58.28% (+/- 1.52%)
