In [199]:
# 학습 데이터 불러오기
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import os
import json
from tqdm import tqdm
import re
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer


# Load the preprocessed NSMC training data
DATA_PATH = 'data/nsmc-master/CLEAN_DATA/'
DATA_OUT = 'data/nsmc-master/DATA_OUT/'
INPUT_TRAIN_DATA = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

# Pass paths to np.load so NumPy opens and closes the files itself; handing it
# an `open(...)` handle left the descriptor open for the life of the kernel.
train_input = np.load(DATA_PATH + INPUT_TRAIN_DATA)
# maxlen equals the stored width, so no row is truncated or extended here.
train_input = pad_sequences(train_input, maxlen=train_input.shape[1])
train_label = np.load(DATA_PATH + LABEL_TRAIN_DATA)

# Preprocessing metadata; 'vocab' and 'vocab_size' are read from it below.
with open(DATA_PATH + DATA_CONFIGS, 'r') as config_file:
    prepro_configs = json.load(config_file)

In [200]:
# Run-level settings. MAX_LEN is the width of the padded training matrix.
model_name = 'cnn_classifier_kr'
BATCH_SIZE = 512
NUM_EPOCHS = 10
VALID_SPLIT = 0.1
MAX_LEN = train_input.shape[1]

# Keyword arguments consumed by CNNClassifier.__init__.
# The 'embbeding_size' spelling is the key the classifier actually reads,
# so it must stay exactly as written.
kargs = dict(
    model_name=model_name,
    vocab_size=prepro_configs['vocab_size'],
    embbeding_size=128,
    num_filters=100,
    dropout_rate=0.5,
    hidden_dimension=250,
    output_dimension=1,
)

In [201]:
class CNNClassifier(tf.keras.Model):
  """Convolutional text classifier.

  Pipeline: embedding -> dropout -> three parallel Conv1D branches (kernel
  sizes 3, 4 and 5), each followed by global max pooling -> concatenation ->
  dense ReLU layer -> dense sigmoid output.

  Keyword arguments read from ``kargs``: ``model_name``, ``vocab_size``,
  ``embbeding_size``, ``num_filters``, ``dropout_rate``, ``hidden_dimension``
  and ``output_dimension``.
  """

  def __init__(self, **kargs):
    super().__init__(name=kargs['model_name'])
    relu = tf.keras.activations.relu
    max_norm = tf.keras.constraints.MaxNorm

    # Sub-layers are created in a fixed order: embedding, convolutions,
    # pooling, dropout, fc1, fc2.
    self.embedding = layers.Embedding(
        input_dim=kargs['vocab_size'],
        output_dim=kargs['embbeding_size'])
    self.conv_list = [
        layers.Conv1D(
            filters=kargs['num_filters'],
            kernel_size=window,
            padding='valid',
            activation=relu,
            kernel_constraint=max_norm(max_value=3))
        for window in [3, 4, 5]
    ]
    self.pooling = layers.GlobalMaxPooling1D()
    self.dropout = layers.Dropout(kargs['dropout_rate'])
    self.fc1 = layers.Dense(
        units=kargs['hidden_dimension'],
        activation=relu,
        kernel_constraint=max_norm(max_value=3.))
    self.fc2 = layers.Dense(
        units=kargs['output_dimension'],
        activation=tf.keras.activations.sigmoid,
        kernel_constraint=max_norm(max_value=3.))

  def call(self, x):
    """Run the forward pass on a batch of token-index sequences."""
    embedded = self.dropout(self.embedding(x))
    # One max-pooled feature vector per kernel size, joined on the feature axis.
    pooled = [self.pooling(conv(embedded)) for conv in self.conv_list]
    merged = tf.concat(pooled, axis=1)
    return self.fc2(self.fc1(merged))

4. 학습하기
에포크는 10으로 주어 학습을 진행하고, 검증 정확도가 그전보다 낮아지면 학습을 멈추도록(early stopping) 설계하였습니다. 단, 이 노트북에서는 학습을 다시 수행하지 않고, 저장해 둔 가중치(`weights.h5`)를 불러와 평가와 예측에 사용합니다.

In [202]:
# Instantiate the classifier and configure it for binary classification:
# Adam optimizer, binary cross-entropy loss, binary accuracy metric.
model = CNNClassifier(**kargs)
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
    optimizer=tf.keras.optimizers.Adam(),
)


In [203]:
# File names for the preprocessed test split and the saved model weights.
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'
SAVE_FILE_NM = 'weights.h5'

# Pass paths to np.load so NumPy opens and closes the files itself; handing it
# an `open(...)` handle left the descriptor open for the life of the kernel.
test_input = np.load(DATA_PATH + INPUT_TEST_DATA)
# maxlen equals the stored width, so no row is truncated or extended here.
test_input = pad_sequences(test_input, maxlen=test_input.shape[1])
test_label_data = np.load(DATA_PATH + LABEL_TEST_DATA)


In [204]:
# A subclassed Keras model has no variables until it has been called once, and
# HDF5 weights can only be loaded into a built model. Build it by running a
# single sample through `__call__` instead of forcing `model.built = True` and
# pushing the entire test set through `model.call`.
model(test_input[:1])

# Compose the weights path from the constants defined above rather than
# repeating the literal path.
model.load_weights(os.path.join(DATA_OUT, model_name, SAVE_FILE_NM))

# Returns [loss, accuracy] on the test split.
model.evaluate(test_input, test_label_data)



[0.381698876619339, 0.8278399705886841]

In [205]:
# Word -> index encoder used at inference time, fitted on the vocabulary
# stored in the preprocessing config.
# NOTE(review): fit_on_texts re-derives the indices from prepro_configs["vocab"];
# presumably this reproduces the training-time word index -- confirm.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(prepro_configs["vocab"])

# Morphological analyzer used to tokenize incoming sentences.
okt = Okt()

In [206]:
# Maximum sentence length: sequences are padded/truncated to this many entries.
MAX_LENGTH = 8

# Sample sentence for a quick manual check.
sentence = "진짜 이거는 16강 몰라"
# To read a sentence interactively instead: input('고맙습니다.: ')

In [290]:
def clean_sentence(sentence):
    """Turn one raw sentence into a padded index sequence for the model.

    The text is stripped of every character that is neither Hangul nor
    whitespace, split into stemmed morphemes with ``okt``, filtered against a
    stopword list, mapped to word indices with ``tokenizer`` and padded to
    ``MAX_LENGTH``.

    Args:
        sentence: Raw text of a single sentence.

    Returns:
        The ``pad_sequences`` result for the single encoded sentence: one row
        of ``MAX_LENGTH`` entries. The row is all zeros when no token is left
        after filtering or none is known to the tokenizer.
    """
    # `\s` (single backslash) is the whitespace class. The previous `\\s` inside
    # a raw string matched a literal backslash and the letter "s" instead.
    sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', sentence)

    # Add any further stopwords here.
    stopwords = {'은', '는', '이', '가', '하', '아', '것', '들',
                 '의', '있', '되', '수', '보', '주', '등', '한'}

    # Tokenize into stemmed morphemes and drop stopwords.
    tokens = [word for word in okt.morphs(sentence, stem=True)
              if word not in stopwords]

    # Wrap the token list so the whole sentence is encoded as ONE sequence.
    # Passing the bare list treated every token as its own text, producing one
    # single-token row per morpheme rather than one row per sentence.
    vector = tokenizer.texts_to_sequences([tokens])

    # An empty sentence yields [[]], which pads to a single all-zero row.
    return pad_sequences(vector, maxlen=MAX_LENGTH)

In [305]:
def predict_sent(sentence):
    """Print the predicted sentiment of ``sentence`` with its confidence.

    The sentence is encoded with ``clean_sentence`` and scored by ``model``;
    the mean of the model outputs is used as the positive-class score. A score
    above 0.5 is reported as positive, anything else as negative.

    Args:
        sentence: Raw text to classify.

    Returns:
        None. The verdict is written to stdout.
    """
    scores = model.predict(clean_sentence(sentence))
    positive_score = np.average(scores)

    if not (positive_score > 0.5):
        # Report the complementary score for the negative verdict.
        print("{:.2f}% 확률로 부정 리뷰입니다.\n".format((1 - positive_score) * 100))
        return

    print("{:.2f}% 확률로 긍정 리뷰입니다.\n".format(positive_score * 100))

In [292]:
# Section 4: November 9 through November 29

# Read the cleaned chat log (first CSV column becomes the index) and drop
# every row that has a missing value.
zoom = pd.read_csv("data/clean/zoom_total.csv", index_col=0).dropna()
zoom.head()

Unnamed: 0,Date,Time,Author,Content
0,2022-10-17,11:30:24,AI_15_이상필,안녕하세요~^-^
1,2022-10-17,11:30:25,AI_15_전현아,안녕하세요~
2,2022-10-17,11:30:28,AI_15_윤종률,안녕하세요
3,2022-10-17,11:30:30,AI_15_이세윤,안녕하세요~
4,2022-10-17,11:30:33,AI_15_박난,안녕하세요


In [293]:
# Rows whose "Date" value compares >= "2022-11-09" (start of Section 4).
zoom_s4 = zoom.loc[zoom["Date"].ge("2022-11-09")]
zoom_s4.head()

Unnamed: 0,Date,Time,Author,Content
8889,2022-11-09,11:30:23,AI_15_최준영,안녕하세요
8890,2022-11-09,11:30:25,AI_15_이도연,안녕하세용
8891,2022-11-09,11:30:25,AI_15_이태섭,안녕하세요
8892,2022-11-09,11:30:30,AI_15_전현아,안녕하세요~
8893,2022-11-09,11:30:32,AI_15_윤종률,안녕하세요


In [294]:
# Rows whose "Date" value equals "2022-11-09" (first day of Section 4 only).
zoom_s4_1109 = zoom.loc[zoom["Date"].eq("2022-11-09")]

In [307]:
def predict_zoom(df):
    """Score every message in ``df["Content"]`` and return an annotated copy.

    Each message is encoded with ``clean_sentence``. All encoded rows are sent
    to ``model.predict`` in a single call, and the outputs belonging to one
    message are averaged into its score.

    Args:
        df: DataFrame with a "Content" column holding the message text.

    Returns:
        A copy of ``df`` with two added columns: "Proba", the mean model
        output for the message, and "Label", 1 when "Proba" is above 0.5 and
        0 otherwise. The input frame is not modified.
    """
    df_copy = df.copy()

    encoded = [clean_sentence(line) for line in df_copy["Content"]]
    if not encoded:
        # Nothing to score; still return the two result columns.
        df_copy["Label"] = []
        df_copy["Proba"] = []
        return df_copy

    # One predict call for the whole frame: Model.predict carries per-call
    # overhead and is not meant to be invoked once per row inside a loop.
    scores = model.predict(np.concatenate(encoded, axis=0)).reshape(-1)

    # Cut the flat score vector back into one chunk per message.
    boundaries = np.cumsum([len(rows) for rows in encoded])[:-1]
    lst_proba = [np.average(chunk) for chunk in np.split(scores, boundaries)]
    lst_label = [1 if proba > 0.5 else 0 for proba in lst_proba]

    df_copy["Label"] = lst_label
    df_copy["Proba"] = lst_proba
    return df_copy

In [308]:
# Score every Section 4 chat message and preview the first five rows.
s4_res = predict_zoom(zoom_s4)
s4_res.head(5)

Unnamed: 0,Date,Time,Author,Content,Label,Proba
8889,2022-11-09,11:30:23,AI_15_최준영,안녕하세요,0,0.481098
8890,2022-11-09,11:30:25,AI_15_이도연,안녕하세용,0,0.481098
8891,2022-11-09,11:30:25,AI_15_이태섭,안녕하세요,0,0.481098
8892,2022-11-09,11:30:30,AI_15_전현아,안녕하세요~,0,0.481098
8893,2022-11-09,11:30:32,AI_15_윤종률,안녕하세요,0,0.481098


In [310]:
# Persist the scored Section 4 messages (index included) as CSV.
s4_res.to_csv(path_or_buf="data/zoom_s4.csv")