In [2]:
import pandas as pd
import numpy as np
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [7]:
# Tokens that are filtered out of every tokenized sentence in this notebook.
stopwords = [
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍',
    '과', '도', '를', '으로', '자', '에', '와', '한', '하다',
]
# Single Okt analyzer instance shared by the preprocessing and prediction code.
okt = Okt()

In [160]:
def sentiment_predict(model, new_sentence, tokenizer, maxlen=50):
    """Predict the sentiment class index for one raw sentence.

    The sentence is stripped of every character outside Hangul and space,
    split into stemmed morphemes with Okt, filtered against ``stopwords``,
    integer-encoded with ``tokenizer`` and padded to ``maxlen``.

    Args:
        model: Object whose ``predict`` returns one row of class scores for
            the padded input.
        new_sentence: Raw sentence string.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        maxlen: Padded sequence length. Defaults to 50, the length used by
            ``make_X_data``; it has to match what the model was trained on.

    Returns:
        Index of the highest-scoring class for the sentence.
    """
    cleaned = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', new_sentence)
    tokens = [word for word in okt.morphs(cleaned, stem=True) if word not in stopwords]
    # pad_sequences already returns an ndarray, so no extra np.array() is needed.
    padded = pad_sequences(tokenizer.texts_to_sequences([tokens]), maxlen=maxlen)
    # Only one sentence is passed in, so read the argmax from that single row.
    return np.argmax(model.predict(padded)[0])

In [10]:
def save_labeling(filename, sentences, sentiments, objectives):
    """Write labelled sentences to ``filename`` as tab-separated lines.

    Each line has the form ``sentence<TAB>sentiment<TAB>objective``. The three
    inputs are walked in parallel with ``zip``, so output stops at the
    shortest one. The file is opened in write mode with UTF-8 encoding, which
    replaces any existing content.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            f"{text}\t{label}\t{flag}\n"
            for text, label, flag in zip(sentences, sentiments, objectives)
        )

In [136]:
def training(X, y, output_dim, optimizer_function, loss_function, activation_function,
             vocab_size=50000, embedding_dim=100, hidden_units=128,
             epochs=15, batch_size=64, validation_split=0.2):
    """Build, compile and fit an Embedding -> LSTM -> Dense classifier.

    Args:
        X: 2-D array of padded integer sequences; ``X.shape[1]`` is used as
            the embedding input length.
        y: Training targets passed straight to ``model.fit``.
        output_dim: Number of units in the final Dense layer.
        optimizer_function: Optimizer passed to ``model.compile``.
        loss_function: Loss passed to ``model.compile``.
        activation_function: Activation of the final Dense layer.
        vocab_size: Embedding input dimension. The default of 50000 matches
            the ``Tokenizer(50000)`` used in this notebook; it must be larger
            than the largest token index in ``X``.
        embedding_dim: Size of each embedding vector.
        hidden_units: Number of LSTM units.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        validation_split: Fraction of the data held out for validation.

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=X.shape[1]))
    model.add(LSTM(hidden_units))
    model.add(Dense(output_dim, activation=activation_function))

    model.compile(optimizer=optimizer_function, loss=loss_function, metrics=['acc'])
    # NOTE(review): per the Keras docs, validation_split takes the LAST rows of
    # X/y before any shuffling. If the rows are ordered by label, the
    # validation set will not be representative - consider shuffling X and y
    # together before calling this function.
    model.fit(X, y, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
    return model

In [12]:
def make_X_data(X, tokenizer, labeling_index, maxlen=50):
    """Turn the first ``labeling_index`` raw sentences into padded sequences.

    Each sentence is stripped of every character outside Hangul and space,
    split into stemmed morphemes with Okt, filtered against ``stopwords``,
    integer-encoded with ``tokenizer`` and padded to ``maxlen``.

    Args:
        X: Sequence of raw sentence strings.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        labeling_index: Number of leading sentences to process.
        maxlen: Padded sequence length (default 50).

    Returns:
        The array produced by ``pad_sequences``, one row per processed
        sentence, in input order.
    """
    cleaned = [re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", sent) for sent in X[:labeling_index]]
    # Filter per token, not per sentence: testing a whole token list against the
    # stopword strings never matches, and dropping sentences would misalign the
    # rows with their labels.
    tokenized = [
        [word for word in okt.morphs(sent, stem=True) if word not in stopwords]
        for sent in cleaned
    ]
    encoded = tokenizer.texts_to_sequences(tokenized)
    return pad_sequences(encoded, maxlen=maxlen)

In [13]:
def make_y_data(y, labeling_index):
    """Return the first ``labeling_index`` entries of ``y``."""
    head = slice(None, labeling_index)
    return y[head]

In [14]:
def get_data_from_path(file_path):
    """Load a header-less, tab-separated labelling file.

    The columns are read, in order, as ``sentence``, ``sentiment`` and
    ``objective``.

    Returns:
        A 3-tuple ``(sentences, sentiments, objectives)`` holding the
        ``.values`` array of each column.
    """
    column_names = ['sentence', 'sentiment', 'objective']
    frame = pd.read_csv(file_path, sep='\t', names=column_names)
    return tuple(frame[column].values for column in column_names)

In [134]:
# Load the labelled corpus: one array each for sentences, sentiment labels and objective labels.
sentences, sentiments, objectives = get_data_from_path("./data/new_labeling2.txt")

In [135]:
# Token lists used to fit the tokenizer vocabulary.
X_train = []
for sentence in sentences:
    # Apply the same cleanup regex as make_X_data / sentiment_predict so the
    # vocabulary is built from the text form the models are actually fed.
    cleaned_sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', sentence)
    tokenized_sentence = okt.morphs(cleaned_sentence, stem=True)
    stopword_removed_sentence = [word for word in tokenized_sentence if word not in stopwords]
    X_train.append(stopword_removed_sentence)

In [126]:
# Vocabulary cap of 50000; training() sizes its Embedding layer with the same number.
tokenizer = Tokenizer(num_words=50000)
# Build the word index from the pre-tokenized, stopword-filtered sentences.
tokenizer.fit_on_texts(X_train)

In [145]:
# Number of leading rows to use for training - here the whole corpus.
t_len = len(sentences)

In [146]:
# Clean, tokenize, integer-encode and pad the first t_len sentences.
training_sentences = make_X_data(sentences, tokenizer, t_len)

In [147]:
# Take the labels for the same first t_len rows as training_sentences.
training_sentiments = make_y_data(sentiments, t_len)
training_objectives = make_y_data(objectives, t_len)

In [148]:
# Missing sentiment labels (NaN) default to class 1; labels are then cast to int.
training_sentiments = np.nan_to_num(training_sentiments, nan=1).astype(int)
# Preview the first quarter of the labels.
training_sentiments[:len(training_sentiments) // 4]

array([0, 0, 0, ..., 2, 1, 1])

In [149]:
# Missing objective labels (NaN) default to 1; labels are then cast to int.
# np.where builds a new array instead of assigning through the slice returned
# by make_y_data. That slice aliases `objectives`, so in-place assignment would
# silently edit the loaded data, and it fails outright when the array is
# read-only (presumably the case for `.values` under pandas Copy-on-Write -
# confirm against the installed pandas version).
training_objectives = np.where(np.isnan(training_objectives), 1, training_objectives).astype(int)
# Preview the first quarter of the labels.
training_objectives[:len(training_objectives) // 4]

array([1, 1, 1, ..., 0, 1, 0])

In [150]:
def get_multi_y_train(y, num_of_classes: int):
    """One-hot encode integer class labels.

    Args:
        y: 1-D integer array of class indices; ``y.shape[0]`` gives the
            number of samples.
        num_of_classes: Width of the one-hot encoding.

    Returns:
        Float array of shape ``(y.shape[0], num_of_classes)`` that is zero
        everywhere except for a 1 at each row's class index.
    """
    sample_count = y.shape[0]
    one_hot = np.zeros((sample_count, num_of_classes))
    # Set one cell per row: row i gets its 1 in column y[i].
    one_hot[np.arange(sample_count), y] = 1
    return one_hot

In [151]:
# One-hot encode the three sentiment classes for the categorical_crossentropy loss.
# NOTE(review): this rebinds training_sentiments from int labels to a one-hot
# matrix, so re-running only this cell would feed the matrix back in.
training_sentiments = get_multi_y_train(training_sentiments,3)
training_sentiments

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [152]:
# Train the 3-class sentiment classifier: softmax output, categorical cross-entropy, Adam.
sentiment_model = training(training_sentences, training_sentiments, 3, 'adam', 'categorical_crossentropy', "softmax")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [163]:
# Spot-check the sentiment model on a single hand-written sentence.
sentiment_predict(sentiment_model,"안녕하세요 정말 좋아요", tokenizer)

2

In [164]:
# Label every sentence with one batched predict call instead of one
# model.predict call per sentence, which pays Keras' per-call overhead for
# every row. make_X_data yields the encoding the model was trained on.
sentiment_inputs = make_X_data(sentences, tokenizer, len(sentences))
# argmax over the class axis; tolist() keeps the result a list of Python ints.
predicted_sentiments = np.argmax(sentiment_model.predict(sentiment_inputs), axis=1).tolist()
predicted_sentiments[:5]

[0, 0, 0, 2, 2]

In [165]:
# Train the binary objective classifier: single sigmoid unit, binary cross-entropy, RMSprop.
objective_model = training(training_sentences, training_objectives, 1, 'rmsprop', 'binary_crossentropy', "sigmoid")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [166]:
def objective_predict(model, new_sentence, tokenizer, maxlen=50, threshold=0.5):
    """Predict the binary objective label (0 or 1) for one raw sentence.

    The sentence is stripped of every character outside Hangul and space,
    split into stemmed morphemes with Okt, filtered against ``stopwords``,
    integer-encoded with ``tokenizer`` and padded to ``maxlen``.

    Args:
        model: Object whose ``predict`` returns a ``(1, 1)`` array holding a
            single score for the padded input.
        new_sentence: Raw sentence string.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        maxlen: Padded sequence length. Defaults to 50, the length used by
            ``make_X_data``; it has to match what the model was trained on.
        threshold: Score strictly above which the label is 1 (default 0.5).

    Returns:
        1 if the predicted score is greater than ``threshold``, otherwise 0.
    """
    cleaned = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', new_sentence)
    tokens = [word for word in okt.morphs(cleaned, stem=True) if word not in stopwords]
    # pad_sequences already returns an ndarray, so no extra np.array() is needed.
    padded = pad_sequences(tokenizer.texts_to_sequences([tokens]), maxlen=maxlen)
    # Pull the single score out explicitly instead of relying on the truth
    # value of a one-element array in the comparison below.
    score = float(model.predict(padded)[0][0])
    return 1 if score > threshold else 0

In [167]:
# Label every sentence with one batched predict call instead of one
# model.predict call per sentence. make_X_data yields the encoding the model
# was trained on.
objective_inputs = make_X_data(sentences, tokenizer, len(sentences))
objective_scores = objective_model.predict(objective_inputs)
# Same rule as objective_predict: a score above 0.5 maps to 1, otherwise 0.
predicted_objectives = (objective_scores[:, 0] > 0.5).astype(int).tolist()
predicted_objectives[:10]

[1, 1, 1, 1, 1, 0, 1, 1, 0, 1]

In [226]:
# Write the sentences together with the model-predicted sentiment and objective labels.
save_labeling("./data/new_labeling3.txt", sentences, predicted_sentiments, predicted_objectives)