In [1]:
# 필요 라이브러리 호출
import numpy as np
import pandas as pd
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터 불러오기
df = pd.read_csv('data/naver_comment.csv', encoding='utf8')
df.head()

Unnamed: 0,title,score,score_num,best_num,best,best_recomm,best_unrecomm,comment_num,comment,star
0,엑스트라 데이즈,9.62,101,6,아 근데 실제로 저런 기술이 있다면 쫌 싸할것같다.. 내가 죽어도 사람들은 나랑 똑...,126,0,42,버키 왜 하나만 블러예욬ㅋㅋㅋㅋㅋ\n이 웹툰은 개재밌을거라고 나한테 텔레파시가왔다\...,9.65
1,원수를 사랑하라,9.95,31400,15,근데 여주 대단하다...엄마랑 아빠가 저런 사람들인데 멀끔히 차려입고 면접보는 거면...,129323,3789,984,그림체 마음에 드네 \n그림체 미친\n면접관중에 존나 보기싫은얼굴이보이는데?\n누가...,9.92
2,우투리,8.5,159,4,"학원액션물 만화계의 거장 임재원님의 2번째 작품 우투리입니다.... 스토리, 그림체...",194,5,51,진짜 퀄리티 말도 안되는데 덴마 이후로 미완결 작품은 그냥 혐오스럽다\n우와아아아!...,9.2
3,롭플롭,9.78,290,5,"나 이 작가 작품 좋아하네.... 잭슨의 관, 데빌샷 보고 바로 들어옴 \n잭슨의 ...",410,2,64,작가님 어케 이렇게 그천이신거에요\n노랑머리 친구 명암때문인진 몰라도장례식 이후에 ...,9.91
4,약초마을 연쇄살초사건,9.73,10226,15,아니ㅋㅋㅋㅋㅋㅋ작가님ㅋㅋㅋㅋㅋㅋ이제 작물 쪽으로 길을 트신건가요ㅋㅋㅋㅋㅋㅋㅋㅋ\n팀...,62851,254,1463,ㅋㅋㅋㅋㅋㅋㅋㅋ\n조낸 무서운 이야기인데 약초란 사실과 그림체 때문에 잔혹함이 전달...,9.87


In [3]:
okt = Okt()
def okt_tokenizer(text):
    tokens_ko = okt.morphs(text, stem=True)
    return tokens_ko

import re

def hangul(text):
    return re.sub("[^가-힣ㄱ-하-ㅣ\\s]", "", text)

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def train_to_evaluate(X, y, file_name, max_words=10000, maxlen=30, embedding_dim=100, *args):
    
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X)
    X_seq = tokenizer.texts_to_sequences(X)

    X_pad = pad_sequences(X_seq, maxlen=maxlen)
    y = np.asarray(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(Flatten())
    for unit in args:
        model.add(Dense(unit, activation='relu'))
    model.add(Dense(1))
    
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    cp = ModelCheckpoint(filepath='data/'+file_name,
                         monitor='val_loss',
                         save_best_only=True)
    
    es = EarlyStopping(monitor='val_loss',
                       patience=15)
    
    
    hist = model.fit(X_train, y_train,
                     epochs=100, batch_size=32,
                     validation_data=(X_val, y_val),
                     callbacks=[cp, es],
                     verbose=0)
    
    
    model.load_weights('data/'+file_name)
    mse, mae = model.evaluate(X_val, y_val, verbose=0)
#     print('MAE :', np.round(mae, 4), 'RMSE :', np.round(np.sqrt(mse), 4))
        
    return model, hist, mse, mae

# 1. 베댓 + 최근 댓글

In [5]:
X1 = df['best'] + '\n' + df['comment']
y = df['star']

X1 = X1.apply(okt_tokenizer).values

In [6]:
max([len(x) for x in X1])

1340

In [7]:
min([len(x) for x in X1])

151

## IMDB로 연습할 때와 동일한 모델로 학습
- `num_words=10000`, `maxlen=30`, `embedding_dim=100`, `Dense(32)`
- 결과는 매우 좋지 않음(데이터 수가 적어서, 토큰화 된 단어의 수가 적어서, 영어와 달리 한국어는 적은 단어만 사용돼서?)

In [8]:
model1, hist1, mse1, mae1 = train_to_evaluate(X1, y, 'embedding1.h5', 10000, 30, 100, 32)

In [9]:
print('MAE :', np.round(mae1, 4), 'RMSE :', np.round(np.sqrt(mse1), 4))

MAE : 1.0712 RMSE : 1.355


## 파라미터 변경
- `num_words=30`, `maxlen=20`, `embedding_dim=30`, `Dense` 없음
- TF-IDF 변환 후 LGBM/XGBoost/RandomForest로 학습한 때와 비슷한 수준의 결과가 나옴

In [10]:
model1, hist1, mse1, mae1 = train_to_evaluate(X1, y, 'embedding1.h5', 30, 20, 30)

In [11]:
print('MAE :', np.round(mae1, 4), 'RMSE :', np.round(np.sqrt(mse1), 4))

MAE : 0.2633 RMSE : 0.4362


파라미터 값이 작아야 좋은 결과가 나오는 것 같기 때문에 최적 파라미터를 모든 경우의 수를 적용해서 찾아봄

In [12]:
from tqdm import tqdm

bestmodel = None
bestloss = 9999
bestmxwords = None
bestmxlen = None
bestembdim = None
bestunit = None
result_df = pd.DataFrame(columns=['maxwords', 'maxlen', 'embedding_dim', 'unit', 'mse', 'mae'])

for mxwords in tqdm(range(20, 200+1, 20)):
    for mxlen in [20, 30]:
        for embdim in range(25, 100+1, 25):
            for unit in [5, 16, 32, 48, 64]:
                model = train_to_evaluate(X1, y, 'embedding1.h5', mxwords, mxlen, embdim, unit)
                result_df.loc[len(result_df)] = [mxwords, mxlen, embdim, unit, model[2], model[3]]
                if bestloss > model[2]:
                    bestmodel = model
                    bestloss = model[2]
                    bestmxwords = mxwords
                    bestmxlen = mxlen
                    bestembdim = embdim
                    bestunit = unit

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [22:17<00:00, 133.74s/it]


In [13]:
print(f'MAE: {bestmodel[3]:.4f}, RMSE: {np.sqrt(bestmodel[2]):.4f}')
print(f'--bestmxwords: {bestmxwords}', f'--bestmxlen: {bestmxlen}', f'--bestembdim: {bestembdim}', f'--bestunit: {bestunit}', sep='\t')

MAE: 0.2536, RMSE: 0.4329
--bestmxwords: 20	--bestmxlen: 20	--bestembdim: 25	--bestunit: 32


## 특문 제거

In [14]:
X1_a = df['best'].apply(hangul)
X1_a = X1_a.apply(okt_tokenizer).values

In [15]:
bestmodel = None
bestloss = 9999
bestmxwords = None
bestmxlen = None
bestembdim = None
bestunit = None
result_df = pd.DataFrame(columns=['maxwords', 'maxlen', 'embedding_dim', 'unit', 'mse', 'mae'])

for mxwords in tqdm(range(20, 200+1, 20)):
    for mxlen in [20, 30]:
        for embdim in range(25, 100+1, 25):
            for unit in [5, 16, 32, 48, 64]:
                model = train_to_evaluate(X1_a, y, 'embedding1.h5', mxwords, mxlen, embdim, unit)
                result_df.loc[len(result_df)] = [mxwords, mxlen, embdim, unit, model[2], model[3]]
                if bestloss > model[2]:
                    bestmodel = model
                    bestloss = model[2]
                    bestmxwords = mxwords
                    bestmxlen = mxlen
                    bestembdim = embdim
                    bestunit = unit

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [16:26<00:00, 98.69s/it]


In [16]:
print(f'MAE: {bestmodel[3]:.4f}, RMSE: {np.sqrt(bestmodel[2]):.4f}')
print(f'--bestmxwords: {bestmxwords}', f'--bestmxlen: {bestmxlen}', f'--bestembdim: {bestembdim}', f'--bestunit: {bestunit}', sep='\t')

MAE: 0.2595, RMSE: 0.4420
--bestmxwords: 20	--bestmxlen: 30	--bestembdim: 75	--bestunit: 32


# 2. 베댓만

## 2-1. Dense 층 없이

In [17]:
X2 = df['best']

X2 = X2.apply(okt_tokenizer).values

In [18]:
bestmodel = None
bestloss = 9999
bestmxwords = None
bestmxlen = None
bestembdim = None
bestunit = None
result_df = pd.DataFrame(columns=['maxwords', 'maxlen', 'embedding_dim', 'mse', 'mae'])

for mxwords in tqdm(range(10, 200+1, 10)):
    for mxlen in [20, 25, 30]:
        for embdim in range(10, 100+1, 5):
            model = train_to_evaluate(X1, y, 'embedding2.h5', mxwords, mxlen, embdim)
            result_df.loc[len(result_df)] = [mxwords, mxlen, embdim, model[2], model[3]]
            if bestloss > model[2]:
                bestmodel = model
                bestloss = model[2]
                bestmxwords = mxwords
                bestmxlen = mxlen
                bestembdim = embdim

100%|███████████████████████████████████████████████████████████████████████████████| 20/20 [1:17:43<00:00, 233.17s/it]


In [19]:
print(f'MAE: {bestmodel[3]:.4f}, RMSE: {np.sqrt(bestmodel[2]):.4f}')
print(f'--bestmxwords: {bestmxwords}', f'--bestmxlen: {bestmxlen}', f'--bestembdim: {bestembdim}', sep='\t')

MAE: 0.2563, RMSE: 0.4193
--bestmxwords: 30	--bestmxlen: 20	--bestembdim: 15


In [20]:
result_df.sort_values(by=['mse']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,mse,mae
115,30.0,20.0,15.0,0.175843,0.256312
132,30.0,20.0,100.0,0.180809,0.269051
134,30.0,25.0,15.0,0.181772,0.260827
125,30.0,20.0,65.0,0.181852,0.264862
121,30.0,20.0,45.0,0.182059,0.260396
145,30.0,25.0,70.0,0.1825,0.26311
133,30.0,25.0,10.0,0.182667,0.263757
123,30.0,20.0,55.0,0.182885,0.261376
150,30.0,25.0,95.0,0.182979,0.26517
130,30.0,20.0,90.0,0.183198,0.270388


In [21]:
result_df.sort_values(by=['mae']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,mse,mae
82,20.0,25.0,40.0,0.191705,0.243128
88,20.0,25.0,70.0,0.189174,0.245186
89,20.0,25.0,75.0,0.187568,0.246905
99,20.0,30.0,30.0,0.191793,0.246933
78,20.0,25.0,20.0,0.19249,0.247627
85,20.0,25.0,55.0,0.196138,0.247802
161,30.0,30.0,55.0,0.188838,0.248652
83,20.0,25.0,45.0,0.19689,0.249093
87,20.0,25.0,65.0,0.192553,0.249744
86,20.0,25.0,60.0,0.19411,0.251012


## 특문 제거

In [22]:
X2_a = df['best'].apply(hangul)
X2_a = X2_a.apply(okt_tokenizer).values

In [23]:
from tqdm.auto import tqdm

bestmodel = None
bestloss = 9999
bestmxwords = None
bestmxlen = None
bestembdim = None
bestunit = None
result_df = pd.DataFrame(columns=['maxwords', 'maxlen', 'embedding_dim', 'mse', 'mae'])

for mxwords in tqdm(range(10, 200+1, 10)):
    for mxlen in [20, 25, 30]:
        for embdim in range(10, 100+1, 5):
            model = train_to_evaluate(X2, y, 'embedding2a.h5', mxwords, mxlen, embdim)
            result_df.loc[len(result_df)] = [mxwords, mxlen, embdim, model[2], model[3]]
            if bestloss > model[2]:
                bestmodel = model
                bestloss = model[2]
                bestmxwords = mxwords
                bestmxlen = mxlen
                bestembdim = embdim

  0%|          | 0/20 [00:00<?, ?it/s]

In [24]:
print(f'MAE: {bestmodel[3]:.4f}, RMSE: {np.sqrt(bestmodel[2]):.4f}')
print(f'--bestmxwords: {bestmxwords}', f'--bestmxlen: {bestmxlen}', f'--bestembdim: {bestembdim}', sep='\t')

MAE: 0.2688, RMSE: 0.4425
--bestmxwords: 20	--bestmxlen: 20	--bestembdim: 95


In [27]:
result_df.sort_values(by=['mse']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,mse,mae
74,20.0,20.0,95.0,0.19584,0.268801
142,30.0,25.0,55.0,0.195933,0.261722
149,30.0,25.0,90.0,0.197089,0.265945
135,30.0,25.0,20.0,0.197325,0.263685
70,20.0,20.0,75.0,0.197533,0.27528
66,20.0,20.0,55.0,0.197759,0.272899
68,20.0,20.0,65.0,0.19849,0.26398
112,20.0,30.0,95.0,0.199313,0.270171
72,20.0,20.0,85.0,0.19939,0.265917
133,30.0,25.0,10.0,0.200083,0.263669


In [28]:
result_df.sort_values(by=['mae']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,mse,mae
65,20.0,20.0,50.0,0.200875,0.259378
64,20.0,20.0,45.0,0.201835,0.259566
155,30.0,30.0,25.0,0.205375,0.259748
33,10.0,25.0,80.0,0.210077,0.260143
152,30.0,30.0,10.0,0.208282,0.260736
142,30.0,25.0,55.0,0.195933,0.261722
81,20.0,25.0,35.0,0.205634,0.26234
133,30.0,25.0,10.0,0.200083,0.263669
164,30.0,30.0,70.0,0.207067,0.263674
135,30.0,25.0,20.0,0.197325,0.263685


## 2-2. Dense 층 추가

In [29]:
bestmodel = None
bestloss = 9999
bestmxwords = None
bestmxlen = None
bestembdim = None
bestunit = None
result_df = pd.DataFrame(columns=['maxwords', 'maxlen', 'embedding_dim', 'unit', 'mse', 'mae'])

for mxwords in tqdm(range(20, 60+1, 10)):
    for mxlen in [20, 30]:
        for embdim in range(10, 100+1, 5):
            for unit in [16, 32, 48, 64]:
                model = train_to_evaluate(X2, y, 'embedding2b.h5', mxwords, mxlen, embdim, unit)
                result_df.loc[len(result_df)] = [mxwords, mxlen, embdim, unit, model[2], model[3]]
                if bestloss > model[2]:
                    bestmodel = model
                    bestloss = model[2]
                    bestmxwords = mxwords
                    bestmxlen = mxlen
                    bestembdim = embdim
                    bestunit = unit

  0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
print(f'MAE: {bestmodel[3]:.4f}, RMSE: {np.sqrt(bestmodel[2]):.4f}')
print(f'--bestmxwords: {bestmxwords}', f'--bestmxlen: {bestmxlen}', f'--bestembdim: {bestembdim}', f'--bestunit: {bestunit}', sep='\t')

MAE: 0.2623, RMSE: 0.4378
--bestmxwords: 20	--bestmxlen: 20	--bestembdim: 55	--bestunit: 64


In [31]:
result_df.sort_values(by=['mse']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,unit,mse,mae
39,20.0,20.0,55.0,64.0,0.19165,0.262344
25,20.0,20.0,40.0,32.0,0.192019,0.267659
111,20.0,30.0,50.0,64.0,0.192678,0.279257
32,20.0,20.0,50.0,16.0,0.193773,0.272716
34,20.0,20.0,50.0,48.0,0.194054,0.270079
41,20.0,20.0,60.0,32.0,0.194402,0.279883
142,20.0,30.0,90.0,48.0,0.194666,0.283536
108,20.0,30.0,50.0,16.0,0.194907,0.288152
53,20.0,20.0,75.0,32.0,0.195314,0.268854
15,20.0,20.0,25.0,64.0,0.195502,0.277237


In [32]:
result_df.sort_values(by=['mae']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,unit,mse,mae
75,20.0,20.0,100.0,64.0,0.198834,0.258456
19,20.0,20.0,30.0,64.0,0.202728,0.259561
55,20.0,20.0,75.0,64.0,0.209863,0.259644
5,20.0,20.0,15.0,32.0,0.197696,0.26026
66,20.0,20.0,90.0,48.0,0.204186,0.260663
8,20.0,20.0,20.0,16.0,0.196922,0.261342
39,20.0,20.0,55.0,64.0,0.19165,0.262344
49,20.0,20.0,70.0,32.0,0.202179,0.262471
156,30.0,20.0,15.0,16.0,0.205145,0.262776
144,20.0,30.0,95.0,16.0,0.203727,0.262861


## 특문 제거

In [33]:
bestmodel = None
bestloss = 9999
bestmxwords = None
bestmxlen = None
bestembdim = None
bestunit = None
result_df = pd.DataFrame(columns=['maxwords', 'maxlen', 'embedding_dim', 'unit', 'mse', 'mae'])

for mxwords in tqdm(range(20, 60+1, 10)):
    for mxlen in [10, 15, 20, 25, 30]:
        for embdim in range(10, 100+1, 5):
            for unit in [16, 32, 48, 64]:
                model = train_to_evaluate(X2_a, y, 'embedding2c.h5', mxwords, mxlen, embdim, unit)
                result_df.loc[len(result_df)] = [mxwords, mxlen, embdim, unit, model[2], model[3]]
                if bestloss > model[2]:
                    bestmodel = model
                    bestloss = model[2]
                    bestmxwords = mxwords
                    bestmxlen = mxlen
                    bestembdim = embdim
                    bestunit = unit

  0%|          | 0/5 [00:00<?, ?it/s]

In [34]:
print(f'MAE: {bestmodel[3]:.4f}, RMSE: {np.sqrt(bestmodel[2]):.4f}')
print(f'--bestmxwords: {bestmxwords}', f'--bestmxlen: {bestmxlen}', f'--bestembdim: {bestembdim}', f'--bestunit: {bestunit}', sep='\t')

MAE: 0.2624, RMSE: 0.4286
--bestmxwords: 30	--bestmxlen: 20	--bestembdim: 90	--bestunit: 32


In [35]:
result_df.sort_values(by=['mse']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,unit,mse,mae
597,30.0,20.0,90.0,32.0,0.183688,0.262411
548,30.0,20.0,30.0,16.0,0.190082,0.253869
589,30.0,20.0,80.0,32.0,0.190678,0.270763
546,30.0,20.0,25.0,48.0,0.191789,0.260937
538,30.0,20.0,15.0,48.0,0.191804,0.262275
541,30.0,20.0,20.0,32.0,0.192657,0.257827
549,30.0,20.0,30.0,32.0,0.19272,0.258495
570,30.0,20.0,55.0,48.0,0.193526,0.269716
1037,40.0,25.0,70.0,32.0,0.193974,0.278957
599,30.0,20.0,90.0,64.0,0.19401,0.253719


In [36]:
result_df.sort_values(by=['mae']).head(10)

Unnamed: 0,maxwords,maxlen,embedding_dim,unit,mse,mae
575,30.0,20.0,60.0,64.0,0.197721,0.24855
571,30.0,20.0,55.0,64.0,0.198871,0.251278
307,20.0,30.0,10.0,64.0,0.197254,0.253601
599,30.0,20.0,90.0,64.0,0.19401,0.253719
548,30.0,20.0,30.0,16.0,0.190082,0.253869
328,20.0,30.0,40.0,16.0,0.202181,0.254231
170,20.0,20.0,30.0,48.0,0.202592,0.255135
348,20.0,30.0,65.0,16.0,0.204938,0.255418
325,20.0,30.0,35.0,32.0,0.205945,0.255817
308,20.0,30.0,15.0,16.0,0.204639,0.256404


# 테스트

In [6]:
'''
MAE: 0.2563, RMSE: 0.4193
--bestmxwords: 30	--bestmxlen: 20	--bestembdim: 15
'''
max_words = 30
maxlen = 20

from tensorflow.keras.models import load_model

X = df['best']

X = X.apply(okt_tokenizer).values

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

In [15]:
y = df['star']

model = train_to_evaluate(X, y, 'embedding2.h5', 30, 20, 15)

In [21]:
X_seq = tokenizer.texts_to_sequences([okt_tokenizer('와 진짜 재밌음 작가님 기대하고 있을게요')])

X_pad = pad_sequences(X_seq, maxlen=maxlen)

In [22]:
model = load_model('data/embedding2.h5')
pred = model.predict(X_pad)

In [23]:
pred

array([[9.32203]], dtype=float32)

In [24]:
X_seq = tokenizer.texts_to_sequences([okt_tokenizer('와 진짜 재미없네 이 작가 작품 다 별로임')])

X_pad = pad_sequences(X_seq, maxlen=maxlen)
pred = model.predict(X_pad)
pred

array([[9.292497]], dtype=float32)

In [25]:
X_seq = tokenizer.texts_to_sequences([okt_tokenizer('이거 뭐냐 왤케 잼씀 ㅋㅋㅋ')])

X_pad = pad_sequences(X_seq, maxlen=maxlen)
pred = model.predict(X_pad)
pred

array([[9.178808]], dtype=float32)

성능은 괜찮아 보여도 실제로 넣어보면 아닌 것 같다. 아무래도 중앙값이 9.92점으로 높은 점수에 편중되어 있는 y값이라 그런 듯 하다.  
차라리 점수 예측 대신에 임의로 점수를 카테고리화 하여 분류 예측을 해보는 것이 나을 수도 있을 것 같다.