In [3]:
import pandas as pd
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re

In [2]:
train_df = pd.read_csv("./data/ratings_train.txt", sep = "\t")
test_df = pd.read_csv("./data/ratings_test.txt", sep = "\t")

In [3]:
train_df.shape, test_df.shape

((150000, 3), (50000, 3))

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [5]:
# 결측치 확인
train_df.isna().sum()

id          0
document    5
label       0
dtype: int64

In [6]:
test_df.isna().sum()

id          0
document    3
label       0
dtype: int64

In [7]:
train_df = train_df.dropna(subset = ["document"])
test_df = test_df.dropna(subset = ["document"])

In [8]:
train_df.shape, test_df.shape

((149995, 3), (49997, 3))

In [9]:
# 종속변수 확인
np.unique(train_df["label"], return_counts = True)

(array([0, 1], dtype=int64), array([75170, 74825], dtype=int64))

In [10]:
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


# 데이터 전처리

In [11]:
# 한글 이외의 문자들 제거
train_df["document"] = train_df["document"].map(lambda x: re.sub("[^ㄱ-ㅎ ㅏ-ㅣ 가-힣 ]", "", x))
test_df["document"] = test_df["document"].map(lambda x: re.sub("[^ㄱ-ㅎ ㅏ-ㅣ 가-힣 ]", "", x))

In [12]:
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [13]:
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,,0
2,8544678,뭐야 이 평점들은 나쁘진 않지만 점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
4,6723715,만 아니었어도 별 다섯 개 줬을텐데 왜 로 나와서 제 심기를 불편하게 하죠,0


In [14]:
train_df = train_df[train_df["document"].map(lambda x: len(x.strip()) >= 1)]
test_df = test_df[test_df["document"].map(lambda x: len(x.strip()) >= 1)]

In [15]:
train_df.shape, test_df.shape

((148740, 3), (49575, 3))

In [16]:
# 중복 데이터 확인
train_df[train_df["document"].duplicated(keep = False)].sort_values("document")

Unnamed: 0,id,document,label
46599,9682597,그리고 내 감정을 불러 일으켰다,1
43436,9582856,그리고 내 감정을 불러 일으켰다,1
123713,9582855,그리고 내 감정을 불러 일으켰다,1
93364,171409,가입 추천바람,1
138373,171407,가입 추천바람,1
...,...,...,...
57831,3906478,흥미진진,1
8446,5158304,힐러리 더프의 매력에 빠지다,1
72688,5153363,힐러리 더프의 매력에 빠지다,1
26889,7971814,힘들다,0


In [17]:
# 중복제거
train_df = train_df.drop_duplicates(subset = ["document"], keep ="first")
test_df = test_df.drop_duplicates(subset = ["document"], keep ="first")

In [18]:
train_df.shape, test_df.shape

((143660, 3), (48403, 3))

# 토큰화

In [19]:
okt = Okt()

In [20]:
okt.morphs("아 더빙 진짜 짜증나네요 목소리", stem = False)

['아', '더빙', '진짜', '짜증나네요', '목소리']

In [21]:
okt.morphs("아 더빙 진짜 짜증나네요 목소리", stem = True)

['아', '더빙', '진짜', '짜증나다', '목소리']

In [22]:
okt.nouns("아 더빙 진짜 짜증나네요 목소리")

['더빙', '진짜', '목소리']

In [23]:
okt.pos("아 더빙 진짜 짜증나네요 목소리")

[('아', 'Exclamation'),
 ('더빙', 'Noun'),
 ('진짜', 'Noun'),
 ('짜증나네요', 'Adjective'),
 ('목소리', 'Noun')]

In [24]:
%%time
train_df["token"] = train_df["document"].map(lambda x: okt.morphs(x, stem = True))
test_df["token"] = test_df["document"].map(lambda x: okt.morphs(x, stem = True))

CPU times: total: 22min 43s
Wall time: 21min 58s


In [27]:
train_df.head(20)

Unnamed: 0,id,document,label,token
0,9976970,아 더빙 진짜 짜증나네요 목소리,0,"[아, 더빙, 진짜, 짜증나다, 목소리]"
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1,"[흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 조차, 가볍다, 않다]"
2,10265843,너무재밓었다그래서보는것을추천한다,0,"[너, 무재, 밓었, 다그, 래서, 보다, 추천, 한, 다]"
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0,"[교도소, 이야기, 구먼, 솔직하다, 재미, 는, 없다, 평점, 조정]"
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1,"[사이, 몬페, 그, 의, 익살스럽다, 연기, 가, 돋보이다, 영화, 스파이더맨, ..."
5,5403919,막 걸음마 뗀 세부터 초등학교 학년생인 살용영화ㅋㅋㅋ별반개도 아까움,0,"[막, 걸음, 마, 떼다, 세, 부터, 초등학교, 학년, 생인, 살다, 영화, ㅋㅋ..."
6,7797314,원작의 긴장감을 제대로 살려내지못했다,0,"[원작, 의, 긴장감, 을, 제대로, 살리다, 하다]"
7,9443947,별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지정말 발로해도 그것보단 낫...,0,"[별, 반개, 도, 아깝다, 욕, 나오다, 이응경, 길용우, 연, 기, 생활, 이,..."
8,7156791,액션이 없는데도 재미 있는 몇안되는 영화,1,"[액션, 이, 없다, 재미, 있다, 몇, 안되다, 영화]"
9,5912145,왜케 평점이 낮은건데 꽤 볼만한데 헐리우드식 화려함에만 너무 길들여져 있나,1,"[왜케, 평점, 이, 낮다, 꽤, 볼, 만, 한, 데, 헐리우드, 식, 화려하다, ..."


In [26]:
train_df.to_csv("nsmc_ratings_train_pre.csv", index = False)
test_df.to_csv("nsmc_ratings_test_pre.csv", index = False)

In [4]:
train_df = pd.read_csv("./nsmc_ratings_train_pre.csv")
test_df = pd.read_csv("./nsmc_ratings_test_pre.csv")

In [5]:
type(train_df.loc[0, "token"])

str

In [6]:
# 위 타입이 문자열이라면 리스트로 바꾸는 법
train_df["token"] = train_df["token"].map(lambda x: eval(x))
test_df["token"] = test_df["token"].map(lambda x: eval(x))

In [7]:
train_df.shape, test_df.shape

((143660, 4), (48403, 4))

train_df["token"] = train_df["token"].map(lambda x: [i for i in x if len(i) > 1])
test_df["token"] = test_df["token"].map(lambda x: [i for i in x if len(i) > 1])

In [8]:
train_df.shape, test_df.shape

((143660, 4), (48403, 4))

In [9]:
train_df.head()

Unnamed: 0,id,document,label,token
0,9976970,아 더빙 진짜 짜증나네요 목소리,0,"[아, 더빙, 진짜, 짜증나다, 목소리]"
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1,"[흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 조차, 가볍다, 않다]"
2,10265843,너무재밓었다그래서보는것을추천한다,0,"[너, 무재, 밓었, 다그, 래서, 보다, 추천, 한, 다]"
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0,"[교도소, 이야기, 구먼, 솔직하다, 재미, 는, 없다, 평점, 조정]"
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1,"[사이, 몬페, 그, 의, 익살스럽다, 연기, 가, 돋보이다, 영화, 스파이더맨, ..."


In [10]:
train_df = train_df[train_df["token"]. map(lambda x: len(x) > 0)]
test_df = test_df[test_df["token"]. map(lambda x: len(x) > 0)]

In [11]:
train_df.shape, test_df.shape

((143660, 4), (48403, 4))

# 정수인코딩

In [12]:
tokenizer = Tokenizer()

In [13]:
# 단어 집합 생성
# 등장 빈도 수가 높은 순서대로 정수값 부여
tokenizer.fit_on_texts(train_df["token"])

In [14]:
# 단어 집합
tokenizer.word_index

{'이': 1,
 '영화': 2,
 '보다': 3,
 '하다': 4,
 '의': 5,
 '에': 6,
 '가': 7,
 '을': 8,
 '도': 9,
 '들': 10,
 '는': 11,
 '를': 12,
 '은': 13,
 '없다': 14,
 '이다': 15,
 '있다': 16,
 '좋다': 17,
 '너무': 18,
 '다': 19,
 '정말': 20,
 '한': 21,
 '되다': 22,
 '적': 23,
 '만': 24,
 '재밌다': 25,
 '같다': 26,
 '진짜': 27,
 '으로': 28,
 '로': 29,
 '아니다': 30,
 '않다': 31,
 '점': 32,
 '에서': 33,
 '만들다': 34,
 '과': 35,
 '나오다': 36,
 '연기': 37,
 '것': 38,
 '평점': 39,
 '내': 40,
 '최고': 41,
 '그': 42,
 '나': 43,
 '안': 44,
 '인': 45,
 '스토리': 46,
 '생각': 47,
 '못': 48,
 '왜': 49,
 '드라마': 50,
 '게': 51,
 '사람': 52,
 '감동': 53,
 '보고': 54,
 '이렇다': 55,
 '고': 56,
 '말': 57,
 '아깝다': 58,
 '더': 59,
 '배우': 60,
 '때': 61,
 'ㅋㅋ': 62,
 '와': 63,
 '아': 64,
 '감독': 65,
 '거': 66,
 '그냥': 67,
 '요': 68,
 '재미있다': 69,
 '재미': 70,
 '시간': 71,
 '내용': 72,
 '까지': 73,
 '뭐': 74,
 '중': 75,
 '주다': 76,
 '좀': 77,
 '자다': 78,
 '하고': 79,
 '지루하다': 80,
 '수': 81,
 '재미없다': 82,
 '네': 83,
 '쓰레기': 84,
 '모르다': 85,
 '가다': 86,
 '들다': 87,
 '그렇다': 88,
 '싶다': 89,
 '지': 90,
 '작품': 91,
 '사랑': 92,
 '알다': 93,
 '하나': 94

In [15]:
# 단어 등장 수
tokenizer.word_counts

OrderedDict([('아', 4121),
             ('더빙', 572),
             ('진짜', 8288),
             ('짜증나다', 1002),
             ('목소리', 374),
             ('흠', 246),
             ('포스터', 572),
             ('보고', 4653),
             ('초딩', 422),
             ('영화', 50172),
             ('줄', 1240),
             ('오버', 142),
             ('연기', 6326),
             ('조차', 242),
             ('가볍다', 360),
             ('않다', 7718),
             ('너', 670),
             ('무재', 69),
             ('밓었', 1),
             ('다그', 75),
             ('래서', 20),
             ('보다', 40991),
             ('추천', 1180),
             ('한', 9615),
             ('다', 10077),
             ('교도소', 16),
             ('이야기', 2171),
             ('구먼', 11),
             ('솔직하다', 1199),
             ('재미', 3854),
             ('는', 16911),
             ('없다', 15519),
             ('평점', 6245),
             ('조정', 40),
             ('사이', 222),
             ('몬페', 2),
             ('그', 5667),
             ('의', 3073

In [16]:
total_cnt = len(tokenizer.word_index)
total_cnt

43770

- 사용 단어 수를 지정하여 토큰화
    - tokenizer = Tokenizer(num_words = 원하는 단어 수)
    - tokenizer.fit_on_texts(train_df["token"])

In [17]:
x_train = tokenizer.texts_to_sequences(train_df["token"])
x_test = tokenizer.texts_to_sequences(test_df["token"])

In [18]:
y_train = train_df["label"]
y_test = test_df["label"]

In [19]:
a = []
for i in range(len(x_train)):
    a.append(len(x_train[i]))
np.mean(a), np.median(a), np.max(a), np.min(a)

(13.234950577753027, 10.0, 78, 1)

In [20]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify = y_train, test_size = 0.2, random_state = 0)

In [24]:
train_seq = pad_sequences(x_train, maxlen = 60)
val_seq = pad_sequences(x_val, maxlen = 60)
test_seq = pad_sequences(x_test, maxlen = 60)

In [26]:
model7 = keras.Sequential()
model7.add(keras.layers.Embedding(4377, 256, input_shape = (60,)))
model7.add(keras.layers.Dropout(0.3))

model7.add(keras.layers.GRU(256, dropout = 0.3))

model7.add(keras.layers.Dense(10, activation = "relu"))
model7.add(keras.layers.BatchNormalization())
model7.add(keras.layers.Dense(1, activation = "sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate = 0.0001)
model7.compile(optimizer = rmsprop, loss = "binary_crossentropy", metrics = ["accuracy"])

early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history7 = model7.fit(train_seq, y_train, batch_size = 64, epochs = 1000,
                    validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 26ms/step - accuracy: 0.6801 - loss: 0.5742 - val_accuracy: 0.8027 - val_loss: 0.4225
Epoch 2/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 24ms/step - accuracy: 0.7964 - loss: 0.4344 - val_accuracy: 0.8120 - val_loss: 0.4048
Epoch 3/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 25ms/step - accuracy: 0.8111 - loss: 0.4102 - val_accuracy: 0.8180 - val_loss: 0.3990
Epoch 4/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 25ms/step - accuracy: 0.8155 - loss: 0.4011 - val_accuracy: 0.8159 - val_loss: 0.3982
Epoch 5/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 24ms/step - accuracy: 0.8208 - loss: 0.3903 - val_accuracy: 0.8206 - val_loss: 0.3930
Epoch 6/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 25ms/step - accuracy: 0.8246 - loss: 0.3868 - val_accuracy: 0.8238 - val_loss:

In [27]:
model7.evaluate(test_seq, y_test)

[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.8382 - loss: 0.3655


[0.368745356798172, 0.8358779549598694]

In [28]:
train_seq = pad_sequences(x_train, maxlen = 65)
val_seq = pad_sequences(x_val, maxlen = 65)
test_seq = pad_sequences(x_test, maxlen = 65)

In [29]:
model8 = keras.Sequential()
model8.add(keras.layers.Embedding(4377, 256, input_shape = (65,)))
model8.add(keras.layers.Dropout(0.4))

model8.add(keras.layers.GRU(256, dropout = 0.4))

model8.add(keras.layers.Dense(10, activation = "relu"))
model8.add(keras.layers.BatchNormalization())
model8.add(keras.layers.Dense(1, activation = "sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate = 0.0001)
model8.compile(optimizer = rmsprop, loss = "binary_crossentropy", metrics = ["accuracy"])

early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history8 = model8.fit(train_seq, y_train, batch_size = 32, epochs = 1000,
                    validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m3592/3592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 59ms/step - accuracy: 0.6552 - loss: 0.6001 - val_accuracy: 0.7666 - val_loss: 0.4687
Epoch 2/1000
[1m3592/3592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 59ms/step - accuracy: 0.7936 - loss: 0.4400 - val_accuracy: 0.8005 - val_loss: 0.4203
Epoch 3/1000
[1m3592/3592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 64ms/step - accuracy: 0.8101 - loss: 0.4134 - val_accuracy: 0.8222 - val_loss: 0.3901
Epoch 4/1000
[1m3592/3592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 51ms/step - accuracy: 0.8186 - loss: 0.3998 - val_accuracy: 0.8197 - val_loss: 0.3912
Epoch 5/1000
[1m3592/3592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 51ms/step - accuracy: 0.8250 - loss: 0.3916 - val_accuracy: 0.8281 - val_loss: 0.3815
Epoch 6/1000
[1m3592/3592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 52ms/step - accuracy: 0.8286 - loss: 0.3850 - val_accuracy: 0.8303 - val

[1m3592/3592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 60ms/step - accuracy: 0.8767 - loss: 0.2957 - val_accuracy: 0.8505 - val_loss: 0.3460


In [30]:
model8.evaluate(test_seq, y_test)

[1m1513/1513[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 17ms/step - accuracy: 0.8520 - loss: 0.3404


[0.3422556221485138, 0.8503398299217224]

In [None]:
model9 = keras.Sequential()
model9.add(keras.layers.Embedding(4377, 512, input_shape = (65,)))
model9.add(keras.layers.Dropout(0.4))

model9.add(keras.layers.GRU(512, dropout = 0.4))

model9.add(keras.layers.Dense(10, activation = "relu"))
model9.add(keras.layers.BatchNormalization())
model9.add(keras.layers.Dense(1, activation = "sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate = 0.0001)
model9.compile(optimizer = rmsprop, loss = "binary_crossentropy", metrics = ["accuracy"])

early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history9 = model9.fit(train_seq, y_train, batch_size = 64, epochs = 1000,
                    validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m762s[0m 423ms/step - accuracy: 0.6944 - loss: 0.5640 - val_accuracy: 0.7999 - val_loss: 0.4217
Epoch 2/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m741s[0m 413ms/step - accuracy: 0.7992 - loss: 0.4246 - val_accuracy: 0.8203 - val_loss: 0.3915
Epoch 3/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m753s[0m 419ms/step - accuracy: 0.8207 - loss: 0.3920 - val_accuracy: 0.8255 - val_loss: 0.3818
Epoch 4/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m727s[0m 405ms/step - accuracy: 0.8249 - loss: 0.3863 - val_accuracy: 0.8288 - val_loss: 0.3756
Epoch 5/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m777s[0m 432ms/step - accuracy: 0.8314 - loss: 0.3757 - val_accuracy: 0.8303 - val_loss: 0.3741
Epoch 6/1000
[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m779s[0m 434ms/step - accuracy: 0.8361 - loss: 0.3677 - val_accuracy: 0.8363

In [None]:
model9.evaluate(test_seq, y_test)

In [60]:
model = keras.Sequential()
model.add(keras.layers.Embedding(4377, 128, input_shape = (13,)))
model.add(keras.layers.Dropout(0.3))

model.add(keras.layers.GRU(128, dropout = 0.4))

model.add(keras.layers.Dense(10, activation = "relu"))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dense(1, activation = "sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate = 0.001)
model.compile(optimizer = rmsprop, loss = "binary_crossentropy", metrics = ["accuracy"])

early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history = model.fit(train_seq, y_train, batch_size = 32, epochs = 1000,
                    validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.7525 - loss: 0.4932 - val_accuracy: 0.8086 - val_loss: 0.4161
Epoch 2/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8154 - loss: 0.4068 - val_accuracy: 0.8191 - val_loss: 0.3961
Epoch 3/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8244 - loss: 0.3899 - val_accuracy: 0.8216 - val_loss: 0.3879
Epoch 4/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8339 - loss: 0.3732 - val_accuracy: 0.8253 - val_loss: 0.3834
Epoch 5/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8368 - loss: 0.3697 - val_accuracy: 0.8270 - val_loss: 0.3816
Epoch 6/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.8415 - loss: 0.3614 - val_accuracy: 0.8276 - val_loss: 0.377

In [61]:
model.evaluate(test_seq, y_test)

[1m1509/1509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8309 - loss: 0.3697


[0.3741733729839325, 0.8292374610900879]

In [64]:
model3 = keras.Sequential()
model3.add(keras.layers.Embedding(4212, 256, input_shape = (13,)))
model3.add(keras.layers.Dropout(0.4))

model3.add(keras.layers.LSTM(256, dropout = 0.4))

model3.add(keras.layers.Dense(10, activation = "relu"))
model3.add(keras.layers.BatchNormalization())
model3.add(keras.layers.Dense(1, activation = "sigmoid"))

adam = keras.optimizers.Adam(learning_rate =0.00005)
model3.compile(loss = "binary_crossentropy", optimizer = adam, metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 15, restore_best_weights = True)

history = model3.fit(train_seq, y_train, batch_size = 64, epochs = 1000, validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m1792/1792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 13ms/step - accuracy: 0.6504 - loss: 0.6024 - val_accuracy: 0.8064 - val_loss: 0.4250
Epoch 2/1000
[1m1792/1792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - accuracy: 0.7832 - loss: 0.4471 - val_accuracy: 0.8166 - val_loss: 0.4034
Epoch 3/1000
[1m1792/1792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - accuracy: 0.8043 - loss: 0.4173 - val_accuracy: 0.8188 - val_loss: 0.3952
Epoch 4/1000
[1m1792/1792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - accuracy: 0.8165 - loss: 0.3993 - val_accuracy: 0.8205 - val_loss: 0.3920
Epoch 5/1000
[1m1792/1792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - accuracy: 0.8252 - loss: 0.3901 - val_accuracy: 0.8191 - val_loss: 0.3913
Epoch 6/1000
[1m1792/1792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - accuracy: 0.8263 - loss: 0.3826 - val_accuracy: 0.8216 - val_loss:

In [65]:
model3.evaluate(test_seq, y_test)

[1m1509/1509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8246 - loss: 0.3806


[0.3834826648235321, 0.8235806226730347]

In [66]:
model4 = keras.Sequential()
model4.add(keras.layers.Embedding(8424, 128, input_shape = (13,)))
model4.add(keras.layers.Dropout(0.4))

model4.add(keras.layers.LSTM(256, dropout = 0.4))

model4.add(keras.layers.Dense(10, activation = "relu"))
model4.add(keras.layers.BatchNormalization())
model4.add(keras.layers.Dense(1, activation = "sigmoid"))

adam = keras.optimizers.Adam(learning_rate =0.00005)
model4.compile(loss = "binary_crossentropy", optimizer = adam, metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history = model4.fit(train_seq, y_train, batch_size = 16, epochs = 1000, validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m7166/7166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 6ms/step - accuracy: 0.6685 - loss: 0.5888 - val_accuracy: 0.8085 - val_loss: 0.4177
Epoch 2/1000
[1m7166/7166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 6ms/step - accuracy: 0.7915 - loss: 0.4444 - val_accuracy: 0.8194 - val_loss: 0.3941
Epoch 3/1000
[1m7166/7166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 6ms/step - accuracy: 0.8126 - loss: 0.4137 - val_accuracy: 0.8173 - val_loss: 0.3947
Epoch 4/1000
[1m7166/7166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 6ms/step - accuracy: 0.8222 - loss: 0.3983 - val_accuracy: 0.8260 - val_loss: 0.3842
Epoch 5/1000
[1m7166/7166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 6ms/step - accuracy: 0.8243 - loss: 0.3928 - val_accuracy: 0.8264 - val_loss: 0.3829
Epoch 6/1000
[1m7166/7166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 6ms/step - accuracy: 0.8311 - loss: 0.3812 - val_accuracy: 0.8280 - val_loss: 0.382

In [67]:
model4.evaluate(test_seq, y_test)

[1m1509/1509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8313 - loss: 0.3734


[0.37741202116012573, 0.8287401795387268]

In [68]:
model5 = keras.Sequential()
model5.add(keras.layers.Embedding(8424, 128, input_shape = (13,)))
model5.add(keras.layers.Dropout(0.4))

model5.add(keras.layers.GRU(256, dropout = 0.4))

model5.add(keras.layers.Dense(10, activation = "relu"))
model5.add(keras.layers.BatchNormalization())
model5.add(keras.layers.Dense(1, activation = "sigmoid"))

adam = keras.optimizers.Adam(learning_rate =0.000005)
model5.compile(loss = "binary_crossentropy", optimizer = adam, metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history = model5.fit(train_seq, y_train, batch_size = 32, epochs = 1000, validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 8ms/step - accuracy: 0.5280 - loss: 0.7040 - val_accuracy: 0.6075 - val_loss: 0.6598
Epoch 2/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - accuracy: 0.6039 - loss: 0.6580 - val_accuracy: 0.6879 - val_loss: 0.5971
Epoch 3/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - accuracy: 0.6470 - loss: 0.6199 - val_accuracy: 0.7244 - val_loss: 0.5493
Epoch 4/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - accuracy: 0.6821 - loss: 0.5865 - val_accuracy: 0.7511 - val_loss: 0.5145
Epoch 5/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - accuracy: 0.7081 - loss: 0.5560 - val_accuracy: 0.7685 - val_loss: 0.4888
Epoch 6/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - accuracy: 0.7267 - loss: 0.5327 - val_accuracy: 0.7804 - val_loss: 0.469

Epoch 50/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - accuracy: 0.8348 - loss: 0.3724 - val_accuracy: 0.8259 - val_loss: 0.3824
Epoch 51/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - accuracy: 0.8333 - loss: 0.3738 - val_accuracy: 0.8264 - val_loss: 0.3834
Epoch 52/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - accuracy: 0.8356 - loss: 0.3707 - val_accuracy: 0.8259 - val_loss: 0.3822
Epoch 53/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - accuracy: 0.8349 - loss: 0.3737 - val_accuracy: 0.8268 - val_loss: 0.3825
Epoch 54/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - accuracy: 0.8350 - loss: 0.3705 - val_accuracy: 0.8264 - val_loss: 0.3820
Epoch 55/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 8ms/step - accuracy: 0.8355 - loss: 0.3700 - val_accuracy: 0.8268 - val_loss:

In [69]:
model5.evaluate(test_seq, y_test)

[1m1509/1509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8301 - loss: 0.3756


[0.37970301508903503, 0.827704131603241]

In [70]:
model6 = keras.Sequential()
model6.add(keras.layers.Embedding(4212, 128, input_shape = (13,)))
model6.add(keras.layers.Dropout(0.4))

model6.add(keras.layers.GRU(128, dropout = 0.4))

model6.add(keras.layers.Dense(10, activation = "relu"))
model6.add(keras.layers.BatchNormalization())
model6.add(keras.layers.Dense(1, activation = "sigmoid"))

adam = keras.optimizers.Adam(learning_rate =0.00001)
model6.compile(loss = "binary_crossentropy", optimizer = adam, metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history = model6.fit(train_seq, y_train, batch_size = 32, epochs = 1000, validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.5089 - loss: 0.7168 - val_accuracy: 0.6144 - val_loss: 0.6561
Epoch 2/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.6077 - loss: 0.6507 - val_accuracy: 0.7080 - val_loss: 0.5693
Epoch 3/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 5ms/step - accuracy: 0.6752 - loss: 0.5910 - val_accuracy: 0.7567 - val_loss: 0.5109
Epoch 4/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.7178 - loss: 0.5463 - val_accuracy: 0.7789 - val_loss: 0.4736
Epoch 5/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.7439 - loss: 0.5113 - val_accuracy: 0.7935 - val_loss: 0.4494
Epoch 6/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.7610 - loss: 0.4863 - val_accuracy: 0.8004 - val_loss: 0.432

Epoch 50/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.8356 - loss: 0.3666 - val_accuracy: 0.8229 - val_loss: 0.3864
Epoch 51/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.8339 - loss: 0.3693 - val_accuracy: 0.8223 - val_loss: 0.3867
Epoch 52/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.8351 - loss: 0.3678 - val_accuracy: 0.8231 - val_loss: 0.3865
Epoch 53/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.8362 - loss: 0.3657 - val_accuracy: 0.8231 - val_loss: 0.3869
Epoch 54/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.8348 - loss: 0.3670 - val_accuracy: 0.8235 - val_loss: 0.3867
Epoch 55/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.8355 - loss: 0.3672 - val_accuracy: 0.8238 - val_loss:

In [71]:
model6.evaluate(test_seq, y_test)

[1m1509/1509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8251 - loss: 0.3787


[0.3823125660419464, 0.8238707184791565]

In [72]:
model7 = keras.Sequential()
model7.add(keras.layers.Embedding(4212, 64, input_shape = (13,)))
model7.add(keras.layers.Dropout(0.3))

model7.add(keras.layers.GRU(64, dropout = 0.3))

model7.add(keras.layers.Dense(10, activation = "relu"))
model7.add(keras.layers.BatchNormalization())
model7.add(keras.layers.Dense(1, activation = "sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate = 0.0001)
model7.compile(optimizer = rmsprop, loss = "binary_crossentropy", metrics = ["accuracy"])

early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history7 = model7.fit(train_seq, y_train, batch_size = 32, epochs = 1000,
                    validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.6509 - loss: 0.6038 - val_accuracy: 0.7919 - val_loss: 0.4399
Epoch 2/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.7839 - loss: 0.4530 - val_accuracy: 0.8033 - val_loss: 0.4220
Epoch 3/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.7979 - loss: 0.4325 - val_accuracy: 0.8070 - val_loss: 0.4156
Epoch 4/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8077 - loss: 0.4232 - val_accuracy: 0.8059 - val_loss: 0.4168
Epoch 5/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8071 - loss: 0.4198 - val_accuracy: 0.8083 - val_loss: 0.4120
Epoch 6/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8114 - loss: 0.4130 - val_accuracy: 0.8122 - val_loss: 0.407

Epoch 50/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8417 - loss: 0.3580 - val_accuracy: 0.8280 - val_loss: 0.3778
Epoch 51/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8434 - loss: 0.3566 - val_accuracy: 0.8271 - val_loss: 0.3785
Epoch 52/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8424 - loss: 0.3592 - val_accuracy: 0.8279 - val_loss: 0.3756
Epoch 53/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8434 - loss: 0.3577 - val_accuracy: 0.8280 - val_loss: 0.3748
Epoch 54/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8436 - loss: 0.3577 - val_accuracy: 0.8286 - val_loss: 0.3745
Epoch 55/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.8439 - loss: 0.3574 - val_accuracy: 0.8277 - val_loss:

In [73]:
model7.evaluate(test_seq, y_test)

[1m1509/1509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 829us/step - accuracy: 0.8335 - loss: 0.3695


[0.37350139021873474, 0.8318275809288025]

In [56]:
train_seq = pad_sequences(x_train, maxlen = 33)
val_seq = pad_sequences(x_val, maxlen = 33)
test_seq = pad_sequences(x_test, maxlen = 33)

model2 = keras.Sequential()
model2.add(keras.layers.Embedding(5000, 32, input_shape = (33,)))

model2.add(keras.layers.GRU(64))

model2.add(keras.layers.Dense(1, activation = "sigmoid"))

rmsprop = keras.optimizers.RMSprop(learning_rate = 0.001)
model2.compile(optimizer = rmsprop, loss = "binary_crossentropy", metrics = ["accuracy"])

early_stopping_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

history2 = model2.fit(train_seq, y_train, batch_size = 32, epochs = 1000,
                    validation_data = (val_seq, y_val), callbacks = [early_stopping_cb])

Epoch 1/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.7642 - loss: 0.4716 - val_accuracy: 0.8210 - val_loss: 0.3902
Epoch 2/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.8306 - loss: 0.3734 - val_accuracy: 0.8297 - val_loss: 0.3735
Epoch 3/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.8393 - loss: 0.3573 - val_accuracy: 0.8298 - val_loss: 0.3737
Epoch 4/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.8472 - loss: 0.3434 - val_accuracy: 0.8347 - val_loss: 0.3636
Epoch 5/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.8546 - loss: 0.3312 - val_accuracy: 0.8360 - val_loss: 0.3619
Epoch 6/1000
[1m3583/3583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.8572 - loss: 0.3269 - val_accuracy: 0.8380 - val_loss: 0.362

In [57]:
model2.evaluate(test_seq, y_test)

[1m1509/1509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8406 - loss: 0.3578


[0.3619956970214844, 0.8389349579811096]