In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Flatten

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Load the SMS spam corpus. CSVs usually decode as utf-8 or cp949, but this
# particular file has to be read as latin1.
spam_data = pd.read_csv('/content/drive/MyDrive/NLP/data/spam.csv', encoding='latin1')
# Drop every column that contains a missing value (per the original note this
# removes columns 3, 4 and 5), then name the two remaining columns.
spam_data = spam_data.dropna(axis=1)
spam_data.columns = ["label", "mail"]
# Encode the labels as integers: ham -> 0, spam -> 1.
# An explicit mapping avoids relying on pandas' implicit object -> int
# downcasting in Series.replace; any unexpected label becomes NaN and is
# removed by the dropna below.
spam_data['label'] = spam_data['label'].map({'ham': 0, 'spam': 1})
# Replace every non-word character with a space.
# regex=True must be explicit: recent pandas versions treat the pattern as a
# literal string by default, which would leave the punctuation untouched.
spam_data['mail'] = spam_data['mail'].str.replace(r"[^\w]", " ", regex=True)
# Treat empty mails as missing values.
spam_data['mail'] = spam_data['mail'].replace('', np.nan)
# Remove every row with a missing value and make sure the label column is an
# integer column (it is float if the mapping above produced any NaN).
spam_data = spam_data.dropna(how='any').astype({'label': 'int64'})

print(spam_data[:5])
print("# preprocessing done")

# Train/test split: the last 20% of the rows are held out, in file order
# (shuffle=False).
mail_train, mail_test, y_train, y_test = train_test_split(
    spam_data['mail'],
    spam_data['label'],
    test_size=0.2,
    shuffle=False,
)

print('# split done')

stopwords = ['a', 'an']


def remove_stopwords(mails, stopwords):
    """Split each mail on whitespace and drop the stopwords.

    Args:
        mails: iterable of strings, one per mail.
        stopwords: container of words to discard. The comparison is an exact,
            case-sensitive match, so 'A' is kept when only 'a' is listed.

    Returns:
        A list with one list of tokens per mail, in the input order.
    """
    return [
        [word for word in mail.split() if word not in stopwords]
        for mail in mails
    ]


# Tokenize both splits with the same routine so they cannot drift apart.
X_train = remove_stopwords(mail_train, stopwords)
X_test = remove_stopwords(mail_test, stopwords)

print('# tokenization done')

   label                                               mail
0      0  Go until jurong point  crazy   Available only ...
1      0                      Ok lar    Joking wif u oni   
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor    U c already then say   
4      0  Nah I don t think he goes to usf  he lives aro...
# preprocessing done
# split done
# tokenization done


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the training tokens only.
# num_words is set to 7792 (not 4000); according to the Keras documentation
# the tokenizer then keeps the num_words - 1 most frequent words when encoding.
tokenizer = Tokenizer(num_words=7792)
tokenizer.fit_on_texts(X_train)

# Encode both splits with the index fitted above.
X_train, X_test = [
    tokenizer.texts_to_sequences(token_lists)
    for token_lists in (X_train, X_test)
]

print('# int_encoding done')

# int_encoding done


In [4]:
# Number of distinct words the tokenizer indexed.
print(len(tokenizer.word_index))

# Number of words that occur exactly once in the data the tokenizer was fitted on.
low_count = sum(1 for word_count in tokenizer.word_counts.values() if word_count == 1)
print(low_count)

7792
4030


In [5]:
# Length of the longest encoded training sequence (0 when there are none).
max_length = max((len(sequence) for sequence in X_train), default=0)
print(max_length)

189


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every sequence to a fixed length of max_len with pad_sequences
# (default padding/truncating settings).
max_len = 50
X_train, X_test = [
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train, X_test)
]

In [13]:
# 1D CNN applied to the spam data:
# embedding -> convolution -> global max pooling -> sigmoid output.
# The Flatten layer that used to follow GlobalMaxPooling1D was removed: the
# pooling layer already returns a 2-D (batch, filters) tensor, so flattening it
# was a no-op.
model = Sequential([
    # 7792 matches the num_words given to the tokenizer; 32-dim word vectors.
    Embedding(7792, 32),
    # 256 filters, each looking at windows of 3 consecutive tokens.
    Conv1D(256, 3, padding='valid', activation='relu'),
    # Keep the strongest response of every filter over the whole sequence.
    GlobalMaxPooling1D(),
    # Single sigmoid unit for the binary label (1 = spam, 0 = ham).
    Dense(1, activation='sigmoid'),
])

In [14]:
# Binary classification: binary cross-entropy loss, RMSprop optimizer,
# accuracy reported as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
# Train for 3 epochs, evaluating on the held-out split after each one.
model.fit(
    X_train,
    y_train,
    epochs=3,
    validation_data=(X_test, y_test),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f1fe6ae36d8>

In [15]:
import re

sentence = input()
# Apply the same preprocessing as the training data: replace non-word
# characters with spaces, split on whitespace, and drop the stopwords.
# Without this, tokens such as "sale!" never match the fitted vocabulary.
cleaned_stc = re.sub(r"[^\w]", " ", sentence)
token_stc = [word for word in cleaned_stc.split() if word not in stopwords]
# Integer-encode with the tokenizer fitted on the training set.
encode_stc = tokenizer.texts_to_sequences([token_stc])
# Pad to the length used for the training data instead of repeating the
# literal 50.
pad_stc = pad_sequences(encode_stc, maxlen=max_len)

# Sigmoid output of the model for the sentence (the training labels use
# 1 = spam, 0 = ham).
score = model.predict(pad_stc)
print(score)

Ok this is sale
[[0.00022846]]
