In [2]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [5]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Google Drive와 연동
from google.colab import drive
drive.mount('/content/drive')

# 데이터 로드 및 병합
files = ['/content/drive/MyDrive/지인_categorized.csv',
         '/content/drive/MyDrive/택배_categorized.csv',
         '/content/drive/MyDrive/이벤트_categorized.csv',
         '/content/drive/MyDrive/채용_categorized.csv',
         '/content/drive/MyDrive/금융_categorized.csv',
         '/content/drive/MyDrive/기타_categorized.csv',
         '/content/drive/MyDrive/기관_categorized.csv']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
dfs = [pd.read_csv(file) for file in files]
df = pd.concat(dfs, ignore_index=True)

# 전처리: 불용어 리스트와 함께 처리
from konlpy.tag import Okt
okt = Okt()
stopwords = ['은', '는', '이', '가', '을', '를', '에', '의', '도', '으로', '하다']  # 예시 불용어 리스트

def preprocess_text(text):
    tokens = okt.morphs(text, stem=True)  # 형태소 분석
    tokens = [word for word in tokens if word not in stopwords]  # 불용어 제거
    return ' '.join(tokens)

df['processed_text'] = df['v2'].apply(preprocess_text)

# 라벨 데이터 준비
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['category'])  # 카테고리 값을 숫자로 변환
y = df['label']

# 텍스트 데이터를 시퀀스로 변환
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['processed_text'])
X = tokenizer.texts_to_sequences(df['processed_text'])

# 시퀀스 패딩
X = pad_sequences(X, maxlen=max_len)

# 학습/검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LSTM 모델 정의
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # 이진 분류를 위한 시그모이드 활성화 함수

# 모델 컴파일
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# 모델 학습
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2, callbacks=[es])

# 예측 및 평가
loss, accuracy = model.evaluate(X_test, y_test)
print(f"테스트 정확도: {accuracy:.4f}")



Epoch 1/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 430ms/step - accuracy: 0.0241 - loss: -68.7473 - val_accuracy: 0.0251 - val_loss: -192.4395
Epoch 2/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 422ms/step - accuracy: 0.0427 - loss: -234.1599 - val_accuracy: 0.0819 - val_loss: -343.4933
Epoch 3/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 423ms/step - accuracy: 0.0838 - loss: -381.9167 - val_accuracy: 0.0937 - val_loss: -497.1013
Epoch 4/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 423ms/step - accuracy: 0.0907 - loss: -535.4150 - val_accuracy: 0.0937 - val_loss: -650.9789
Epoch 5/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 434ms/step - accuracy: 0.0912 - loss: -686.7002 - val_accuracy: 0.0962 - val_loss: -791.7276
Epoch 6/10
[1m455/455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 435ms/step - accuracy: 0.0923 - loss: -829.9587 - val_accuracy

In [12]:
# 실제 텍스트 입력 테스트
def predict_text_lstm(text, threshold=0.5):
    # 입력 텍스트 전처리
    processed = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([processed])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    # 예측 확률 계산
    prob = model.predict(padded_sequence)[0][0]

    # 결과 출력
    if prob >= threshold:
        print(f"스미싱 의심: {prob * 100:.2f}% 확률로 스미싱")
    else:
        print(f"스미싱 아님: {prob * 100:.2f}% 확률로 스미싱 아님")

# 사용자로부터 텍스트 입력받기
user_input = input("확인할 메시지를 입력하세요: ")
predict_text_lstm(user_input)

확인할 메시지를 입력하세요: 나 아직 밥 안먹었는데 이 수업 끝나고 밥 먹자
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
스미싱 의심: 100.00% 확률로 스미싱
