In [1]:
# @title 01.Tải Thư Viện
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# @title 02.Tải Dataset
# Download the dataset CSV from Google Drive by file ID. The ID is passed
# positionally because gdown deprecated the `--id` flag.
!gdown 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: /content/2cls_spam_text_cls.csv
100% 486k/486k [00:00<00:00, 14.3MB/s]


In [3]:
# @title 03.Đọc File
# Read the CSV into a DataFrame, then pull the message texts and their
# category labels out as plain Python lists.
DATASET_PATH = '/content/2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)

messages = df['Message'].tolist()
labels = df['Category'].tolist()

In [4]:
# @title 04.Chuẩn bị bộ dữ liệu

In [5]:
# @title 04.1.Xử lý dữ liệu đặc trưng

def lowercase(text):
  """Return `text` converted to lower case."""
  lowered = text.lower()
  return lowered

def punctuation_removal(text):
  """Return `text` with every character of string.punctuation deleted."""
  # Third argument of maketrans lists the characters to drop.
  return text.translate(str.maketrans('', '', string.punctuation))

def tokenize(text):
  """Split `text` on runs of whitespace and return the list of words."""
  words = text.split()
  return words

def remvove_stopwords(tokens):
  """Drop English stop words (articles, forms of "to be", ...) from `tokens`.

  Args:
    tokens: Iterable of word strings.

  Returns:
    A new list with the tokens that are not in NLTK's English stop-word list,
    in their original order.
  """
  # Load the stop-word list only once and keep it on the function object:
  # this function is called once per message, and the list never changes.
  stop_words = getattr(remvove_stopwords, '_stop_words', None)
  if stop_words is None:
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    remvove_stopwords._stop_words = stop_words
  return [token for token in tokens if token not in stop_words]

def stemming(tokens):
  """Return the Porter stem of every token, preserving order.

  Stemming collapses inflected forms (plurals, conjugated verbs, ...) so that
  related words share one vocabulary entry.
  """
  porter = nltk.PorterStemmer()
  return list(map(porter.stem, tokens))

def preprocess_text(text):
  """Turn a raw message string into a list of normalized tokens.

  Steps, in order: lower-casing, punctuation removal, whitespace
  tokenization, stop-word removal, stemming.
  """
  cleaned = punctuation_removal(lowercase(text))
  words = tokenize(cleaned)
  return stemming(remvove_stopwords(words))

In [6]:
# Tokenize every raw message. The input is read from `df` rather than from
# `messages` itself, so re-running this cell never feeds already-tokenized
# lists back into preprocess_text.
messages = [preprocess_text(message) for message in df['Message'].values.tolist()]

In [9]:
def create_dictionary(messages):
  """Build the vocabulary of a tokenized corpus.

  Args:
    messages: Iterable of token lists (one list per message).

  Returns:
    A list of the distinct tokens, ordered by first appearance.
  """
  # dict.fromkeys keeps insertion order and de-duplicates in one pass,
  # avoiding a linear scan of the vocabulary for every token.
  return list(dict.fromkeys(token for tokens in messages for token in tokens))

def create_features(tokens, dictionary):
  """Count how often each vocabulary entry occurs in `tokens`.

  Returns a 1-D float array with one slot per entry of `dictionary`; tokens
  that are not in the vocabulary are ignored.
  """
  counts = np.zeros(len(dictionary))
  for word in tokens:
    try:
      slot = dictionary.index(word)
    except ValueError:
      # Out-of-vocabulary token: nothing to count.
      continue
    counts[slot] += 1
  return counts

# Build the vocabulary from the tokenized corpus, then turn every message
# into one row of per-token counts.
dictionary = create_dictionary(messages)
X = np.array([
    create_features(message_tokens, dictionary)
    for message_tokens in messages
])

In [10]:
# @title 04.2. Xử lý dữ liệu nhãn

# Encode the string categories as integer class ids; `le` is kept so that
# predictions can be decoded back to labels later.
le = LabelEncoder()
le.fit(labels)
y = le.transform(labels)
print(f'Classes: {le.classes_}')

Classes: ['ham' 'spam']


In [11]:
# @title 04.3. Chia dữ liệu train/val/test

# Hold-out proportions. TEST_SIZE is applied to what remains after the
# validation split (80% of the data), so 0.125 of that is 10% of the full
# set: roughly a 70/20/10 train/val/test partition.
VAL_SIZE = 0.2
TEST_SIZE = 0.125
SEED = 0

# First carve the validation set out of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, shuffle=True, random_state=SEED
)

# Then carve the test set out of the remaining training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, shuffle=True, random_state=SEED
)


In [12]:
# @title 05. Huấn luyện mô hình

%%time
# Fit a Gaussian Naive Bayes classifier on the training split.
print('Start training...')
model = GaussianNB().fit(X_train, y_train)
print('Training completed!')

Start training...
Training completed!
CPU times: user 423 ms, sys: 190 ms, total: 614 ms
Wall time: 638 ms


In [13]:
# @title 06. Đánh giá mô hình

# Validation split: predict, then score.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Test split: predict, then score.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Val accuracy: 0.8816143497757848
Test accuracy: 0.8620071684587813


In [15]:
# @title 07. Thực hiện dự đoán

def predict(text, model, dictionary):
  """Classify a single raw message.

  Args:
    text: Raw message string.
    model: Fitted classifier exposing predict().
    dictionary: Vocabulary list used to build the feature vector.

  Returns:
    The predicted class label, decoded with the module-level LabelEncoder
    `le`.
  """
  processed_text = preprocess_text(text)
  # Vectorize the preprocessed tokens, not the raw string: iterating over a
  # string yields single characters, which would be counted as "words".
  features = create_features(processed_text, dictionary)
  # The model expects a 2-D array: one row per sample.
  features = np.asarray(features).reshape(1, -1)
  prediction = model.predict(features)
  prediction_cls = le.inverse_transform(prediction)[0]
  return prediction_cls

In [21]:
# Run the trained classifier on one hand-written example message.
test_input = 'I am ham spam'
prediction_cls = predict(text=test_input, model=model, dictionary=dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham
