## **0. Download dataset from drive**

In [29]:
!gdown --id 1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: d:\Onedrive2024\OneDrive\1.0 DS & AI\AIO2024\AIO-Exercise\Module_02\Project_Text_Classification\2cls_spam_text_cls.csv

  0%|          | 0.00/486k [00:00<?, ?B/s]
100%|██████████| 486k/486k [00:00<00:00, 2.88MB/s]
100%|██████████| 486k/486k [00:00<00:00, 2.86MB/s]


## **1. Import libraries**

In [30]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tienhyu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tienhyu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tienhyu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## **2. Read dataset**

In [31]:
DATASET_PATH = '2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)

In [32]:
# read
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [33]:
messages = df['Message'].values.tolist()
labels = df['Category'].values.tolist()

## **3. Data Processing**

### **3.1. Label data processing**

In [34]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


### **3.2. Mail content preprocessing**

In [35]:
# Convert all text to lowercase
def lowercase(text):
    return text.lower()

In [36]:
# Test
INPUT_TEXT = "The cat's toys are scattered everywhere."
lowercase(INPUT_TEXT)

"the cat's toys are scattered everywhere."

In [37]:
# Eliminate all punctuation marks
def punctuation_removal(text):
    translator = str.maketrans('', '', string.punctuation)

    return text.translate(translator)

In [38]:
# Test
INPUT_TEXT = "The cat's toys are scattered everywhere."
INPUT_TEXT = lowercase(INPUT_TEXT)
punctuation_removal(INPUT_TEXT)

'the cats toys are scattered everywhere'

In [39]:
# Split the text into individual words (tokens)
def tokenize(text):
    return nltk.word_tokenize(text)

In [40]:
# Test
INPUT_TEXT = "The cat's toys are scattered everywhere."
INPUT_TEXT = lowercase(INPUT_TEXT)
INPUT_TEXT = punctuation_removal(INPUT_TEXT)
tokenize(INPUT_TEXT)

['the', 'cats', 'toys', 'are', 'scattered', 'everywhere']

In [41]:
# Filter out common words that don't carry significant meaning
def remove_stopwords(tokens):
    stop_words = nltk.corpus.stopwords.words('english')

    return [token for token in tokens if token not in stop_words]

In [42]:
# Test
INPUT_TEXT = "The cat's toys are scattered everywhere."
INPUT_TEXT = lowercase(INPUT_TEXT)
INPUT_TEXT = punctuation_removal(INPUT_TEXT)
INPUT_TEXT = tokenize(INPUT_TEXT)
remove_stopwords(INPUT_TEXT)

['cats', 'toys', 'scattered', 'everywhere']

In [43]:
# Reduces words to their root form, grouping similar words together

def stemming(tokens):
    stemmer = nltk.PorterStemmer()

    return [stemmer.stem(token) for token in tokens]

In [44]:
# Test
INPUT_TEXT = "The cat's toys are scattered everywhere."
INPUT_TEXT = lowercase(INPUT_TEXT)
INPUT_TEXT = punctuation_removal(INPUT_TEXT)
INPUT_TEXT = tokenize(INPUT_TEXT)
INPUT_TEXT = remove_stopwords(INPUT_TEXT)
stemming(INPUT_TEXT)

['cat', 'toy', 'scatter', 'everywher']

In [45]:
# Mail content data preprocessing
def preprocess_text(text):
    text = lowercase(text)
    text = punctuation_removal(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = stemming(tokens)

    return tokens

In [46]:
# Test
INPUT_TEXT = "The cat's toys are scattered everywhere."
preprocess_text(INPUT_TEXT)

['cat', 'toy', 'scatter', 'everywher']

In [47]:
messages = [preprocess_text(message) for message in messages]

In [48]:
messages

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni'],
 ['free',
  'entri',
  '2',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  '08452810075over18'],
 ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'],
 ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  '£150',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  '9',
  'copi',
  '

In [49]:
# Build vocabulary
def create_dictionary(messages):
    dictionary = []

    for tokens in messages:
        for token in tokens:
            if token not in dictionary:
                dictionary.append(token)

    return dictionary

In [50]:
# Create feature
def create_features(tokens, dictionary):
    features = np.zeros(len(dictionary))

    for token in tokens:
        if token in dictionary:
            features[dictionary.index(token)] += 1

    return features

In [51]:
dictionary = create_dictionary(messages)
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [52]:
# Test
dictionary

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat',
 'ok',
 'lar',
 'joke',
 'wif',
 'u',
 'oni',
 'free',
 'entri',
 '2',
 'wkli',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkt',
 '21st',
 'may',
 '2005',
 'text',
 '87121',
 'receiv',
 'questionstd',
 'txt',
 'ratetc',
 'appli',
 '08452810075over18',
 'dun',
 'say',
 'earli',
 'hor',
 'c',
 'alreadi',
 'nah',
 'dont',
 'think',
 'goe',
 'usf',
 'live',
 'around',
 'though',
 'freemsg',
 'hey',
 'darl',
 '3',
 'week',
 'word',
 'back',
 'id',
 'like',
 'fun',
 'still',
 'tb',
 'xxx',
 'std',
 'chg',
 'send',
 '£150',
 'rcv',
 'even',
 'brother',
 'speak',
 'treat',
 'aid',
 'patent',
 'per',
 'request',
 'mell',
 'oru',
 'minnaminungint',
 'nurungu',
 'vettam',
 'set',
 'callertun',
 'caller',
 'press',
 '9',
 'copi',
 'friend',
 'winner',
 'valu',
 'network',
 'custom',
 'select',
 'receivea',
 '£900',
 'prize',
 'reward',
 'claim',
 'call',
 '0

In [53]:
# Test
X

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### **3.3. Split data into train/ val/ test**

In [54]:
VAL_SIZE = 0.2
TEST_SIZE = 0.125
SEED = 0

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = VAL_SIZE,
                                                        shuffle = True,
                                                        random_state = SEED)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,   test_size = TEST_SIZE,
                                                                        shuffle = True,
                                                                        random_state = SEED)

## **4. Training model**

In [55]:
%%time
model = GaussianNB()
print('Start training...')
model = model.fit(X_train, y_train)
print('Training is completed!')

Start training...
Training is completed!
CPU times: total: 234 ms
Wall time: 347 ms


## **5. Model evalutation**

In [56]:
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Val accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


## **6. Prediction**

In [57]:
def predict(text, model, dictionary):
    processed_text = preprocess_text(text)
    features = create_features(text, dictionary)
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = le.inverse_transform(prediction)[0]

    return prediction_cls

In [58]:
test_input = 'I am actually thinking a way of doing something useful'
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham
