In [1]:
import pandas as pd
import numpy as np
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [2]:
# Read the SMS file (columns named 'class' and 'text') and put the message
# text first, the label second.
df = pd.read_table('sms', names=['class', 'text'])[['text', 'class']]

df.head()

Unnamed: 0,text,class
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [3]:
# Load the stop-word list, one entry per line. The context manager closes the
# file handle as soon as the contents are read; a bare open() left it open for
# the lifetime of the kernel.
with open("stopwords.txt", "r") as f:
    stopwords = f.read().split("\n")

In [4]:
def stopword_removal(df, stopwords):
    """Remove stop words from every message in ``df['text']``.

    Each message is split on whitespace and every token found in
    ``stopwords`` is dropped; the remaining tokens are re-joined with single
    spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. The column is overwritten
        in place.
    stopwords : iterable of str
        Words to remove (exact, case-sensitive match).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Hash-based membership test instead of scanning a list for every token.
    stopword_set = set(stopwords)

    def _drop_stopwords(sentence):
        return ' '.join(word for word in sentence.split() if word not in stopword_set)

    # Assign the whole column at once. The chained form
    # `df.loc[i]['text'] = ...` writes to a temporary row object and is not
    # guaranteed to update `df`; iterating the column also avoids treating a
    # positional counter as an index label.
    df['text'] = [_drop_stopwords(sentence) for sentence in df['text']]
    return df

In [5]:
def tokenize(df):
    """Normalize every message in ``df['text']`` to lowercase letter-only words.

    Each message is lowercased, every character that is not an ASCII letter
    is replaced by a space, and runs of whitespace are collapsed to a single
    space (leading and trailing whitespace is removed).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. The column is overwritten
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Compile once instead of once per row.
    non_letters = re.compile(r'[^a-zA-Z]')

    def _normalize(sentence):
        lowercase = sentence.lower()  # case folding
        letters_only = non_letters.sub(' ', lowercase)  # drop digits/punctuation
        return ' '.join(letters_only.split())  # collapse whitespace

    # Whole-column assignment: the chained `df.loc[i]['text'] = ...` form
    # writes to a temporary row and is not guaranteed to update `df`, and it
    # also assumed the index labels are 0..n-1.
    df['text'] = [_normalize(sentence) for sentence in df['text']]
    return df

In [6]:
def stem(df):
    """Replace every word in ``df['text']`` with its English Snowball stem.

    Each message is split on whitespace, every token is passed through
    ``SnowballStemmer('english')``, and the stems are re-joined with single
    spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. The column is overwritten
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    stemmer = SnowballStemmer('english')

    def _stem_sentence(sentence):
        return ' '.join(stemmer.stem(word) for word in sentence.split())

    # Whole-column assignment: the chained `df.loc[i]['text'] = ...` form
    # writes to a temporary row and is not guaranteed to update `df`, and it
    # also assumed the index labels are 0..n-1.
    df['text'] = [_stem_sentence(sentence) for sentence in df['text']]
    return df

In [7]:
# Run the preprocessing pipeline on the loaded messages: normalize the text,
# drop stop words, then stem. Each helper returns the frame it was given, so
# `tokenized`, `filtered` and `stemed` all refer to the same object as `df`.
tokenized = tokenize(df)
filtered = stopword_removal(tokenized, stopwords)
stemed = stem(filtered)

In [8]:
def get_unique_words(df):
    """Collect the vocabulary of ``df['text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of whitespace-separated words.

    Returns
    -------
    set of str
        Every distinct word that occurs in at least one message.
    """
    unique_words = set()
    # Iterate the column directly: looking rows up with `df.loc[i]` for
    # i in range(n) treats a position as an index label and fails (or reads
    # the wrong rows) whenever the index is not exactly 0..n-1.
    for sentence in df['text']:
        unique_words.update(sentence.split())
    return unique_words

In [9]:
def get_tf_matrix(df, unique_words):
    """Count vocabulary-word occurrences for every message in ``df['text']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of whitespace-separated words.
    unique_words : iterable of str
        The vocabulary. Its iteration order fixes the key order of every
        returned dict.

    Returns
    -------
    list of dict
        One ``{word: count}`` dict per message, in row order. Every dict has
        exactly the vocabulary words as keys; words of a message that are not
        in the vocabulary are ignored.
    """
    # Template with a zero count for every vocabulary term.
    zero_counts = dict.fromkeys(unique_words, 0)

    tf_matrix = []
    # Iterate the column directly rather than `df.loc[i]` with a positional
    # counter, which only works when the index labels are 0..n-1.
    for sentence in df['text']:
        freq = zero_counts.copy()
        for word in sentence.split():
            # Skip out-of-vocabulary words: a message to classify may contain
            # words never seen in training, and `freq[word] += 1` would raise
            # KeyError for them.
            if word in freq:
                freq[word] += 1
        tf_matrix.append(freq)
    return tf_matrix

In [10]:
def tf_matrix_to_array(tf_matrix):
    """Stack a list of ``{word: count}`` dicts into a 2-D NumPy array.

    Row ``i`` holds the values of ``tf_matrix[i]`` in that dict's own key
    order, so all dicts are expected to share the same keys in the same
    order.
    """
    feature_rows = [np.array(list(counts.values())) for counts in tf_matrix]
    return np.vstack(feature_rows)

In [11]:
# Build the vocabulary from the messages in `df`, then count how often each
# vocabulary word occurs in every message (one dict per message).
unique_words = get_unique_words(df)
tf_matrix = get_tf_matrix(df, unique_words)

In [12]:
# Feature matrix: one row per message, one column per vocabulary word.
X = tf_matrix_to_array(tf_matrix)

In [13]:
# Split into training and test sets (80% / 20%). A fixed random_state makes the
# shuffle, and therefore the accuracy reported below, reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, df['class'], test_size=0.2, random_state=42)

In [14]:
# create and train mnb classifier
classifier = MultinomialNB() # create the classifier
classifier.fit(X_train, y_train)  # train the classifier on the training data

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Classify a single raw message end-to-end. The intermediate results get their
# own `sample_*` names so they do not overwrite the training-stage variables
# (`tokenized`, `filtered`, `stemed`, `tf_matrix`) that earlier cells produced;
# otherwise re-running an earlier cell would silently pick up the sample data.
sample = pd.DataFrame({'text': [
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
]})
sample_tokenized = tokenize(sample)
sample_filtered = stopword_removal(sample_tokenized, stopwords)
sample_stemed = stem(sample_filtered)
# Count against the training vocabulary so the columns line up with X_train.
sample_tf_matrix = get_tf_matrix(sample_stemed, unique_words)
tf_array = tf_matrix_to_array(sample_tf_matrix)
classifier.predict(tf_array) # convert the term frequencies to an array and predict

array(['spam'], 
      dtype='<U4')

In [16]:
# Accuracy on the held-out test set: correct predictions / number of test rows.
n, feature_size = X_test.shape
correct = np.sum(classifier.predict(X_test) == y_test)
accuracy = correct / n
print("Naive bayes accuracy : %f%%" % (accuracy * 100))

Naive bayes accuracy : 97.130045%


In [17]:
# Number of test messages predicted as each class (predict once, count twice).
test_predictions = classifier.predict(X_test)
print(np.sum(test_predictions == 'spam'))
print(np.sum(test_predictions == 'ham'))

164
951
