In [19]:
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer as ps
from nltk.corpus import stopwords as sw
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:

# Load the dataset; the file is decoded as ISO-8859-1 (Latin-1) text.
df = pd.read_csv('/content/sample_data/spam.csv', encoding='ISO-8859-1')
# NOTE(review): `le` is not referenced in any of the visible cells — confirm whether it is still needed.
le = LabelEncoder()

In [15]:
# Convert the DataFrame to a NumPy array so its columns can be sliced by position.
data = df.to_numpy()

In [16]:
# Column 1 supplies the model inputs (X) and column 0 the targets (y).
X, y = data[:, 1], data[:, 0]

In [17]:

# Sanity check: display the shapes of the input and target arrays.
X.shape, y.shape

((5572,), (5572,))

In [20]:

# Text-cleaning helpers shared by getStem().
# Raw string: '\w' is not a valid escape sequence in an ordinary string literal.
tokenizer = RegexpTokenizer(r'\w+')
# The import cell binds the stopwords corpus and the PorterStemmer class to the
# aliases `sw` and `ps`, so the bare names `stopwords` / `PorterStemmer` do not
# exist on a fresh kernel. Fully qualified names are used instead; they also
# keep this cell re-runnable after `sw` and `ps` are rebound below.
sw = set(nltk.corpus.stopwords.words('english'))  # set for fast membership tests
ps = nltk.stem.PorterStemmer()

In [21]:
def getStem(review):
    """Return a cleaned version of ``review`` as a single space-joined string.

    The text is lower-cased, split into tokens by the module-level
    ``tokenizer``, filtered against the stop-word collection ``sw``, and each
    surviving token is stemmed with ``ps``.
    """
    words = tokenizer.tokenize(review.lower())
    # Drop stop words first, then stem what is left, preserving token order.
    stems = (ps.stem(word) for word in words if word not in sw)
    return ' '.join(stems)

In [22]:
def getDoc(document):
    """Apply getStem() to every item of ``document`` and return the results as a list."""
    return [getStem(text) for text in document]

In [23]:
# Clean and stem every entry of X.
stemmed_doc = getDoc(X)

In [24]:
# Preview the first ten cleaned entries.
stemmed_doc[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send å 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea å 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

In [25]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [26]:
# Learn the vocabulary from the stemmed corpus and encode each entry as a row
# of token counts (fit_transform performs both steps).
vc = cv.fit_transform(stemmed_doc)

In [27]:

# Densify the count matrix into a plain ndarray. `.toarray()` is used instead
# of `.todense()` because the latter returns a numpy.matrix, which recent
# scikit-learn releases reject during input validation.
# NOTE: this rebinds X from the values sliced out of `data` to the feature matrix.
X = vc.toarray()

In [28]:
# Hold out 33% of the samples for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [29]:

from sklearn.naive_bayes import MultinomialNB

In [30]:
# Fit a multinomial Naive Bayes classifier on the training split, then display
# its score on the held-out split.
model = MultinomialNB().fit(X_train, y_train)
model.score(X_test, y_test)

0.977705274605764

In [31]:
# Three sample messages written inline; they are vectorized and classified in
# the cells that follow. The triple-quoted literals keep their leading
# newlines and indentation as part of the text.
messages = [
    """
    Dear Gaana Plus user,
Your Gaana Plus Gaana Plus* subscription is set to expire on 15/05/2021. Please click on the below link to renew:
http://gaana.com/profile
We have added new pocket friendly plans. Check out the new plans here
    """,
    """
    Learning SEO basics is not nearly as technical as it may seem. SEO is less about filling your site with the right keywords (what people type into Google searches), and more about writing great content that your audience finds useful.
    """,

    """
    Hi there,


Are you already selling online but not seeing the performance you want?

If you answered a resounding YES to that, then it may be worth revisiting the most important step when selling online:
"""
]

In [32]:
def prepare(messages):
    """Clean ``messages`` with getDoc() and encode them with the fitted vectorizer ``cv``.

    Returns whatever ``cv.transform`` produces for the cleaned texts.
    """
    cleaned = getDoc(messages)
    # Use transform(), never fit_transform(): refitting would build a new
    # vocabulary that no longer matches the one the model was trained on.
    return cv.transform(cleaned)

# NOTE: rebinds `messages` from the raw strings to their vectorized form, so
# re-running this cell would pass the vectorized object back into prepare().
messages = prepare(messages)

In [33]:
# Classify the vectorized sample messages; the bare name displays the predicted labels.
y_pred = model.predict(messages)
y_pred

array(['spam', 'ham', 'ham'], dtype='<U4')