In [39]:
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
#convert into numeric data
from sklearn.preprocessing import LabelEncoder# to convert into numbers
from sklearn.feature_extraction.text import CountVectorizer# using sklearn feature extraction method and importing count vectorizer cv which count the frequecy of word occured in the senctence
from sklearn.model_selection import train_test_split

In [40]:
# Load the spam dataset. An explicit single-byte encoding is passed, presumably because the
# default UTF-8 decoding fails on this file.
# NOTE(review): the stemmed sample output further down contains tokens such as 'ã â', which
# looks like mis-decoded text -- confirm that ISO-8859-1 is the file's real encoding.
df = pd.read_csv('datasets/bayes/spam.csv', encoding='ISO-8859-1')
# NOTE(review): `le` is not used in any of the visible cells -- confirm whether it is still needed.
le = LabelEncoder()

In [41]:
# Convert the DataFrame to a NumPy array (same as df.values) so columns can be sliced by position.
data = df.to_numpy()

In [42]:
X = data[:, 1]  # all rows, column 1: the raw message text that is cleaned by getDoc later on
y = data[:, 0]  # all rows, column 0: the target labels the model is fitted against

In [43]:
X.shape, y.shape

((5572,), (5572,))

In [44]:
# Text-normalisation helpers shared by getStem.
# Raw string: in a plain literal '\w' is an invalid escape sequence, which Python warns about.
tokenizer = RegexpTokenizer(r'\w+')  # extracts runs of word characters
# Kept as a set so the per-token membership test in getStem is cheap.
# Requires the NLTK 'stopwords' corpus to be available locally (nltk.download('stopwords')).
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [45]:
# Normalise one message: lower-case, tokenize, drop stopwords, then stem what is left.
def getStem(review):
    """Return a cleaned, stemmed version of a single text.

    The text is lower-cased and split into tokens with the module-level
    ``tokenizer``; tokens contained in the stopword set ``sw`` are discarded
    and the remaining ones are stemmed with ``ps``.

    Parameters
    ----------
    review : str
        Raw text of one message.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (an empty string when no
        token survives the filtering).
    """
    words = tokenizer.tokenize(review.lower())
    # Stopwords are filtered before stemming, so the check uses the unstemmed token.
    stems = [ps.stem(word) for word in words if word not in sw]
    return ' '.join(stems)

In [46]:
# Clean a whole collection of texts.
def getDoc(document):
    """Apply ``getStem`` to every text in ``document``.

    Parameters
    ----------
    document : iterable of str
        The raw texts to clean.

    Returns
    -------
    list of str
        One cleaned text per input text, in the same order.
    """
    return [getStem(text) for text in document]


In [47]:
# Clean every raw message. X must still hold the raw text here: a later cell rebinds X to the
# count matrix, so re-run the cell that defines X before running this one again.
stemmed_doc = getDoc(X)

In [48]:
stemmed_doc[:10]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send ã â 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea ã â 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030']

In [49]:
# Bag-of-words vectorizer: counts how often each vocabulary word occurs in a document.
cv = CountVectorizer()

In [50]:
# Learn the vocabulary from the cleaned corpus and build the document-term count matrix.
vc = cv.fit_transform(stemmed_doc)

In [51]:
# Dense array of token counts, one row per document.
# toarray() returns a plain ndarray; todense() returns np.matrix, which recent scikit-learn
# versions reject with a TypeError when it is passed to an estimator.
# NOTE: this rebinds X from the raw text to the count matrix.
X = vc.toarray()

In [52]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split reproducible.
# (A stray "..." REPL continuation prompt inside the call was removed: it is only valid where
# IPython strips pasted prompts and is a syntax error in plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [53]:
# Naive Bayes classifier from scikit-learn

In [54]:
from sklearn.naive_bayes import MultinomialNB


In [55]:
# Fit a multinomial Naive Bayes classifier on the training split (fit returns the estimator).
model = MultinomialNB().fit(X_train, y_train)
# Score on the held-out split; as the last expression of the cell its value is displayed.
model.score(X_test, y_test)

0.977705274605764

In [66]:
# Sample texts used as unseen input for the trained classifier.
# NOTE(review): the texts contain a personal name and what looks like a phone number --
# consider redacting them before sharing the notebook.
messages = [
    """
    Hi Tilakraj,
We invite you to participate in MishMash - India’s largest online diversity hackathon. 
The hackathon is a Skillenza initiative and sponsored by Microsoft, Unity, Unilever, Gojek, Rocketium and Jharkhand Government. 
We have a special theme for you - Deep Tech/Machine Learning - sponsored by Unilever, which will be perfect for you.
    """,
    """Join us today at 12:00 PM ET / 16:00 UTC for a Red Hat DevNation tech talk on AWS Lambda and serverless Java with Bill Burke.
Have you ever tried Java on AWS Lambda but found that the cold-start latency and memory usage were far too high? 
In this session, we will show how we optimized Java for serverless applications by leveraging GraalVM with Quarkus to 
provide both supersonic startup speed and a subatomic memory footprint.""",

    """We really appreciate your interest and wanted to let you know that we have received your application.
There is strong competition for jobs at Intel, and we receive many applications. As a result, it may take some time to get back to you.
Whether or not this position ends up being a fit, we will keep your information per data retention policies, 
so we can contact you for other positions that align to your experience and skill set.
""",
    """Hi Tilakraj Singh Rao

We found your profile matching job based on your profile with some top MNC clients, tried calling your mobile 9636003356 too.

As a next step, Please give a short 5 mins Online Aptitude Test ->  https://www.youth4work.com/onlinetalenttest/Test-Aptitude and revert if you could score +90%
This test has been given by over 2Million people across the world and gives a relative score of yours vs all others. Will schedule a telephonic interview call for top rankers faster

Look forward to helping you get jobs according to your talents & skills""",
    """Free Online Courses – To boost
your skills

Hello Tilakraj Rao,

Launching Analyttica TreasureHunt Leaps, a platform that will provide free courses and webinar access.

You can also participate in hackathons and win lots of rewards and prizes."""
]

In [67]:
# The messages above must be cleaned and vectorized the same way as the training data before prediction.

In [68]:
def prepare(messages):
    """Clean and vectorize new texts with the already-fitted vectorizer.

    Parameters
    ----------
    messages : iterable of str
        Raw texts to classify.

    Returns
    -------
    The result of ``cv.transform`` on the cleaned texts, i.e. their counts
    over the vocabulary that ``cv`` was fitted with.
    """
    # transform(), not fit_transform(): refitting would build a new vocabulary
    # that no longer matches the one the model was trained on.
    return cv.transform(getDoc(messages))

# NOTE: this rebinds `messages` from the raw strings to their vectorized form, so the cell is
# not idempotent -- re-run the cell that defines `messages` before running this one again.
messages = prepare(messages)

In [69]:
# Classify the vectorized messages with the fitted Naive Bayes model.
y_pred = model.predict(messages)

In [70]:
y_pred

array(['ham', 'spam', 'ham', 'ham', 'spam'], dtype='<U4')