In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("Naive spam_emails__15.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
#import spacy and load the model

import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp('some text data')
for token in doc:
    print(token,token.pos_, token.lemma_)

some DET some
text NOUN text
data NOUN datum


In [5]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)

In [6]:
df.shape

(5572, 3)

In [7]:
from sklearn.model_selection import train_test_split

# here X refers to all the emails - the messages itself
# X_train = 80 % emails , X_test = 20 % emails and y_train = 80 % bool and y_test = 20 % bool value
# and y refers to spam ie 0 or 1  - we check if message is spam or not 
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size = 0.2 )

# train_test_split takes 2 param , input and output 
# input == all the emails and output is 0 or 1 
# so we train this model in such a way that if any new email comes 
# we can check is it 0 or 1 ie is it spam or not ! 

In [8]:
# after splitting 
X_train.shape
type(X_train)  # type = pandas.core.series.Series

pandas.core.series.Series

In [9]:
X_train
y_train[:4]
X_train.values

array(["I lost 4 pounds since my doc visit last week woot woot! Now I'm gonna celebrate by stuffing my face!",
       'Oh! Shit, I thought that was your trip! Loooooool ... That just makes SO much more sense now ... *grins* and the sofa reference was ... The "sleep on a couch" link you sent me ... Wasn\'t that how you went on your trip ? Oh ... And didn\'t your babe go with you for that celebration with your rents?',
       'Ur cash-balance is currently 500 pounds - to maximize ur cash-in now send CASH to 86688 only 150p/msg. CC: 08718720201 PO BOX 114/14 TCR/W1',
       ..., "Okay. I've seen it. So i should pick it on friday?",
       "Yep get with the program. You're slacking.",
       'Oh ho. Is this the first time u use these type of words'],
      dtype=object)

In [10]:
#  Building vocabulary from countVectorizer  - all unique words 
# Generated from the emails X_train
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

x_train_cv = v.fit_transform(X_train.values) # .values because we dont need the header
x_train_cv  # == vocabulary
x_train_cv.

<4457x7758 sparse matrix of type '<class 'numpy.int64'>'
	with 59044 stored elements in Compressed Sparse Row format>

In [12]:
#  converting matrix to array
# x_train_cv.toarray()

# x_train_cv.shape  # total vocabulary = 7801 unique words , out of which 4457 total emails presents



In [12]:
# Creating model taking input as vocalubary
# and  output as y_train

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(x_train_cv, y_train) 


MultinomialNB()

In [13]:
x_test_cv = v.transform(X_test)


In [14]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       967
           1       0.99      0.95      0.97       148

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [15]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
    "Upto 20% discount on parking"
]

emails_count = v.transform(emails)
model.predict(emails_count)


# from emails given above, we get the response as 0 or 1
# whether it is spam or not 
# not 100 % correct ! 

array([0, 1, 1], dtype=int64)

In [16]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [5]:
#        STOP WORDS

from spacy.lang.en.stop_words import STOP_WORDS

doc = doc = nlp("We just opened our wings, the flying part is coming soon")

# Listed all stop words
for token in doc:
    if token.is_stop:
#         print(token)
        ...
        
def preprocess(text):
    """
    returns the string without STOP WORDS 
    """
    doc = nlp(text)
    
    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)  


preprocess("Musk wants time to prepare for a trial over his")   # STOP WORDS removed 

'Musk wants time prepare trial'

In [28]:
df.shape

(5572, 3)

In [31]:
 # WILL remove all the stop word for whole column
df['message without STOP words'] = df.Message.apply(preprocess)  


Unnamed: 0,Category,Message,spam,message without STOP words
0,ham,"Go until jurong point, crazy.. Available only ...",0,"jurong point , crazy .. Available bugis n grea..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar ... Joking wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,U dun early hor ... U c ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"Nah think goes usf , lives"


In [33]:
df.head()

Unnamed: 0,Category,Message,spam,message without STOP words
0,ham,"Go until jurong point, crazy.. Available only ...",0,"jurong point , crazy .. Available bugis n grea..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar ... Joking wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,U dun early hor ... U c ...
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"Nah think goes usf , lives"


In [15]:
#                                          EXERCISES                                         


#  Language translation: Say you want to translate following sentence from english to telugu. 
#     Before actual translation if you remove stop words and then translate, it will produce horrible result

# preprocess("how are you doing dhaval?")

#  Sentiment detection: Not always but in some cases,
# based on your dataset it can change the sentiment of a sentence if you remove stop words
# Exercise 1:

# Spacy default implementation considers "not" as a stop word. 
# But in some scenarios removing 'not' will completely change 
# the meaning of the statement/text. For Example, consider these two statements:
    # - this is a good movie       ----> Positive Statement
    # - this is not a good movie   ----> Negative Statement

preprocess("this is a good movie")   # --> 'good movie'

preprocess("this is not a good movie")   # --> 'good movie'


#Step1: remove the stopword 'not' in spacy
nlp.vocab['not'].is_stop = False

#step2: send the two texts given above into the pre-process function and store the transformed texts
positive_text = preprocess('this is a good movie')
negative_text = preprocess('this is not a good movie')

positive_text, negative_text   # --> ('good movie', 'not good movie')




('good movie', 'not good movie')

In [8]:
#  Exercise 2 
 text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and 
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''
stopped_words = 0
req_words = 0
doc = nlp(text)

# Listed all stop words
for token in doc:
    if token.is_stop:
        stopped_words += 1
    else:
        req_words+=1
print(stopped_words, req_words)

#step2: print the percentage of stop words compared to total words in the text
percentage_stop_words = (stopped_words / req_words) * 100
print(f"Percentage of Stop words presented in the given text: {percentage_stop_words} %")

40 120
Percentage of Stop words presented in the given text: 33.33333333333333 %


In [61]:
# Exercise3:

# From a given text, output the most frequently used 
# token after removing all the stop word tokens and punctuations in it.

text = ''' The India men's national cricket team, also known as Team India or the Men in Blue, represents India in men's international cricket.
It is governed by the Board of Control for Cricket in India (BCCI), and is a Full Member of the International Cricket Council (ICC) with Test,
One Day International (ODI) and Twenty20 International (T20I) status. Cricket was introduced to India by British sailors in the 18th century, and the 
first cricket club was established in 1792. India's national cricket team played its first Test match on 25 June 1932 at Lord's, becoming the sixth team to be
granted test cricket status.
'''

# #step2: remove all the stop words and punctuations and store all the remaining tokens in a new list
# remaining_tokens = []
# for token in doc:
#   if token.is_stop or token.is_punct:    #check whether a given token is stop word or punctuations
#     continue
#   remaining_tokens.append(token.text)

# remaining_tokens
total_freq = {}
doc = nlp(text)
for token in doc:
    if not token.is_punct or token.is_stop :
            if  token.is_alpha:
#                 print(str(token).lower())   # DO THIS OR      token.text    to convert from spacy token to text
                token = str(token).lower()
                if token in total_freq:
                    total_freq[token] +=1
                else:
                    total_freq[token] =1

total_freq

                    
# FINDING MAX AlPHA comming after 
# va = 0
# for k,v in total_freq.items():
#     if v > va:
#         va = v
#         print(k,v)

            


{'the': 7,
 'india': 6,
 'men': 3,
 'national': 2,
 'cricket': 8,
 'team': 4,
 'also': 1,
 'known': 1,
 'as': 1,
 'or': 1,
 'in': 5,
 'blue': 1,
 'represents': 1,
 'international': 4,
 'it': 1,
 'is': 2,
 'governed': 1,
 'by': 2,
 'board': 1,
 'of': 2,
 'control': 1,
 'for': 1,
 'bcci': 1,
 'and': 3,
 'a': 1,
 'full': 1,
 'member': 1,
 'council': 1,
 'icc': 1,
 'with': 1,
 'test': 3,
 'one': 1,
 'day': 1,
 'odi': 1,
 'status': 2,
 'was': 2,
 'introduced': 1,
 'to': 2,
 'british': 1,
 'sailors': 1,
 'century': 1,
 'first': 2,
 'club': 1,
 'established': 1,
 'played': 1,
 'its': 1,
 'match': 1,
 'on': 1,
 'june': 1,
 'at': 1,
 'lord': 1,
 'becoming': 1,
 'sixth': 1,
 'be': 1,
 'granted': 1}