In [1]:
#Use-case : SMS Spam Classification
#Goal: You need to create a model that can predict whether the given sms is a spam or ham sms

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('SMSSpamCollection', sep='\t', names=['label','message'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
# Since the dataset in unbalanced, for generalization we will use ACCURACY as the metric
# for quality we will use F1 Score

In [7]:
# Seperate the data as features and label
# Will ensure they are in Numpy form

features = data.iloc[:,[1]].values
label = data.iloc[:,[0]].values

In [8]:
# Performing Text Preprocessing

# We will create a text preprocessing function that can perform teh following:
# 1. Remove Punctuations
# 2. Extract words out of the sentences
# 3. Normalize the data (lowercase)
# 4. Remove Stopwords

In [14]:
import string

text = "Welcome to Simplilearn! You are learning N.L.P."

#for char in text:
#  if char not in string.punctuation:
#    print(char)


''.join([char for char in text  if char not in string.punctuation])

'Welcome to Simplilearn You are learning NLP'

In [16]:
sent = ''.join([char for char in text  if char not in string.punctuation])
sent.split()

['Welcome', 'to', 'Simplilearn', 'You', 'are', 'learning', 'NLP']

In [18]:
#for word in sent.split():
#  print(word.lower())

[ word.lower() for word in sent.split()]

['welcome', 'to', 'simplilearn', 'you', 'are', 'learning', 'nlp']

In [21]:
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
import nltk
from nltk.corpus import stopwords
import string

def textPreprocessing(document):
  #1. Remove Punctuations
  sentWithoutPunct = ''.join([char for char in document  if char not in string.punctuation])
  #2. Extract words out of the sentences
  words = sentWithoutPunct.split()
  #3. Normalize the data (lowercase)
  wordNormalized = [word.lower() for word in words]
  # 4. Remove Stopwords
  vocabulary = [word for word in wordNormalized if word not in stopwords.words('english')]

  return vocabulary



In [23]:
textPreprocessing("Welcome to Simplilearn! You are enjoying N.L.P.")

['welcome', 'simplilearn', 'enjoying', 'nlp']

In [24]:
# Create BOW in SKlearn

from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=textPreprocessing)

#Build the Vocabulary
finalWordVectorVocab = wordVector.fit(features)

In [25]:
finalWordVectorVocab.vocabulary_

{'go': 5383,
 'jurong': 6667,
 'point,': 9177,
 'crazy..': 3569,
 'available': 2053,
 'bugis': 2687,
 'n': 8081,
 'great': 5518,
 'world': 12979,
 'la': 6872,
 'e': 4290,
 'buffet...': 2685,
 'cine': 3172,
 'got': 5475,
 'amore': 1735,
 'wat...': 12612,
 'ok': 8534,
 'lar...': 6928,
 'joking': 6624,
 'wif': 12824,
 'u': 12147,
 'oni...': 8607,
 'free': 5075,
 'entry': 4492,
 '2': 748,
 'wkly': 12907,
 'comp': 3349,
 'win': 12844,
 'fa': 4685,
 'cup': 3639,
 'final': 4861,
 'tkts': 11811,
 '21st': 785,
 'may': 7564,
 '2005.': 774,
 'text': 11540,
 '87121': 1256,
 'receive': 9702,
 'question(std': 9539,
 'txt': 12123,
 "rate)t&c's": 9613,
 'apply': 1854,
 "08452810075over18's": 369,
 'dun': 4272,
 'say': 10191,
 'early': 4308,
 'hor...': 6015,
 'c': 2759,
 'already': 1689,
 'say...': 10195,
 'nah': 8093,
 'think': 11677,
 'goes': 5410,
 'usf,': 12348,
 'lives': 7183,
 'around': 1924,
 'though': 11714,
 'freemsg': 5094,
 'hey': 5856,
 'darling': 3733,
 '3': 879,
 "week's": 12686,
 'word':

In [26]:
# To create BOW
bagOfWords = finalWordVectorVocab.transform(features)

In [27]:
# Apply TFIDF Algo on BOW to create a feature set

from sklearn.feature_extraction.text import TfidfTransformer
tfidfObject = TfidfTransformer().fit(bagOfWords)  #Calc IDF Values

In [28]:
# Lets create Numeric Feature set

processedFeatures = tfidfObject.transform(bagOfWords)

In [42]:
# Create Train Test Split (90% training -10% testing)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(processedFeatures,
                                                 label,
                                                 test_size=0.1,
                                                 random_state=1)

In [43]:
#Build a Model using LogisticRegression

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)


LogisticRegression()

In [44]:
#Check whether the model is generalized or not
print("Training score is ",model.score(X_train,y_train))
print("Testing score is ",model.score(X_test,y_test))

Training score is  0.9623055444754687
Testing score is  0.9695340501792115


In [45]:
# Check whether the Quality of the model adheres the expected standards
# considering SL = 0.45

from sklearn.metrics import classification_report

print(classification_report(label,model.predict(processedFeatures)))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      4825
        spam       0.99      0.73      0.84       747

    accuracy                           0.96      5572
   macro avg       0.98      0.86      0.91      5572
weighted avg       0.96      0.96      0.96      5572



In [47]:
# Deploy

smsInput = input("Enter SMS: ")

# Preprocess the input

preProcessedFeature = textPreprocessing(smsInput)

#BOW

bowFeature = finalWordVectorVocab.transform(preProcessedFeature)

#TFIDF

processFeature = tfidfObject.transform(bowFeature)

#Predict

predLabel = model.predict(processFeature)[0]

print("Given SMS is a {} sms".format(predLabel))



Enter SMS: Win Lottery Guaranteed!
Given SMS is a spam sms
