<a href="https://colab.research.google.com/github/vitthal-bhandari/SMS-Spam-Detection/blob/master/spam_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
import os
import io
tf.__version__

'2.7.0'

In [None]:
# Download the zip file
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip", origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip", extract=True)

# Unzip the file into a folder
!unzip $path_to_zip -d data

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Archive:  /root/.keras/datasets/smsspamcollection.zip
  inflating: data/SMSSpamCollection  
  inflating: data/readme             


In [None]:
# Let's see if we read the data correctly
lines = io.open('data/SMSSpamCollection').read().strip().split('\n')
lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
spam_dataset = []
count = 0
for line in lines:
  label, text = line.split('\t')
  if label.lower().strip() == 'spam':
    spam_dataset.append((1, text.strip()))
    count += 1
  else:
    spam_dataset.append(((0, text.strip())))
print(spam_dataset[0])
print("Spam: ", count)

(0, 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')
Spam:  747


In [None]:
import pandas as pd

df = pd.DataFrame(spam_dataset, columns=['Spam', 'Message'])

In [None]:
df.head()

Unnamed: 0,Spam,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
import re

def message_length(x):
  #returns total number of characters
  return len(x)

def num_capitals(x):
  _, count = re.subn(r'[A-Z]', '', x)
  return count

def num_punctuation(x):
  _, count = re.subn(r'\W', '', x)
  return count

In [None]:
df['Capitals'] = df['Message'].apply(num_capitals)
df['Punctuation'] = df['Message'].apply(num_punctuation)
df['Length'] = df['Message'].apply(message_length)
df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [None]:
train=df.sample(frac=0.8,random_state=42) #random state is a seed value
test=df.drop(train.index)
x_train = train[['Length', 'Capitals', 'Punctuation']]
y_train = train[['Spam']]
x_test = test[['Length', 'Capitals', 'Punctuation']]
y_test = test[['Spam']]

In [None]:
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439
std,0.339359,11.405424,14.602023,59.346407
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,35.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [None]:
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157
std,0.346116,12.731059,15.694599,61.807655
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,28.0,123.0
max,1.0,127.0,195.0,790.0


In [None]:
# Basic 1-layer neural network model for evaluation
def make_model(input_dims=3, num_units=12):
  model=tf.keras.Sequential()
  # Adds a densely-connected layer with 12 units to the model:
  model.add(tf.keras.layers.Dense(num_units, input_dim=input_dims, activation='relu'))
  # Add a sigmoid layer with a binary output unit:
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [None]:
x_train

Unnamed: 0,Length,Capitals,Punctuation
3690,25,1,4
3527,161,107,48
724,40,1,7
3370,69,3,17
468,37,1,8
...,...,...,...
3280,444,44,114
3186,65,50,14
3953,81,2,23
2768,38,2,8


In [None]:
model=make_model()
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0137660850>

In [None]:
model.evaluate(x_test, y_test)



[0.19691359996795654, 0.9345291256904602]

In [None]:
import numpy as np
y_train_pred=(model.predict(x_train) > 0.5).astype("int32")
# y_train_pred = model.predict(x_train)
# y_train_pred=np.argmax(y_train_pred,axis=1)

In [None]:
# confusion matrix
tf.math.confusion_matrix(tf.constant(y_train.Spam), y_train_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[3742,  125],
       [ 154,  438]], dtype=int32)>

In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.3.0-py3-none-any.whl (432 kB)
[?25l[K     |▊                               | 10 kB 22.4 MB/s eta 0:00:01[K     |█▌                              | 20 kB 23.1 MB/s eta 0:00:01[K     |██▎                             | 30 kB 10.3 MB/s eta 0:00:01[K     |███                             | 40 kB 8.5 MB/s eta 0:00:01[K     |███▉                            | 51 kB 5.5 MB/s eta 0:00:01[K     |████▌                           | 61 kB 5.6 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 5.5 MB/s eta 0:00:01[K     |██████                          | 81 kB 6.2 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 6.2 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 5.3 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 5.3 MB/s eta 0:00:01[K     |█████████                       | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 5.3 MB/s eta 0:00:01[K  

In [None]:
import stanza

In [None]:
en = stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2021-12-25 14:50:31 INFO: Downloading default packages for language: en (English)...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.3.0/models/default.zip:   0%|          | 0…

2021-12-25 14:50:58 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
en = stanza.Pipeline(lang='en')

2021-12-25 14:50:58 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2021-12-25 14:50:58 INFO: Use device: gpu
2021-12-25 14:50:58 INFO: Loading: tokenize
2021-12-25 14:51:09 INFO: Loading: pos
2021-12-25 14:51:09 INFO: Loading: lemma
2021-12-25 14:51:09 INFO: Loading: depparse
2021-12-25 14:51:10 INFO: Loading: sentiment
2021-12-25 14:51:10 INFO: Loading: constituency
2021-12-25 14:51:11 INFO: Loading: ner
2021-12-25 14:51:11 INFO: Done loading processors!


In [None]:
sentence = 'Go until jurong point, crazy.. Available only in bugis n great world'
sentence.split()

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world']

In [None]:
tokenized = en(sentence)

In [None]:
len(tokenized.sentences)

2

In [None]:
for snt in tokenized.sentences:
  for word in snt.tokens:
    print(word.text)
  print("<End of Sentence>")

Go
until
jurong
point
,
crazy
..
<End of Sentence>
Available
only
in
bugis
n
great
world
<End of Sentence>


In [None]:
!pip install stopwordsiso

Collecting stopwordsiso
  Downloading stopwordsiso-0.6.1-py3-none-any.whl (73 kB)
[?25l[K     |████▌                           | 10 kB 21.9 MB/s eta 0:00:01[K     |█████████                       | 20 kB 12.0 MB/s eta 0:00:01[K     |█████████████▍                  | 30 kB 9.9 MB/s eta 0:00:01[K     |█████████████████▉              | 40 kB 8.8 MB/s eta 0:00:01[K     |██████████████████████▎         | 51 kB 5.1 MB/s eta 0:00:01[K     |██████████████████████████▊     | 61 kB 5.7 MB/s eta 0:00:01[K     |███████████████████████████████▏| 71 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████████████| 73 kB 1.4 MB/s 
[?25hInstalling collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1


In [None]:
import stopwordsiso as stopwords

stopwords.langs()

{'af',
 'ar',
 'bg',
 'bn',
 'br',
 'ca',
 'cs',
 'da',
 'de',
 'el',
 'en',
 'eo',
 'es',
 'et',
 'eu',
 'fa',
 'fi',
 'fr',
 'ga',
 'gl',
 'gu',
 'ha',
 'he',
 'hi',
 'hr',
 'hu',
 'hy',
 'id',
 'it',
 'ja',
 'ko',
 'ku',
 'la',
 'lt',
 'lv',
 'mr',
 'ms',
 'nl',
 'no',
 'pl',
 'pt',
 'ro',
 'ru',
 'sk',
 'sl',
 'so',
 'st',
 'sv',
 'sw',
 'th',
 'tl',
 'tr',
 'uk',
 'ur',
 'vi',
 'yo',
 'zh',
 'zu'}

In [None]:
sorted(stopwords.stopwords('en'))

["'ll",
 "'tis",
 "'twas",
 "'ve",
 '10',
 '39',
 'a',
 "a's",
 'able',
 'ableabout',
 'about',
 'above',
 'abroad',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'ad',
 'added',
 'adj',
 'adopted',
 'ae',
 'af',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'ag',
 'again',
 'against',
 'ago',
 'ah',
 'ahead',
 'ai',
 "ain't",
 'aint',
 'al',
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'am',
 'amid',
 'amidst',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'ao',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'aq',
 'ar',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'arpa',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 'asks',
 'associated

In [None]:
en = stanza.Pipeline(lang='en', processors='tokenize')
en_sw = stopwords.stopwords('en')
def word_counts(x, pipeline=en):
  doc = pipeline(x)
  count=0
  for sentence in doc.sentences:
    for token in sentence.tokens:
      if token.text.lower() not in en_sw:
        count += 1
  return count

2021-12-25 10:27:56 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-12-25 10:27:56 INFO: Use device: gpu
2021-12-25 10:27:56 INFO: Loading: tokenize
2021-12-25 10:27:56 INFO: Done loading processors!


In [None]:
df['Words'] = df['Message'].apply(word_counts)

In [None]:
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)

In [None]:
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)

In [None]:
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3458143610>

In [None]:
model.evaluate(x_test, y_test)



[0.2425527572631836, 0.913004457950592]

In [None]:
#after stopwords removal
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]
x_test = test[['Length', 'Punctuation', 'Capitals', 'Words']]
y_test = test[['Spam']]
model = make_model(input_dims=4)
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3458348810>

In [None]:
model.evaluate(x_test, y_test)



[0.2997250556945801, 0.8762331604957581]

In [None]:
en = stanza.Pipeline(lang='en')

txt = "Yo you around? A friend of mine's lookin."
pos = en(txt)

2021-12-25 10:32:42 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2021-12-25 10:32:42 INFO: Use device: gpu
2021-12-25 10:32:42 INFO: Loading: tokenize
2021-12-25 10:32:42 INFO: Loading: pos
2021-12-25 10:32:42 INFO: Loading: lemma
2021-12-25 10:32:42 INFO: Loading: depparse
2021-12-25 10:32:43 INFO: Loading: sentiment
2021-12-25 10:32:43 INFO: Loading: constituency
2021-12-25 10:32:44 INFO: Loading: ner
2021-12-25 10:32:45 INFO: Done loading processors!


In [None]:
def print_pos(doc):
    text = ""
    for sentence in doc.sentences:
        for token in sentence.tokens:
            text += token.words[0].text + "/" + token.words[0].upos + " "
        text += "\n"
    return text

In [None]:
print(print_pos(pos))

Yo/PRON you/PRON around/ADV ?/PUNCT 
A/DET friend/NOUN of/ADP mine/PRON 's/AUX lookin/ADJ ./PUNCT 



In [None]:
en = stanza.Pipeline(lang='en', processors='tokenize')
en_sw = stopwords.stopwords('en')

def word_counts_v3(x, pipeline=en):
  doc = pipeline(x)
  totals = 0.
  count = 0.
  non_word = 0.
  for sentence in doc.sentences:
    totals += len(sentence.tokens)  # (1)
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          if token.words[0].upos not in ['PUNCT', 'SYM']:
            count += 1.
          else:
            non_word += 1.
  non_word = non_word / totals
  return pd.Series([count, non_word], index=['Words_NoPunct', 'Punct'])

2021-12-25 10:32:45 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-12-25 10:32:45 INFO: Use device: gpu
2021-12-25 10:32:45 INFO: Loading: tokenize
2021-12-25 10:32:45 INFO: Done loading processors!


In [None]:
train_tmp = train['Message'].apply(word_counts_v3)
train = pd.concat([train, train_tmp], axis=1)
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Words_NoPunct,Punct
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439,9.213725,9.213725,0.0
std,0.339359,11.405424,14.602023,59.346407,8.003355,8.003355,0.0
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,0.0,1.0,8.0,35.0,4.0,4.0,0.0
50%,0.0,2.0,15.0,61.0,7.0,7.0,0.0
75%,0.0,4.0,27.0,122.0,13.0,13.0,0.0
max,1.0,129.0,253.0,910.0,157.0,157.0,0.0


In [None]:
test_tmp = test['Message'].apply(word_counts_v3)
test = pd.concat([test, test_tmp], axis=1)
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Words_NoPunct,Punct
count,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157,9.498655,9.498655,0.0
std,0.346116,12.731059,15.694599,61.807655,8.212,8.212,0.0
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,0.0,1.0,8.0,36.0,4.0,4.0,0.0
50%,0.0,2.0,15.0,61.0,7.0,7.0,0.0
75%,0.0,4.0,28.0,123.0,14.0,14.0,0.0
max,1.0,127.0,195.0,790.0,82.0,82.0,0.0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
corpus = [
"I like fruits. Fruits like bananas",
"I love bananas but eat an apple",
"An apple a day keeps the doctor away"
]

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

array(['an', 'apple', 'away', 'bananas', 'but', 'day', 'doctor', 'eat',
       'fruits', 'keeps', 'like', 'love', 'the'], dtype=object)

In [None]:
X.toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0],
       [1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1]])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X.toarray())

array([[1.        , 0.13608276, 0.        ],
       [0.13608276, 1.        , 0.3086067 ],
       [0.        , 0.3086067 , 1.        ]])

In [None]:
query = vectorizer.transform(["apple and bananas"])
cosine_similarity(X, query)

array([[0.23570226],
       [0.57735027],
       [0.26726124]])

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())
pd.DataFrame(tfidf.toarray(),
columns=vectorizer.get_feature_names())



Unnamed: 0,an,apple,away,bananas,but,day,doctor,eat,fruits,keeps,like,love,the
0,0.0,0.0,0.0,0.230408,0.0,0.0,0.0,0.0,0.688081,0.0,0.688081,0.0,0.0
1,0.321267,0.321267,0.0,0.321267,0.479709,0.0,0.0,0.479709,0.0,0.0,0.0,0.479709,0.0
2,0.275785,0.275785,0.411797,0.0,0.0,0.411797,0.411797,0.0,0.0,0.411797,0.0,0.0,0.411797


In [None]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
model_w2v = api.load("word2vec-google-news-300")

