In [1]:
pip install tensorflow



In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]


In [10]:
data = pd.read_csv('spam.csv',encoding='latin-1') 
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":"label", "v2":"text"})
print(data.describe())
data.groupby("label").describe()

       label                    text
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [12]:
#The dataset contains 4825 ham and 747 spam messages.
#For both classes, some messages appear more than once (common phrases, etc.).

data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [13]:
#Add numerical label for spam
#Target must be numerical for ML classification models
data['spam'] = data['label'].map( {'spam': 1, 'ham': 0} ).astype(int)
data[0:10]

Unnamed: 0,label,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [14]:
#Add feature: length of message
data['length'] = data['text'].apply(len)
data[0:10]

Unnamed: 0,label,text,spam,length
0,ham,"Go until jurong point, crazy.. Available only ...",0,111
1,ham,Ok lar... Joking wif u oni...,0,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,155
3,ham,U dun say so early hor... U c already then say...,0,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,61
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1,148
6,ham,Even my brother is not like to speak with me. ...,0,77
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0,160
8,spam,WINNER!! As a valued network customer you have...,1,158
9,spam,Had your mobile 11 months or more? U R entitle...,1,154


In [15]:
data_ham  = data[data['spam'] == 0].copy()
data_spam = data[data['spam'] == 1].copy()

In [16]:
#Remove Punctuation and Stopwords
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
def remove_punctuation_and_stopwords(sms):
    
    sms_no_punctuation = [ch for ch in sms if ch not in string.punctuation]
    sms_no_punctuation = "".join(sms_no_punctuation).split()
    
    sms_no_punctuation_no_stopwords = \
        [word.lower() for word in sms_no_punctuation if word.lower() not in stopwords]
        
    return sms_no_punctuation_no_stopwords

In [21]:
data['text'].apply(remove_punctuation_and_stopwords)
data['text'][0:10]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: text, dtype: object

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
bow_transformer = CountVectorizer(analyzer = remove_punctuation_and_stopwords).fit(data['text'])
print(len(bow_transformer.vocabulary_))
print(type(bow_transformer.vocabulary_))
bow_transformer.vocabulary_

9449
<class 'dict'>


{'go': 3750,
 'jurong': 4647,
 'point': 6388,
 'crazy': 2463,
 'available': 1375,
 'bugis': 1840,
 'n': 5631,
 'great': 3847,
 'world': 9125,
 'la': 4807,
 'e': 2990,
 'buffet': 1838,
 'cine': 2179,
 'got': 3807,
 'amore': 1142,
 'wat': 8886,
 'ok': 5952,
 'lar': 4846,
 'joking': 4615,
 'wif': 9019,
 'u': 8549,
 'oni': 5984,
 'free': 3541,
 'entry': 3125,
 '2': 415,
 'wkly': 9076,
 'comp': 2295,
 'win': 9034,
 'fa': 3263,
 'cup': 2518,
 'final': 3387,
 'tkts': 8326,
 '21st': 435,
 'may': 5294,
 '2005': 422,
 'text': 8165,
 '87121': 838,
 'receive': 6787,
 'questionstd': 6677,
 'txt': 8533,
 'ratetcs': 6730,
 'apply': 1228,
 '08452810075over18s': 72,
 'dun': 2976,
 'say': 7143,
 'early': 2997,
 'hor': 4178,
 'c': 1889,
 'already': 1116,
 'nah': 5638,
 'dont': 2884,
 'think': 8239,
 'goes': 3764,
 'usf': 8679,
 'lives': 5008,
 'around': 1279,
 'though': 8258,
 'freemsg': 3549,
 'hey': 4074,
 'darling': 2583,
 '3': 520,
 'weeks': 8939,
 'now': 5844,
 'no': 5773,
 'word': 9111,
 'back': 14

In [31]:
bow_data = bow_transformer.transform(data['text'])
bow_data.shape

(5572, 9449)

##Analysis

To analyze the text data, we have to turn the words into numerical numbers. 
We have multiple choices to accomplish this step: 

1) Binary Term Frequency :  count presence(1) or absence(0) for term in document

2) Bag of Words Frequency:  captures the frequency of term in document

3) Term Frequency: 

4) TFIDF :

in this way, if a term appears frequently in a document, it’s important; if a term appears in many documents, it’s not a unique identifier.

Word2Vec.

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(bow_data)

sample_ham = data['text'][5]
bow_sample_ham = bow_transformer.transform([sample_ham])
tfidf_sample_ham = tfidf_transformer.transform(bow_sample_ham)
print(tfidf_sample_ham)
print('---------------------------------------')
sample_spam = data['text'][5]
bow_sample_spam = bow_transformer.transform([sample_spam])
tfidf_sample_spam = tfidf_transformer.transform(bow_sample_spam)
print(tfidf_sample_spam)

  (0, 9395)	0.22355928596527508
  (0, 9250)	0.21772285717486242
  (0, 9111)	0.21563270494897763
  (0, 8939)	0.2355115670310063
  (0, 8095)	0.2847559861555369
  (0, 7807)	0.1634669961682486
  (0, 7782)	0.2601337765932716
  (0, 7237)	0.1561057693607126
  (0, 6742)	0.30290170745527656
  (0, 5952)	0.1429266188866884
  (0, 5844)	0.12397736640603786
  (0, 5773)	0.13735732158192449
  (0, 4958)	0.14922006929305762
  (0, 4297)	0.21772285717486242
  (0, 4074)	0.17463345498191235
  (0, 3612)	0.21994372652941777
  (0, 3549)	0.25081398408376504
  (0, 2583)	0.27827949789301126
  (0, 2124)	0.31730477673409957
  (0, 1426)	0.16465514504215803
  (0, 520)	0.1941937289227732
---------------------------------------
  (0, 9395)	0.22355928596527508
  (0, 9250)	0.21772285717486242
  (0, 9111)	0.21563270494897763
  (0, 8939)	0.2355115670310063
  (0, 8095)	0.2847559861555369
  (0, 7807)	0.1634669961682486
  (0, 7782)	0.2601337765932716
  (0, 7237)	0.1561057693607126
  (0, 6742)	0.30290170745527656
  (0, 5952)	0

#Next we use CountVectorizer:

More Details and example at:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [36]:
#MINOR CORRECTION IN LAB'S CODE:

#BY MISTAKE WE HAVE APPLIED "FIT_TRANSFORM" TO TEST DATA ALSO INSTEAD OF ONLY "TRANSFORM"
#AND WHENEVER WE HAVE CORRECTED IT, THE XTRAIN,XTEST WAS ALREADY MODIFIED
#SO, SIMPLE RERUN YOUR TRAIN_TEST_SPLIT CODE AND NEXT IF YOU TRY WITH THE GIVEN CODE, IT WILL WORK

#Hope You got the Point !!!!!!

In [41]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

**Naive Bayes**

In [44]:
#import the necessary module
from sklearn.model_selection import train_test_split

data_tfidf = tfidf_transformer.transform(bow_data)

#split data set into train and test sets
data_tfidf_train, data_tfidf_test, label_train, label_test = \
    train_test_split(data_tfidf, data["spam"], test_size=0.3, random_state=57)



In [45]:
data_tfidf_train = data_tfidf_train.A
data_tfidf_test = data_tfidf_test.A

from sklearn.naive_bayes import GaussianNB, MultinomialNB

#Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics  import accuracy_score

spam_detect_model = GaussianNB().fit(data_tfidf_train, label_train)
pred_test_MNB = spam_detect_model.predict(data_tfidf_test)
acc_MNB = accuracy_score(label_test, pred_test_MNB)
print(acc_MNB)

0.8983253588516746


**Decision Tree**

In [46]:
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
#Create a Decision Tree Classifier (using Entropy)
clf_entropy=DecisionTreeClassifier(criterion="entropy")

# Train the model using the training sets
clf_entropy.fit(data_tfidf_train, label_train)
pred_test_MNB = clf_entropy.predict(data_tfidf_test)
acc_MNB = accuracy_score(label_test, pred_test_MNB)
print(acc_MNB)

0.9485645933014354
