In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]


In [2]:
datasets = pd.read_csv('spam1.csv') 

datasets = datasets.rename(columns={"v1":"label", "v2":"text"})
datasets['label'] = datasets['label'].replace({'spam':1, 'ham':0})
datasets.head()

Unnamed: 0,label,text
0,1,Free entry in 2 a wkly comp to win FA Cup fina...
1,1,FreeMsg Hey there darling it's been 3 week's n...
2,1,WINNER!! As a valued network customer you have...
3,1,Had your mobile 11 months or more? U R entitle...
4,1,"SIX chances to win CASH! From 100 to 20,000 po..."


In [3]:
datasets.describe()

Unnamed: 0,label
count,513.0
mean,0.335283
std,0.47255
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [4]:
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513 entries, 0 to 512
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   513 non-null    int64 
 1   text    513 non-null    object
dtypes: int64(1), object(1)
memory usage: 8.1+ KB


In [5]:
datasets['label'].value_counts() # 1-> spam, 0->ham

0    341
1    172
Name: label, dtype: int64

##Analysis

To analyze the text data, we have to turn the words into numerical numbers. 
We have multiple choices to accomplish this step: 

1) Binary Term Frequency :  count presence(1) or absence(0) for term in document

2) Bag of Words Frequency:  captures the frequency of term in document

3) Term Frequency: 

4) TFIDF :

in this way, if a term appears frequently in a document, it’s important; if a term appears in many documents, it’s not a unique identifier.

Word2Vec.

In [6]:
text = datasets.iloc[:,1]

# converting to lower case
lower_case_text = []
lower_case_text = [d.lower() for d in text]

# removing punctuations
sans_punctuation_text = []
import string
for i in lower_case_text:
    sans_punctuation_text.append(i.translate(str.maketrans("","", string.punctuation)))

# tokenization
preprocessed_text = [[w for w in d.split()] for d in sans_punctuation_text]

# count frequencies
frequency_list = []
import pprint
from collections import Counter

frequency_list = [Counter(d) for d in preprocessed_text]
pprint.pprint(frequency_list)

[Counter({'to': 3,
          'entry': 2,
          'fa': 2,
          'free': 1,
          'in': 1,
          '2': 1,
          'a': 1,
          'wkly': 1,
          'comp': 1,
          'win': 1,
          'cup': 1,
          'final': 1,
          'tkts': 1,
          '21st': 1,
          'may': 1,
          '2005': 1,
          'text': 1,
          '87121': 1,
          'receive': 1,
          'questionstd': 1,
          'txt': 1,
          'ratetcs': 1,
          'apply': 1,
          '08452810075over18s': 1}),
 Counter({'to': 2,
          'freemsg': 1,
          'hey': 1,
          'there': 1,
          'darling': 1,
          'its': 1,
          'been': 1,
          '3': 1,
          'weeks': 1,
          'now': 1,
          'and': 1,
          'no': 1,
          'word': 1,
          'back': 1,
          'id': 1,
          'like': 1,
          'some': 1,
          'fun': 1,
          'you': 1,
          'up': 1,
          'for': 1,
          'it': 1,
          'still': 1,
       

#Next we use CountVectorizer:

More Details and example at:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

count_vector.fit(text)
count_vector.get_feature_names()

doc_array = count_vector.transform(text).toarray()

frequency_matrix = pd.DataFrame(doc_array, columns = count_vector.get_feature_names())
frequency_matrix

Unnamed: 0,00,000,0121,02,0207,02073162414,021,03,04,050703,...,yourinclusive,yours,yourself,yoville,yr,yrs,yuo,yup,zed,zouk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
509,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
510,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
511,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.model_selection import train_test_split

#split data set into train and test sets
train_x, test_x, train_y, test_y = train_test_split(datasets['text'], datasets['label'], test_size = 0.2, random_state = 2) # roll no 2 and 80%-20% split

In [9]:
# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(train_x)

# Transform testing data and return the matrix. 
testing_data = count_vector.transform(test_x)

**Naive Bayes**

In [10]:
from sklearn.naive_bayes import MultinomialNB #it gives more promising result and data is repeating
naive_bayes = MultinomialNB()
final_model = naive_bayes.fit(training_data,train_y)

In [11]:
#making prediction
prediction = naive_bayes.predict(testing_data)
print("prediction:",prediction)

prediction: [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 0 1 0 0 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0]


In [12]:
print('Accuracy score: {}'.format(accuracy_score(test_y, prediction)))
print("Confusion Matrix: \n{}".format(confusion_matrix(test_y, prediction)))
print('Precision score: {}'.format(precision_score(test_y, prediction)))
print('Recall score: {}'.format(recall_score(test_y, prediction)))

Accuracy score: 0.970873786407767
Confusion Matrix: 
[[75  0]
 [ 3 25]]
Precision score: 1.0
Recall score: 0.8928571428571429


**Decision Tree**

In [13]:
from sklearn.tree import DecisionTreeClassifier

#Create a Decision Tree Classifier (using Gini)
DT = DecisionTreeClassifier(max_leaf_nodes = 20,criterion = "gini")

#Train the model using the training sets
final_model = DT.fit(training_data, train_y)

In [14]:
prediction = DT.predict(testing_data)
print("prediction:",prediction)

prediction: [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 1 1 1 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0]


In [15]:
print('Accuracy score: {}'.format(accuracy_score(test_y, prediction)))
print("Confusion Matrix: \n{}".format(confusion_matrix(test_y, prediction)))
print('Precision score: {}'.format(precision_score(test_y, prediction)))
print('Recall score: {}'.format(recall_score(test_y, prediction)))

Accuracy score: 0.883495145631068
Confusion Matrix: 
[[70  5]
 [ 7 21]]
Precision score: 0.8076923076923077
Recall score: 0.75


**Optional Exercise:**
Try this on full spam.csv file and bigram matching instead of unigram matching 