####**Importing necessary libraries**

In [19]:
import re
import string
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize

####**Loading Data**

In [3]:
df = pd.read_csv("/content/SMSSpamCollection", sep="\t", header=None)

print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
cols = ['label', 'context']
df.columns = cols

In [6]:
df.head()

Unnamed: 0,label,context
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


####**Feature Engineering**

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   context  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
df.isnull().sum()

label      0
context    0
dtype: int64

In [13]:
df['label'].value_counts()

# we can see it's kind of an imbalanced dataset but in this tutorial we don't care, we just want to learn an end to end nlp project

ham     4825
spam     747
Name: label, dtype: int64

In [10]:
df['length'] = df['context'].apply(lambda x: len(x))
df.head()

Unnamed: 0,label,context,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [11]:
# calculating max length of sentences
lens = []
for sentence in df['context']:
    lens.append(len(sentence))

print(max(lens))

910


####**Text Cleaning**

In [21]:
def convert_to_lower(text):
    return text.lower()

def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)

def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

In [24]:
df['context'] = df['context'].apply(lambda x: convert_to_lower(x))
df['context'] = df['context'].apply(lambda x: remove_numbers(x))
df['context'] = df['context'].apply(lambda x: remove_punctuation(x))
df['context'] = df['context'].apply(lambda x: remove_stopwords(x))
df['context'] = df['context'].apply(lambda x: remove_extra_white_spaces(x))
df['context'] = df['context'].apply(lambda x: lemmatizing(x))

In [26]:
df['length_after_cleaning'] = df['context'].apply(lambda x: len(x))
df.head()

Unnamed: 0,label,context,length,length_after_cleaning
0,ham,go jurong point crazy available bugis great wo...,111,78
1,ham,ok lar joking wif oni,29,21
2,spam,free entry wkly comp win fa cup final tkts st ...,155,101
3,ham,u dun say early hor c already say,49,33
4,ham,nah dont think go usf life around though,61,40


####**Vectorization**

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [29]:
# converting labels to numbers

label_map = {
    'ham': 0,
    'spam': 1,
}

df['label'] = df['label'].map(label_map)
df.head()

Unnamed: 0,label,context,length,length_after_cleaning
0,0,go jurong point crazy available bugis great wo...,111,78
1,0,ok lar joking wif oni,29,21
2,1,free entry wkly comp win fa cup final tkts st ...,155,101
3,0,u dun say early hor c already say,49,33
4,0,nah dont think go usf life around though,61,40


In [30]:
# COUNT VECTORIZATION

cv = CountVectorizer()
X_cv = cv.fit_transform(df['context'])
X_cv = X_cv.toarray()

In [31]:
# NGRAM VECTORIZATION

cv_ngram = CountVectorizer(ngram_range=(1,3))
X_cv_ngram = cv_ngram.fit_transform(df['context'])
X_cv_ngram = X_cv_ngram.toarray()

In [32]:
# TFIDF VECTORIZATION

tf = TfidfVectorizer()
X_tf = tf.fit_transform(df['context'])
X_tf = X_tf.toarray()

In [33]:
X_cv.shape, X_cv_ngram.shape, X_tf.shape

((5572, 7906), (5572, 68742), (5572, 7906))

####**Splitting data**

In [37]:
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_cv, df['label'].values, test_size=0.2)
X_train_ngram, X_test_ngram, y_train_ngram, y_test_ngram = train_test_split(X_cv_ngram, df['label'].values, test_size=0.2)
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X_tf, df['label'].values, test_size=0.2)

####**Machine Learning Model**

In [38]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

**using Count vectorized matrix**

In [39]:
naiveBayes = GaussianNB()
naiveBayes.fit(X_train_cv, y_train_cv)

GaussianNB(priors=None, var_smoothing=1e-09)

In [41]:
y_pred_cv = naiveBayes.predict(X_test_cv)

print(accuracy_score(y_test_cv, y_pred_cv))

0.8762331838565023


**using n-gram vectorized matrix**

In [42]:
naiveBayes.fit(X_train_ngram, y_train_ngram)

y_pred_ngram = naiveBayes.predict(X_test_ngram)

print(accuracy_score(y_test_ngram, y_pred_ngram))

0.9264573991031391


**using tfidf vectorized matrix**

In [43]:
naiveBayes.fit(X_train_tf, y_train_tf)

y_pred_tf = naiveBayes.predict(X_test_tf)

print(accuracy_score(y_test_tf, y_pred_tf))

0.8807174887892377


####**Therefore, we got around 93% accuracy with Ngram Document Matrix**