# NLP

In [1]:
# PIPELINE

# 1. Raw text - model can't distinguish the words.
# 2. Tokenize - Split the words so the model knows what to look at
# 3. Clean text - remove stopwords, punctuation, lemmatization, etc.
# 4. Vectorize - converting word occurence to numeric form
# 5. ML - train the model on the vectorized data.

## Stemming

In [2]:
# We use the porter stemmer - used to stem the words to the root.
import nltk
#dir(nltk)
porter_stemmer = nltk.PorterStemmer()

In [3]:
list_words = ['grows', 'growing', 'grower', 'growing']

for each_word in list_words:
    
    print(each_word, " ----> ", porter_stemmer.stem(each_word))

grows  ---->  grow
growing  ---->  grow
grower  ---->  grower
growing  ---->  grow


In [4]:
import os
import sys

sys.path.insert(0, r"F:\abc\abc\R")
import pandas as pd
import re
import string

In [5]:
df = pd.read_csv(os.path.join(r"F:\abc\abc\R","SMSSpamCollection.tsv"), sep = '\t')
df.columns = ['class', 'text']



In [6]:
#Loading DF

In [7]:
pd.set_option('display.max_colwidth', 100)

In [8]:
df.head()

Unnamed: 0,class,text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


## Regular Expressions

In [9]:
# `nlp` ---> searches for all patterns that contain the word "nlp"
# `[j-q]` ---> searches for all single charactrers between j & q. (Also including j & q)
# `[j-q]+` ---> searches for all charactrers between j & q (both included). Length can  be more than 1
# '[0-9]+' ---> searches for all numbers. Example : used to search 2018
# `[j-q0-9]+` ---> searches for sequences of chars between j & q OR numbers between 0-9. Ex: nlp2018
# '\s' ----> look for a single white space
# \s+ --> Look for a multiple white spaces.
# \W+ ---> search for non word characters
# \w+ ---> search for one or more WORD characters
# \S+ ---> Search for one or more NON WHITE SPACE characters
# S -> space, w -> words :)

# '[A-Z]+[0-9]+' ---> Finds multiple characters between A-Z AND numbers 0-9 as well (both)
# \s? ---> this means that the space is optional.

In [10]:
#Splitting a sentence into a list of words
re_test = 'This is a made up string to test 2 different regex methods'
re_test_messy = 'This    is a made up     string to test 2    different regex methods'
re_test_messy_2 = 'This-is-a-made/up.string*to&>>>.test---2""""different`~regex-methods'


In [11]:
# \s --> Look for a single white space.
print("using Split \s\n")
print(re.split('\s', re_test))
print(re.split('\s', re_test_messy))
print(re.split('\s', re_test_messy_2))

# \s+ --> Look for a multiple white spaces.
print("\n using Split \s+      \n")
print(re.split('\s+', re_test))
print(re.split('\s+', re_test_messy))
print(re.split('\s+', re_test_messy_2))

# \W+ ---> search for non word characters
print("\n using Split \W+      \n")
print(re.split('\W+', re_test))
print(re.split('\W+', re_test_messy))
print(re.split('\W+', re_test_messy_2))

# \W+ ---> search for non word characters
print("\n using findall \w+      \n")
print(re.findall('\w+', re_test))
print(re.findall('\w+', re_test_messy))
print(re.findall('\w+', re_test_messy_2))

# re.split() and re.findall() are opposite of each other.
# re.split() splits on our regex pattern
# re.findall() finds our regex pattern.
# \s for re.split() == \S for re.findall()

using Split \s

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']
['This', '', '', '', 'is', 'a', 'made', 'up', '', '', '', '', 'string', 'to', 'test', '2', '', '', '', 'different', 'regex', 'methods']
['This-is-a-made/up.string*to&>>>.test---2""""different`~regex-methods']

 using Split \s+      

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']
['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']
['This-is-a-made/up.string*to&>>>.test---2""""different`~regex-methods']

 using Split \W+      

['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']
['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']
['This', 'is', 'a', 'made', 'up', 'string', 'to', 'test', '2', 'different', 'regex', 'methods']

 using findall \w+      

['This', 'is', 'a', 'made', 'up', 'string', 'to',

In [12]:
# Find and Replace.

pep8 = "I try to follow PEP8 guidelines"
pep8_2 = "I try to follow PEP75 guidelines"
pep8_3 = "I try to follow PEEP8 guidelines"
pep8_4 = "I try to follow PEEP 8 guidelines"

# Replace the incorrect spellings of PEP8 to PEP9.

print(re.sub('[A-Z]+[0-9]+', 'PEP9', pep8))
print(re.sub('[A-Z]+[0-9]+', 'PEP9', pep8_2))
print(re.sub('[A-Z]+[0-9]+', 'PEP9', pep8_3))
print(re.sub(r'\b(PEEP?|\bPE\b)(\s?\d+)?\b', 'PEP9', pep8_4))

print(re.sub('[A-Z]+\s?+[0-9]+','hello', pep8_3))

# Explanation of \b(PEEP?|\bPE\b)(\s?\d+)?\b

'''
Breakdown of the Corrected Regex Pattern:
\b: Word Boundary

Ensures that "PEP", "PEEP", or "PE" is a whole word and not part of another word.
(PEEP?|\bPE\b):

PEEP?: Matches "PEEP" with the last "P" being optional.
|\bPE\b: Matches "PE" as a whole word.
This part of the pattern matches either "PEEP" (with optional "P") or "PE" as a whole word.
(\s?\d+)?:

(\s?\d+): Optionally matches a space followed by one or more digits.
?: Makes the entire group optional, allowing for an optional space followed by one or more 
digits after "PEEP", "PE", or "PEP".
\b: Word Boundary

Ensures that the number is a whole word and not part of another word.
'''
#

I try to follow PEP9 guidelines
I try to follow PEP9 guidelines
I try to follow PEP9 guidelines
I try to follow PEP9 guidelines
I try to follow hello guidelines


'\nBreakdown of the Corrected Regex Pattern:\n\x08: Word Boundary\n\nEnsures that "PEP", "PEEP", or "PE" is a whole word and not part of another word.\n(PEEP?|\x08PE\x08):\n\nPEEP?: Matches "PEEP" with the last "P" being optional.\n|\x08PE\x08: Matches "PE" as a whole word.\nThis part of the pattern matches either "PEEP" (with optional "P") or "PE" as a whole word.\n(\\s?\\d+)?:\n\n(\\s?\\d+): Optionally matches a space followed by one or more digits.\n?: Makes the entire group optional, allowing for an optional space followed by one or more \ndigits after "PEEP", "PE", or "PEP".\n\x08: Word Boundary\n\nEnsures that the number is a whole word and not part of another word.\n'

## Remove punctuation

In [13]:
stopwords = nltk.corpus.stopwords.words('english')

In [14]:
def remove_punctuation(column_data):
    column_data = "".join([word for word in column_data if word not in string.punctuation])
    column_data = column_data.lower()
    #tokenize
    column_data = re.split("\W+", column_data)
    #remove stopwords
    column_data = [word for word in column_data if word not in stopwords]
    return column_data
    
df['tokenized_text'] = df['text'].apply(remove_punctuation)
df
    

Unnamed: 0,class,text,tokenized_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr..."
...,...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...,"[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1..."
5563,ham,Will ü b going to esplanade fr home?,"[ü, b, going, esplanade, fr, home]"
5564,ham,"Pity, * was in mood for that. So...any other suggestions?","[pity, mood, soany, suggestions]"
5565,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...,"[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, us, free]"


# STEMMING

In [15]:
# PORTER STEMMING

def my_stemmer(column_data):
    column_data = [porter_stemmer.stem(word) for word in column_data]
    return column_data

df['stemmed'] = df['tokenized_text'].apply(my_stemmer)

In [16]:
df.head()

Unnamed: 0,class,text,tokenized_text,stemmed
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr...","[per, request, mell, mell, oru, minnaminungint, nurungu, vettam, set, callertun, caller, press, ..."


# LEMMATIZATION

In [17]:
# Stemming - Algo that directly chops off the suffix of the word.
# Lemmatization - Always returns a dictionary word. It understands the relations more.

In [18]:
my_lemmatizer = nltk.WordNetLemmatizer()



In [19]:
list_words = ['grows', 'growing', 'grower', 'growing', 'goose', 'geese']

for each_word in list_words:
    
    print(each_word, " ----> ", porter_stemmer.stem(each_word), "STEMMED")
    print(each_word, " ----> ", my_lemmatizer.lemmatize(each_word), "LEMMATIZED\n")

grows  ---->  grow STEMMED
grows  ---->  grows LEMMATIZED

growing  ---->  grow STEMMED
growing  ---->  growing LEMMATIZED

grower  ---->  grower STEMMED
grower  ---->  grower LEMMATIZED

growing  ---->  grow STEMMED
growing  ---->  growing LEMMATIZED

goose  ---->  goos STEMMED
goose  ---->  goose LEMMATIZED

geese  ---->  gees STEMMED
geese  ---->  goose LEMMATIZED



In [20]:
def lemmatizing_function(column_data):
    column_data = [my_lemmatizer.lemmatize(word) for word in column_data]
    return column_data

df['lemmatized'] = df['tokenized_text'].apply(lemmatizing_function)

df.head()

Unnamed: 0,class,text,tokenized_text,stemmed,lemmatized
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]","[even, brother, like, speak, treat, like, aid, patent]"
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]","[date, sunday]"
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr...","[per, request, mell, mell, oru, minnaminungint, nurungu, vettam, set, callertun, caller, press, ...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre..."


## Vectorization 

In [21]:
# Process of encoding text as integers to create feature vectors.

In [22]:
# dfg
"""
1. Count Vectorization:

How it works:
Count Vectorization represents the text data by counting the frequency of each word in the document. 
Each document is represented as a vector where each element corresponds to the count of a particular 
word in the document.

Advantages:

Simple and easy to implement.
Captures the local importance of words within a document.

Disadvantages:

Ignores the order of words.
Can result in high-dimensional sparse vectors, which can be computationally expensive.

2. TF-IDF (Term Frequency-Inverse Document Frequency):
How it works:
TF-IDF measures the importance of a word in a document relative to a corpus. It considers both the 
frequency of the word in the document (TF) and the rarity of the word in the corpus (IDF).

TF-IDF(�, �) = TF(�,�)×IDF(�)
TF-IDF(t,d)=TF(t,d)×IDF(t)

TF (Term Frequency): Frequency of a term in a document.
IDF (Inverse Document Frequency): Logarithm of the total number of documents divided by the number 
of documents containing the term.

Advantages:

Accounts for the importance of words in a document relative to the entire corpus.
Reduces the weight of common words that are not informative.

Disadvantages:

Like Count Vectorization, it also ignores the order of words.
Can be sensitive to the quality of the corpus and the chosen parameters.

3. N-grams:

How it works:
N-grams are contiguous sequences of n items (words, characters, etc.) from a given sample of text. 
For example, in the sentence "I love machine learning," the 2-grams (or bigrams) are "I love," 
"love machine," and "machine learning."

Advantages:

Captures the local context and order of words.
Can capture more semantic meaning compared to individual words.

Disadvantages:

Increases the dimensionality of the data.
Can result in sparse representations for larger n.

Summary:
Count Vectorization is a straightforward method that counts the occurrences of words in a document.
TF-IDF weights words based on their frequency in a document relative to a corpus, aiming to 
emphasize informative words.

N-grams capture the local context and order of words, providing more semantic meaning but 
potentially leading to higher dimensionality.

Choosing the right text representation technique depends on the specific task, the nature of 
the data, and the computational resources available. Often, it's beneficial to experiment with 
different methods and evaluate their performance to determine the most suitable approach for 
a given problem.
"""
#

'\n1. Count Vectorization:\n\nHow it works:\nCount Vectorization represents the text data by counting the frequency of each word in the document. \nEach document is represented as a vector where each element corresponds to the count of a particular \nword in the document.\n\nAdvantages:\n\nSimple and easy to implement.\nCaptures the local importance of words within a document.\n\nDisadvantages:\n\nIgnores the order of words.\nCan result in high-dimensional sparse vectors, which can be computationally expensive.\n\n2. TF-IDF (Term Frequency-Inverse Document Frequency):\nHow it works:\nTF-IDF measures the importance of a word in a document relative to a corpus. It considers both the \nfrequency of the word in the document (TF) and the rarity of the word in the corpus (IDF).\n\nTF-IDF(�, �) = TF(�,�)×IDF(�)\nTF-IDF(t,d)=TF(t,d)×IDF(t)\n\nTF (Term Frequency): Frequency of a term in a document.\nIDF (Inverse Document Frequency): Logarithm of the total number of documents divided by the numb

### Count Vectorization 

In [23]:
# Create function to join lemmatized words in  a column and remove numbers.

def join_lemmatized(word_list):
    word = " ".join(word_list)
    word = re.sub('[0-9]+\S+', '', word)
    return word

df['lemmatized_joined'] = df['lemmatized'].apply(join_lemmatized)
df.head()

Unnamed: 0,class,text,tokenized_text,stemmed,lemmatized,lemmatized_joined
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...",free entry 2 wkly comp win fa cup final tkts may text fa receive entry questionstd txt ratetc...
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]",nah dont think go usf life around though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]","[even, brother, like, speak, treat, like, aid, patent]",even brother like speak treat like aid patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]","[date, sunday]",date sunday
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr...","[per, request, mell, mell, oru, minnaminungint, nurungu, vettam, set, callertun, caller, press, ...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre...",per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy fr...


In [24]:
# Creates a document-term matrix. Count of occurence of each word.

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(df['lemmatized_joined'])
print(X_counts.shape)
print(count_vect.get_feature_names_out())


(5567, 7763)
['aa' 'aah' 'aaniye' ... 'zyada' 'üll' '〨ud']


In [25]:
# Now that we have our count vectorizer, 
# it will be represented by a sparse matrix (matrix of 0s mostly) 
X_counts

<5567x7763 sparse matrix of type '<class 'numpy.int64'>'
	with 44692 stored elements in Compressed Sparse Row format>

In [26]:
# to read it, we convert it to array
# and save those arrays in a df.

count_vec_X_df = pd.DataFrame(X_counts.toarray())
count_vec_X_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7753,7754,7755,7756,7757,7758,7759,7760,7761,7762
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Column names can be assigned back by 
count_vec_X_df.columns = count_vect.get_feature_names_out()

count_vec_X_df.head()

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,üll,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## N Grams

In [28]:
# n combinations of adjacent words are formed in the matrix.
# "nlp is an interesting topic"
# bigram = ['nlp is', 'is an', 'an interesting', 'interesting topic']
# trigram = ['nlp is an', 'is an interesting', 'an interesting topic']
# 4 gram = ['nlp is an interesting', 'is an interesting topic']

# choose n by tuning.
# Count vectorizer = n gram (n =1)


In [29]:
# ngram also uses the count vectorizer for initialization
# ngram_range of (1, 1) means only unigrams, (1, 2) means
#     unigrams and bigrams, and (2, 2) means only bigrams. [min, max]

ngram_vector = CountVectorizer(ngram_range = (2,2))

X_counts = ngram_vector.fit_transform(df['lemmatized_joined'])
print(X_counts.shape)
print(ngram_vector.get_feature_names_out())

(5567, 29898)
['aa exhaust' 'aah bless' 'aah cuddle' ... 'üll submitting' 'üll take'
 '〨ud evening']


In [30]:
# Again, we will have a sparse matrix. 
ngram_vec_X_df = pd.DataFrame(X_counts.toarray())
ngram_vec_X_df.columns = ngram_vector.get_feature_names_out()
ngram_vec_X_df.head()

Unnamed: 0,aa exhaust,aah bless,aah cuddle,aah speak,aaniye pudunga,aaooooright work,aathilove lot,aathiwhere dear,ab sara,abbey happy,...,zoe hit,zoe join,zogtorius staring,zoom cine,zouk nichols,zyada kisi,üll finish,üll submitting,üll take,〨ud evening
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF - IDF 

In [31]:
# W(i,j) = tf(i,j) * log(N/df(i))
# tf(i,j) = no. of times i occurs in j divided by total number of terms in j
# df(i) = no. of documents containing i.
# N = total number of documents.

# Tells us how important a word is for a message. We get weights.

# -------------------------------------------------------------------- #

# for " i like NLP",
# tf(NLP, j) = (no. of occurences of NLP)/(no. of words in message) = 1/3 = 0.33
# N = 20
# df(NLP) = 1
# W(i, j) = 0.33 * log(20/1) = 0.33 * 1.301 = 0.43 (log uses base 10 for this ex. usually uses ln)



In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(df['lemmatized_joined'])

print(X_tfidf.shape)
print(tfidf_vect.get_feature_names_out())

(5567, 7763)
['aa' 'aah' 'aaniye' ... 'zyada' 'üll' '〨ud']


In [33]:
tfidf_vect_X_df = pd.DataFrame(X_tfidf.toarray())
tfidf_vect_X_df.columns = tfidf_vect.get_feature_names_out()
tfidf_vect_X_df.head()

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Feature Engineering

## Creating Features 

In [34]:
# Feature to calculate the length of each message.

df['message_length'] = df['text'].apply(lambda x: len(x) - x.count(" "))

# Feature to calculate what % of text is punctuation

def count_punctuation(text):
    total = 0
    counter = sum([1 for letter in text if letter in string.punctuation])
    
    return round(counter/(len(text) - text.count(" "))*100, 2) # no. of punctuations - no. of spaces

df['percent_punctuation'] = df['text'].apply(count_punctuation)

df.head()

Unnamed: 0,class,text,tokenized_text,stemmed,lemmatized,lemmatized_joined,message_length,percent_punctuation
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...",free entry 2 wkly comp win fa cup final tkts may text fa receive entry questionstd txt ratetc...,128,4.69
1,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]",nah dont think go usf life around though,49,4.08
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]","[even, brother, like, speak, treat, like, aid, patent]",even brother like speak treat like aid patent,62,3.23
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]","[date, sunday]",date sunday,28,7.14
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,"[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr...","[per, request, mell, mell, oru, minnaminungint, nurungu, vettam, set, callertun, caller, press, ...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre...",per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy fr...,135,4.44


# Model Training

### Random Forest using TFIDF and K FOLD Cross Validation

In [35]:
X_features_tfidf = pd.concat([df['message_length'], df['percent_punctuation'], tfidf_vect_X_df], axis = 1)
Y_features = df['class']
X_features_tfidf.head()



Unnamed: 0,message_length,percent_punctuation,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,...,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,üll,〨ud
0,128,4.69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Skip to next heading

In [36]:
from sklearn.ensemble import RandomForestClassifier

print(dir(RandomForestClassifier))
print()
print(RandomForestClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_build_request_for_signature', '_check_feature_names', '_check_n_features', '_compute_oob_predictions', '_estimator_type', '_get_default_requests', '_get_metadata_request', '_get_oob_predictions', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_parameter_constraints', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score_and_attributes', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_params', '_validate_y_class_wei

In [37]:
help(RandomForestClassifier())

Help on RandomForestClassifier in module sklearn.ensemble._forest object:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  For a comparison between tree-based ensemble models see the example
 |  :ref:`s

In [38]:
from sklearn.model_selection import KFold, cross_val_score

# K Fold Cross Validation splits the data into n folds. Then uses each as the test once.
# If K = 5, it will train the model 5 times and tell us the accuracy for each fold.

In [39]:
rf = RandomForestClassifier(n_jobs = -1)

k_fold = KFold(n_splits = 5)

cross_val_score(rf, X_features_tfidf, Y_features, cv = k_fold, scoring = "accuracy", n_jobs = -1)

array([0.97755835, 0.97755835, 0.97484277, 0.9640611 , 0.97574124])

In [40]:
# Training the full RF model

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [41]:
X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf = train_test_split(X_features_tfidf,
                                                                           Y_features,
                                                                            test_size = 0.2)

In [42]:
import time

In [43]:
start_time = time.time()
rf = RandomForestClassifier(n_estimators = 50, max_depth = 20, n_jobs = -1) 
rf_model = rf.fit(X_train_tfidf, Y_train_tfidf)

print("Time taken to train RF Model : {} seconds".format(round(time.time()-start_time, 2)))
# Training with maximum 50 trees.
# the max depth of any tree can be 20.
# Use all processors in parallel.


Time taken to train RF Model : 1.08 seconds


In [44]:
# Feature importance

importance_data = zip(rf_model.feature_importances_, X_train_tfidf.columns)
print(sorted(importance_data, reverse = True)[:10])

[(0.06081476528508346, 'message_length'), (0.04924806970136537, 'txt'), (0.0484582569145438, 'call'), (0.044125091970542735, 'free'), (0.04262659364571766, 'claim'), (0.02770849167078579, 'mobile'), (0.024401899791802344, 'reply'), (0.023592663647649517, 'text'), (0.02029517194134667, 'service'), (0.01687128687549699, 'pobox')]


In [45]:
# Checking accuracy

y_pred_tfidf = rf_model.predict(X_test_tfidf)

precision, recall, fscore, support = score(Y_test_tfidf, 
                                        y_pred_tfidf, 
                                        pos_label = 'spam', 
                                        average = 'binary' )

print(" Precision : {} \n Recall : {} \n Accuracy : {} "
      .format(round(precision, 2),
                    round(recall, 2),
                    round((y_pred_tfidf==Y_test_tfidf).sum() / len(y_pred_tfidf)), 3))


 Precision : 1.0 
 Recall : 0.62 
 Accuracy : 1 


###  GridCVSearch to find best parameters

In [46]:
# We already have X_features_tfidf
# Creating X_features_n_grams for n = 2 and X_features_count

X_features_n_gram = pd.concat([df['message_length'], df['percent_punctuation'], ngram_vec_X_df],
                              axis =1)

X_features_count = pd.concat([df['message_length'], df['percent_punctuation'], count_vec_X_df],
                              axis =1)

In [47]:
from sklearn.model_selection import GridSearchCV


In [49]:
# GridSearch CV for TFIDF

start_time = time.time()
rf = RandomForestClassifier()

params = {
    'n_estimators' : [10,20, 90, 200],
    'max_depth' : [10, 30, 90, None]
}

gs = GridSearchCV(rf, params, cv =5, n_jobs = -1)
gs_fit = gs.fit(X_features_tfidf, Y_features)

# Printing DF of GridCVSearch results to pick the best value.
tfidf_cv_result_df = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', 
                                                                  ascending = False)[:5]
# Storing best params to train model later.
best_params_tfidf = gs_fit.best_params_

print("Best parameters are : ", gs_fit.best_params_)
print("Time taken to complete CV Search = {} seconds".format(round(time.time()-start_time, 2)))

tfidf_cv_result_df

Best parameters are :  {'max_depth': 90, 'n_estimators': 20}
Time taken to complete CV Search = 188.11 seconds


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,6.399018,0.269378,0.360723,0.132468,90.0,20,"{'max_depth': 90, 'n_estimators': 20}",0.981149,0.973968,0.979335,0.967655,0.97664,0.975749,0.004722,1
15,51.249118,2.499986,0.418138,0.044076,,200,"{'max_depth': None, 'n_estimators': 200}",0.981149,0.973968,0.97664,0.971249,0.975741,0.975749,0.003267,1
10,23.896127,0.999816,0.47625,0.032773,90.0,90,"{'max_depth': 90, 'n_estimators': 90}",0.982047,0.97307,0.978437,0.97035,0.973944,0.97557,0.004155,3
14,28.091282,0.599448,0.489066,0.124741,,90,"{'max_depth': None, 'n_estimators': 90}",0.977558,0.97307,0.97664,0.97035,0.978437,0.975211,0.003039,4
11,51.376236,0.515474,0.601853,0.057752,90.0,200,"{'max_depth': 90, 'n_estimators': 200}",0.981149,0.971275,0.974843,0.967655,0.973944,0.973773,0.004455,5


In [50]:
# GridSearch CV for Count Vectorization

start_time = time.time()
rf = RandomForestClassifier()

params = {
    'n_estimators' : [10,20, 90, 200],
    'max_depth' : [10, 30, 90, None]
}

gs = GridSearchCV(rf, params, cv =5, n_jobs = -1)
gs_fit = gs.fit(X_features_count, Y_features)

# Printing DF of GridCVSearch results to pick the best value.
count_cv_result_df = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', 
                                                                  ascending = False)[:5]
# Storing best params to train model later.
best_params_count_vec = gs_fit.best_params_


print("Best parameters are : ", gs_fit.best_params_)
print("Time taken to complete CV Search = {} seconds".format(round(time.time()-start_time, 2)))

count_cv_result_df

Best parameters are :  {'max_depth': 90, 'n_estimators': 10}
Time taken to complete CV Search = 187.24 seconds


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,4.426046,0.2851,0.368868,0.064689,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.978456,0.976661,0.975741,0.97035,0.974843,0.97521,0.002708,1
9,6.645732,0.223943,0.29202,0.041871,90.0,20,"{'max_depth': 90, 'n_estimators': 20}",0.982047,0.974865,0.974843,0.969452,0.974843,0.97521,0.004007,2
10,23.991685,0.511405,0.494475,0.07459,90.0,90,"{'max_depth': 90, 'n_estimators': 90}",0.980251,0.970377,0.97664,0.969452,0.97664,0.974672,0.004113,3
15,48.373144,1.90864,0.440557,0.0396,,200,"{'max_depth': None, 'n_estimators': 200}",0.980251,0.973968,0.97664,0.968553,0.973944,0.974671,0.003833,4
14,28.689256,0.596009,0.44341,0.050243,,90,"{'max_depth': None, 'n_estimators': 90}",0.978456,0.976661,0.975741,0.967655,0.974843,0.974671,0.003706,5


In [51]:
# GridSearch CV for N Gram Vectorization

start_time = time.time()
rf = RandomForestClassifier()

params = {
    'n_estimators' : [10,20, 90, 200],
    'max_depth' : [10, 30, 90, None]
}

gs = GridSearchCV(rf, params, cv =5, n_jobs = None)
gs_fit = gs.fit(X_features_n_gram, Y_features)

# Printing DF of GridCVSearch results to pick the best value.
ngram_cv_result_df = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', 
                                                                  ascending = False)[:5]
# Storing best params to train model later.
best_params_ngram = gs_fit.best_params_

print("Best parameters are : ", gs_fit.best_params_)
print("Time taken to complete CV Search = {} seconds".format(round(time.time()-start_time, 2)))

ngram_cv_result_df

Best parameters are :  {'max_depth': None, 'n_estimators': 90}
Time taken to complete CV Search = 2208.64 seconds


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
14,66.459179,2.932381,0.886601,0.030912,,90,"{'max_depth': None, 'n_estimators': 90}",0.956912,0.941652,0.944295,0.941599,0.940701,0.945032,0.00606,1
15,143.466651,4.250837,1.143675,0.073827,,200,"{'max_depth': None, 'n_estimators': 200}",0.951526,0.939856,0.943396,0.941599,0.94699,0.944674,0.004162,2
13,17.398925,0.794697,0.746531,0.049003,,20,"{'max_depth': None, 'n_estimators': 20}",0.948833,0.944345,0.943396,0.937107,0.947889,0.944314,0.004147,3
12,9.558334,0.297997,0.67874,0.027734,,10,"{'max_depth': None, 'n_estimators': 10}",0.951526,0.938061,0.944295,0.934412,0.94699,0.943057,0.006139,4
11,58.53046,0.683217,0.799013,0.017196,90.0,200,"{'max_depth': 90, 'n_estimators': 200}",0.936266,0.933573,0.930818,0.926325,0.931716,0.931739,0.003287,5


## training models for all 3 cases.

In [52]:


from sklearn.metrics import accuracy_score, confusion_matrix

In [53]:
# Split X train and Y train and X test and Y test for count vec and ngram

X_train_count_vec, X_test_count_vec, Y_train_count_vec, Y_test_count_vec = train_test_split(
                                                                            X_features_count,
                                                                           Y_features,
                                                                            test_size = 0.2)

X_train_ngram, X_test_ngram, Y_train_ngram, Y_test_ngram = train_test_split(X_features_n_gram,
                                                                           Y_features,
                                                                            test_size = 0.2)

# Remove
try:
    x = list(best_params_ngram.values())
except:
    best_params_ngram = {'max_depth': None, 'n_estimators': 90}


In [54]:
# train tfidf
tfidf_rf = RandomForestClassifier(n_estimators = best_params_tfidf.get('n_estimators'),
                                  max_depth = best_params_tfidf.get('max_depth'), n_jobs = -1) 

tfidf_rf_model = tfidf_rf.fit(X_train_tfidf, Y_train_tfidf)

# train count vec
count_vec_rf = RandomForestClassifier(n_estimators = best_params_count_vec.get('n_estimators'),
                                  max_depth = best_params_count_vec.get('max_depth'), n_jobs = -1) 

count_vec_rf_model = count_vec_rf.fit(X_train_count_vec, Y_train_count_vec)

# train ngram
ngram_rf = RandomForestClassifier(n_estimators = best_params_ngram.get('n_estimators'),
                                  max_depth = best_params_ngram.get('max_depth'), n_jobs = -1) 

ngram_rf_model = ngram_rf.fit(X_train_ngram, Y_train_ngram)

In [55]:
# Predictions

y_pred_tfidf = tfidf_rf_model.predict(X_test_tfidf)
y_pred_count_vec = count_vec_rf_model.predict(X_test_count_vec)
y_pred_ngram = ngram_rf_model.predict(X_test_ngram)

# precision, recall, fscore, support 
tfidf_result_list = score(Y_test_tfidf, 
                        y_pred_tfidf, 
                        pos_label = 'spam', 
                        average = 'binary' )
count_vec_result_list = score(Y_test_count_vec, 
                        y_pred_count_vec, 
                        pos_label = 'spam', 
                        average = 'binary' )

ngram_result_list = score(Y_test_ngram, 
                        y_pred_ngram, 
                        pos_label = 'spam', 
                        average = 'binary' )

def accuracy_printer(y_pred, y_test):
    return round((y_pred==y_test).sum() / len(y_pred), 3)

print("TFIDF Precision : {} \n Recall : {} \n Accuracy : {} "
      .format(round(tfidf_result_list[0], 2),
                    round(tfidf_result_list[1], 2),
                    accuracy_printer(y_pred_tfidf, Y_test_tfidf)))
      
print("\n\nCOUNT VECTORIZATION Precision : {} \n Recall : {} \n Accuracy : {} "
      .format(round(count_vec_result_list[0], 2),
                    round(count_vec_result_list[1], 2),
                    accuracy_printer(y_pred_count_vec, Y_test_count_vec)))
      
print("\n\nNGRAM Precision : {} \n Recall : {} \n Accuracy : {} "
      .format(round(ngram_result_list[0], 2),
                    round(ngram_result_list[1], 2),
                    accuracy_printer(y_pred_ngram, Y_test_ngram)))

TFIDF Precision : 1.0 
 Recall : 0.83 
 Accuracy : 0.978 


COUNT VECTORIZATION Precision : 0.98 
 Recall : 0.83 
 Accuracy : 0.974 


NGRAM Precision : 0.95 
 Recall : 0.69 
 Accuracy : 0.959 


# Gradient Boosting

In [56]:
from sklearn.ensemble import GradientBoostingClassifier

In [57]:
gb = GradientBoostingClassifier()

param = {
    "n_estimators" : [100,150],
    "max_depth" : [7,11,15],
    "learning_rate" : [0.1, 0.15]
}

grid_search = GridSearchCV(gb, param, cv = 5, n_jobs = -1)



In [58]:
#TFIDF

start_time = time.time()
cv_fit_tfidf = grid_search.fit(X_features_tfidf, Y_features)
best_params_tfidf= cv_fit_tfidf.best_params_
print("Best parameters are : ", cv_fit_tfidf.best_params_)
print("TFIDF Time taken to complete CV Search = {} seconds".format(round(time.time()-start_time, 2)))



Best parameters are :  {'learning_rate': 0.15, 'max_depth': 7, 'n_estimators': 150}
TFIDF Time taken to complete CV Search = 2161.72 seconds


In [59]:
# COUNT VECTORIZATION

start_time = time.time()
cv_fit_count = grid_search.fit(X_features_count, Y_features)
best_params_count= cv_fit_count.best_params_
print("Best parameters are : ", cv_fit_count.best_params_)
print("COUNT VECT Time taken to complete CV Search = {} seconds".format(
    round(time.time()-start_time, 2)))



Best parameters are :  {'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 150}
COUNT VECT Time taken to complete CV Search = 2061.7 seconds


In [62]:
# N GRAMS

grid_search = GridSearchCV(gb, param, cv = 5, n_jobs = 2)

start_time = time.time()
cv_fit_ngram = grid_search.fit(X_features_n_gram, Y_features)
best_params_ngram_boosting = cv_fit_ngram.best_params_
print("Best parameters are : ", cv_fit_ngram.best_params_)
print("NGRAM Time taken to complete CV Search = {} seconds".format(
    round(time.time()-start_time, 2)))

KeyboardInterrupt: 

In [65]:
try:
    print(best_params_tfidf.values())
except:
    best_params_tfidf = {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}
    
try:
    print(best_params_count.values())
except:
    best_params_count = {'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 150}
    
try:
    print(best_params_ngram_boosting.values())
except:
    best_params_ngram_boosting = {'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 150}

dict_values([0.15, 7, 150])
dict_values([0.1, 11, 150])


In [69]:
# Model training on best parameters.

#TFIDF
start_time = time.time()
boosting_object = GradientBoostingClassifier(learning_rate = best_params_tfidf.get('learning_rate'),
                                            max_depth = best_params_tfidf.get('max_depth'),
                                            n_estimators = best_params_tfidf.get('n_estimators'))

tfidf_model = boosting_object.fit(X_train_tfidf, Y_train_tfidf)
print("TFIDF Training Time --> {} seconds".format(round(time.time()-start_time, 3)))



TFIDF Training Time --> 214.906 seconds


In [72]:
# Count Vectorization
start_time = time.time()
boosting_object = GradientBoostingClassifier(learning_rate = best_params_count.get('learning_rate'),
                                            max_depth = best_params_count.get('max_depth'),
                                            n_estimators = best_params_count.get('n_estimators'))

count2__model = boosting_object.fit(X_train_count_vec, Y_train_count_vec)
print("COUNT Training Time --> {} seconds".format(round(time.time()-start_time, 3)))

COUNT Training Time --> 152.615 seconds


In [71]:
# NGRAM Vectorization
start_time = time.time()
boosting_object = GradientBoostingClassifier(learning_rate = best_params_ngram_boosting.get('learning_rate'),
                                            max_depth = best_params_ngram_boosting.get('max_depth'),
                                            n_estimators = best_params_ngram_boosting.get('n_estimators'))

count_model = boosting_object.fit(X_train_ngram, Y_train_ngram)
ngram_model = count_model
print("NGRAM Training Time --> {} seconds".format(round(time.time()-start_time, 3)))

NGRAM Training Time --> 536.814 seconds


In [73]:
# Predictions
ngram_model = count_model
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)
y_pred_count_vec = count2__model.predict(X_test_count_vec)
y_pred_ngram = ngram_model.predict(X_test_ngram)

# precision, recall, fscore, support 
tfidf_result_list = score(Y_test_tfidf, 
                        y_pred_tfidf, 
                        pos_label = 'spam', 
                        average = 'binary' )
count_vec_result_list = score(Y_test_count_vec, 
                        y_pred_count_vec, 
                        pos_label = 'spam', 
                        average = 'binary' )

ngram_result_list = score(Y_test_ngram, 
                        y_pred_ngram, 
                        pos_label = 'spam', 
                        average = 'binary' )

def accuracy_printer(y_pred, y_test):
    return round((y_pred==y_test).sum() / len(y_pred), 3)

print("TFIDF Precision : {} \n Recall : {} \n Accuracy : {} "
      .format(round(tfidf_result_list[0], 2),
                    round(tfidf_result_list[1], 2),
                    accuracy_printer(y_pred_tfidf, Y_test_tfidf)))
      
print("\n\nCOUNT VECTORIZATION Precision : {} \n Recall : {} \n Accuracy : {} "
      .format(round(count_vec_result_list[0], 2),
                    round(count_vec_result_list[1], 2),
                    accuracy_printer(y_pred_count_vec, Y_test_count_vec)))
      
print("\n\nNGRAM Precision : {} \n Recall : {} \n Accuracy : {} "
      .format(round(ngram_result_list[0], 2),
                    round(ngram_result_list[1], 2),
                    accuracy_printer(y_pred_ngram, Y_test_ngram)))

TFIDF Precision : 0.93 
 Recall : 0.86 
 Accuracy : 0.973 


COUNT VECTORIZATION Precision : 0.94 
 Recall : 0.87 
 Accuracy : 0.975 


NGRAM Precision : 0.88 
 Recall : 0.73 
 Accuracy : 0.956 
