In [None]:
### NLP Text Calssification Model using Counter Vector(CV) and Term Frequency-Inverse Document Frequency(TF-)

#### Create a bag of word model from spam dataset

In [47]:
import pandas as pd
spam_df = pd.read_csv('data/spam.csv', usecols=['v1', 'v2'], encoding='latin1')
spam_df.rename(columns={'v1': 'Label', 'v2': 'Message'}, inplace=True)

In [51]:
spam_df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [63]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\Uthanda
[nltk_data]     Ramu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [65]:
ps = PorterStemmer()

In [67]:
corpus = []
for i in range(len(spam_df)):
    review = re.sub('[^a-zA-Z]', ' ', spam_df["Message"][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)

#### Using CounterVectorizer and TfidfVectorizer for tokenization

In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features=2500, ngram_range=(1,2))
tfidf = TfidfVectorizer(max_features=2500, ngram_range=(1,2))

In [71]:
x_cv = cv.fit_transform(corpus).toarray()
x_tfidf = tfidf.fit_transform(corpus).toarray()

In [77]:
y = pd.get_dummies(spam_df['Label'])
y = y.iloc[:, 0].values

In [79]:
y.shape

(5572,)

#### Using CounterVectorizer and TfidfVectorizer tokens for training

In [87]:
from sklearn.model_selection import train_test_split
x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(x_cv, y, test_size=0.2, random_state=21)
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(x_tfidf, y, test_size=0.2, random_state=23)

In [91]:
#Training
from sklearn.naive_bayes import MultinomialNB
spam_detect_model_cv = MultinomialNB().fit(x_train_cv, y_train_cv)
spam_detect_model_tfidf = MultinomialNB().fit(x_train_tfidf, y_train_tfidf)

In [93]:
#Prediction
y_pred_cv = spam_detect_model_cv.predict(x_test_cv)
y_pred_tfidf = spam_detect_model_tfidf.predict(x_test_tfidf)

In [101]:
#Metrics evaluation on cv tokens
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test_cv, y_pred_cv))
print(classification_report(y_test_cv, y_pred_cv))

0.9847533632286996
              precision    recall  f1-score   support

       False       0.94      0.94      0.94       140
        True       0.99      0.99      0.99       975

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [103]:
#Metrics evaluation on tfidf tokens
print(accuracy_score(y_test_tfidf, y_pred_tfidf))
print(classification_report(y_test_tfidf, y_pred_tfidf))

0.9802690582959641
              precision    recall  f1-score   support

       False       0.99      0.85      0.91       136
        True       0.98      1.00      0.99       979

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

