<a href="https://colab.research.google.com/github/thegrouch4413/Predictive-Analysis/blob/main/NLP_Practice_Class_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
email_data_df = pd.read_csv('https://raw.githubusercontent.com/glopez21/ML-Data/main/SMSSpamCollection.csv', sep='\t',names=['class','message'])

In [11]:
email_data_df.head()

Unnamed: 0,class,message
0,"label,text",
1,"0.0,Go until jurong point, crazy.. Available o...",
2,"0.0,Ok lar... Joking wif u oni...",
3,"1.0,Free entry in 2 a wkly comp to win FA Cup ...",
4,"0.0,U dun say so early hor... U c already then...",


In [13]:
email_data_df['class2'] = email_data_df['class'].str[0:3]
email_data_df['message2'] = email_data_df['class'].str[4:]
new_df = email_data_df.drop(['class', 'message'], axis = 1)
df = new_df.drop(0)

In [14]:
df.head(5)

Unnamed: 0,class2,message2
1,0.0,"Go until jurong point, crazy.. Available only ..."
2,0.0,Ok lar... Joking wif u oni...
3,1.0,Free entry in 2 a wkly comp to win FA Cup fina...
4,0.0,U dun say so early hor... U c already then say...
5,0.0,"Nah I dont think he goes to usf, he lives arou..."


In [15]:
def message_text_pre_process(text_message):
  remove_punct = [char for char in text_message if char not in string.punctuation]
  remove_punct = ''.join(remove_punct)
  remove_stopwords = [word for word in remove_punct.split() if word.lower() not in stopwords.words('english')]
  return remove_stopwords

In [16]:
df['message2'].head(10).apply(message_text_pre_process)

1     [Go, jurong, point, crazy, Available, bugis, n...
2                        [Ok, lar, Joking, wif, u, oni]
3     [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
4         [U, dun, say, early, hor, U, c, already, say]
5     [Nah, dont, think, goes, usf, lives, around, t...
6     [FreeMsg, Hey, darling, 3, weeks, word, back, ...
7     [Even, brother, like, speak, treat, like, aids...
8     [per, request, Melle, Melle, Oru, Minnaminungi...
9     [WINNER, valued, network, customer, selected, ...
10    [mobile, 11, months, U, R, entitled, Update, l...
Name: message2, dtype: object

In [17]:
bag_of_words = CountVectorizer(analyzer=message_text_pre_process).fit(email_data_df['message2'])

In [18]:
bag_of_words_trf = bag_of_words.transform(df['message2'])

In [19]:
tfidf_fit = TfidfTransformer().fit(bag_of_words_trf)

In [20]:
tfidf_trf = tfidf_fit.transform(bag_of_words_trf)

In [21]:
spam_detector_model = MultinomialNB().fit(tfidf_trf,df['class2'])

In [22]:
test_message = df['message2'][10]

In [23]:
bag_of_words_test_message = bag_of_words.transform([test_message])

In [24]:
tfidf_test_messsge = tfidf_fit.transform(bag_of_words_test_message)

In [25]:
spam_detector_model.predict(tfidf_test_messsge)[0]

'1.0'

In [26]:
prediction_for_all_messages = spam_detector_model.predict(tfidf_trf)

In [27]:
from sklearn.metrics import classification_report

In [28]:
print(classification_report(df['class2'],prediction_for_all_messages))

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      4827
         1.0       1.00      0.85      0.92       747

    accuracy                           0.98      5574
   macro avg       0.99      0.92      0.95      5574
weighted avg       0.98      0.98      0.98      5574



# Pipeline Building

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
msg_train, msg_test, class_train, class_test = train_test_split(df['message2'],df['class2']) 

In [31]:
print(msg_train.shape)
print(msg_test.shape)
print(class_train.shape)
print(class_test.shape)

(4180,)
(1394,)
(4180,)
(1394,)


In [32]:
from sklearn.pipeline import Pipeline

In [33]:
text_pipeline = Pipeline([
                          ('bag_of_words',CountVectorizer(analyzer=message_text_pre_process)),
                          ('tfidf',TfidfTransformer()),
                          ('classifier', MultinomialNB())
])

In [34]:
text_pipeline.fit(msg_train,class_train)

Pipeline(steps=[('bag_of_words',
                 CountVectorizer(analyzer=<function message_text_pre_process at 0x7f5b0839d440>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [35]:
text_pred = text_pipeline.predict(msg_test)

In [36]:
print(classification_report(text_pred,class_test))

              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98      1275
         1.0       0.70      1.00      0.82       119

    accuracy                           0.96      1394
   macro avg       0.85      0.98      0.90      1394
weighted avg       0.97      0.96      0.97      1394



In [37]:
msg_test.iloc[0]

'You made my day. Do have a great day too.'

In [38]:
class_test.iloc[0]

'0.0'