<a href="https://colab.research.google.com/github/thegrouch4413/Predictive-Analysis/blob/main/NLP_Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [45]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import Dataset

In [None]:
email_data_df = pd.read_csv('https://raw.githubusercontent.com/niteen11/data301_predictive_analytics_machine_learning/main/data/SMSSpamCollection', sep='\t',names=['class','message'])

# EDA

In [None]:
email_data_df.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
email_data_df['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
email_data_df['message'][4]

"Nah I don't think he goes to usf, he lives around here though"

In [None]:
len(email_data_df['message'][4])

61

In [None]:
max(email_data_df['message'].apply(len))

910

Find Longest Message

In [None]:
msg_910 = email_data_df[email_data_df['message'].apply(len)==910]

In [None]:
msg_910

Unnamed: 0,class,message
1085,ham,For me the love should start with attraction.i...


In [None]:
email_data_df.shape

(5572, 2)

In [None]:
msg_910.message

1085    For me the love should start with attraction.i...
Name: message, dtype: object

In [None]:
msg_910.message.iloc[0]

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

In [None]:
remove_punct = [char for char in msg_910.message.iloc[0] if char not in string.punctuation]

In [None]:
remove_punct = ''.join(remove_punct)

In [None]:
remove_punct

'For me the love should start with attractioni should feel that I need her every time around meshe should be the first thing which comes in my thoughtsI would start the day and end it with hershe should be there every time I dreamlove will be then when my every breath has her namemy life should happen around hermy life will be named to herI would cry for herwill give all my happiness and take all her sorrowsI will be ready to fight with anyone for herI will be in love when I will be doing the craziest things for herlove will be when I dont have to proove anyone that my girl is the most beautiful lady on the whole planetI will always be singing praises for herlove will be when I start up making chicken curry and end up makiing sambarlife will be the most beautiful thenwill get every morning and thank god for the day because she is with meI would like to say a lotwill tell later'

Longest Message with Punctuation removed

In [None]:
len(remove_punct)

888

In [None]:
stopwords.words('english')[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [None]:
remove_stopwords = [word for word in remove_punct.split() if word.lower() not in stopwords.words('english')]

remove Stopwords

In [None]:
remove_stopwords

['love',
 'start',
 'attractioni',
 'feel',
 'need',
 'every',
 'time',
 'around',
 'meshe',
 'first',
 'thing',
 'comes',
 'thoughtsI',
 'would',
 'start',
 'day',
 'end',
 'hershe',
 'every',
 'time',
 'dreamlove',
 'every',
 'breath',
 'namemy',
 'life',
 'happen',
 'around',
 'hermy',
 'life',
 'named',
 'herI',
 'would',
 'cry',
 'herwill',
 'give',
 'happiness',
 'take',
 'sorrowsI',
 'ready',
 'fight',
 'anyone',
 'herI',
 'love',
 'craziest',
 'things',
 'herlove',
 'dont',
 'proove',
 'anyone',
 'girl',
 'beautiful',
 'lady',
 'whole',
 'planetI',
 'always',
 'singing',
 'praises',
 'herlove',
 'start',
 'making',
 'chicken',
 'curry',
 'end',
 'makiing',
 'sambarlife',
 'beautiful',
 'thenwill',
 'get',
 'every',
 'morning',
 'thank',
 'god',
 'day',
 'meI',
 'would',
 'like',
 'say',
 'lotwill',
 'tell',
 'later']

##Shortcut to apply Remove Punctuation to all messages

In [None]:
def message_text_pre_process(text_message):
  remove_punct = [char for char in text_message if char not in string.punctuation]
  remove_punct = ''.join(remove_punct)
  remove_stopwords = [word for word in remove_punct.split() if word.lower() not in stopwords.words('english')]
  return remove_stopwords

In [None]:
email_data_df['message'].head(10).apply(message_text_pre_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
5    [FreeMsg, Hey, darling, 3, weeks, word, back, ...
6    [Even, brother, like, speak, treat, like, aids...
7    [per, request, Melle, Melle, Oru, Minnaminungi...
8    [WINNER, valued, network, customer, selected, ...
9    [mobile, 11, months, U, R, entitled, Update, l...
Name: message, dtype: object

In [40]:
bag_of_words = CountVectorizer(analyzer=message_text_pre_process).fit(email_data_df['message'])

In [None]:
bag_of_words_trf = bag_of_words.transform(email_data_df['message'])

In [43]:
tfidf_fit = TfidfTransformer().fit(bag_of_words_trf)

In [44]:
tfidf_trf = tfidf_fit.transform(bag_of_words_trf)

In [46]:
spam_detector_model = MultinomialNB().fit(tfidf_trf,email_data_df['class'])

In [47]:
test_message = email_data_df['message'][10]

In [48]:
bag_of_words_test_message = bag_of_words.transform([test_message])

In [49]:
tfidf_test_messsge = tfidf_fit.transform(bag_of_words_test_message)

In [52]:
spam_detector_model.predict(tfidf_test_messsge)[0]

'ham'

In [53]:
test_message = email_data_df['message'][12]

In [54]:
bag_of_words_test_message = bag_of_words.transform([test_message])

In [55]:
tfidf_test_messsge = tfidf_fit.transform(bag_of_words_test_message)

In [56]:
spam_detector_model.predict(tfidf_test_messsge)[0]

'spam'

In [57]:
prediction_for_all_messages = spam_detector_model.predict(tfidf_trf)

In [58]:
from sklearn.metrics import classification_report

In [59]:
print(classification_report(email_data_df['class'],prediction_for_all_messages))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      4825
        spam       1.00      0.85      0.92       747

    accuracy                           0.98      5572
   macro avg       0.99      0.92      0.95      5572
weighted avg       0.98      0.98      0.98      5572



# Pipeline Building

In [60]:
from sklearn.model_selection import train_test_split

In [61]:
msg_train, msg_test, class_train, class_test = train_test_split(email_data_df['message'],email_data_df['class']) 

In [62]:
print(msg_train.shape)
print(msg_test.shape)
print(class_train.shape)
print(class_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [63]:
from sklearn.pipeline import Pipeline

In [64]:
text_pipeline = Pipeline([
                          ('bag_of_words',CountVectorizer(analyzer=message_text_pre_process)),
                          ('tfidf',TfidfTransformer()),
                          ('classifier', MultinomialNB())
])

In [65]:
text_pipeline.fit(msg_train,class_train)

Pipeline(steps=[('bag_of_words',
                 CountVectorizer(analyzer=<function message_text_pre_process at 0x7f27f14f9b00>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [66]:
text_pred = text_pipeline.predict(msg_test)

In [67]:
print(classification_report(text_pred,class_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1278
        spam       0.69      1.00      0.82       115

    accuracy                           0.96      1393
   macro avg       0.85      0.98      0.90      1393
weighted avg       0.97      0.96      0.97      1393



In [68]:
msg_test.iloc[0]

'happened here while you were adventuring'

In [69]:
class_test.iloc[0]

'ham'