In [1]:
#Project Flow

Problem Statement
Data Gathering
Data Preprocessing : Here we perform some operation on data A. Tokenization B. Lower Case C. Stopwords D. Lemmatization / Stemming
Vectorization (Convert Text data into the Vector): A. Bag Of Words (CountVectorizer) B. TF-IDF
Model Building : A. Model Object Initialization B. Train and Test Model
Model Evaluation : A. Accuracy Score B. Confusition Matrix C. Classification Report
Model Deployment
Prediction on Client Data

SyntaxError: invalid syntax (3054383612.py, line 3)

In [1]:
import pandas as pd

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from sklearn.feature_extraction.text import TfidfVectorizer


# 1 .Data Gathering

In [3]:
df = pd.read_csv("SMSSpamCollection.txt", sep = '\t', names = ['Label','Msg'] )
df.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   Msg     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
 df.isna().sum()

Label    0
Msg      0
dtype: int64

In [19]:
#to check the count of ham and spam 


In [6]:
df['Label'].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [7]:
corpus = []
lm = WordNetLemmatizer()
for i in range (len(df)):
    review = re.sub('^a-zA-Z0-9',' ',df['Msg'][i])
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = " ".join(review)
    corpus.append(review)   

In [8]:
df['Msg'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [9]:
len(df['Msg'])

5572

In [10]:
len(corpus)

5572

In [11]:
df['Msg']=corpus
df.head()

Unnamed: 0,Label,Msg
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


In [12]:
df['Msg'][0]

'go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...'

# Model Building

In [13]:
x = df['Msg']
y = df['Label']

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 10)

In [15]:
len(x_train), len(y_train)

(4457, 4457)

In [16]:
len(x_test),len(y_test)

(1115, 1115)

# Vectorization (Convert Text Data Into The Vectors)

In [17]:
tf_obj = TfidfVectorizer()
x_train_tfidf = tf_obj.fit_transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
x_train_tfidf.shape

(4457, 7460)

# Pipeline

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [21]:
model = Pipeline([('tfidf',TfidfVectorizer()),('model',MultinomialNB())])

In [22]:
model.fit(x_train,y_train)

In [23]:
prediction_testdata= model.predict(x_test)
accuracy_testdata = accuracy_score ( y_test , prediction_testdata)
print ( accuracy_testdata)

0.9605381165919282


In [24]:
prediction_traindata = model.predict(x_train)
accuracy_traindata = accuracy_score ( y_train , prediction_traindata)
print ( accuracy_traindata)

0.9831725375813327


In [25]:
from sklearn.metrics import precision_score

In [26]:
precision_traindata = precision_score ( y_train , prediction_traindata, pos_label='ham')
print( precision_traindata)
precision_testdata = precision_score ( y_test , prediction_testdata, pos_label='ham')
print ( precision_testdata)

0.9809063136456212
0.9566929133858267


In [27]:
from sklearn.metrics import recall_score
precision_testdata = recall_score ( y_test , prediction_testdata , pos_label='spam')
print ( precision_testdata)
precision_traindata = recall_score ( y_train , prediction_traindata , pos_label='spam')
print ( precision_testdata)

0.6923076923076923
0.6923076923076923


In [28]:
from sklearn.metrics import f1_score
f1_score_traindata = f1_score( y_train , prediction_traindata, pos_label='spam')
print ( f1_score_traindata)
f1_score_testdata = f1_score( y_test , prediction_testdata, pos_label='spam')
print ( f1_score_testdata)

0.9338040600176523
0.8181818181818182


In [29]:
#Confusion Matrix on Testing Data
y_pred_test = model.predict(x_test)
print("Confusion Matrix on Test Data:\n", confusion_matrix(y_test,y_pred_test))

Confusion Matrix on Test Data:
 [[972   0]
 [ 44  99]]


In [30]:
y_pred_test = model.predict(x_test)
print("Classification Reportx on Test Data:\n", classification_report(y_test,y_pred_test))

Classification Reportx on Test Data:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       972
        spam       1.00      0.69      0.82       143

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [31]:
def preprocess_data(text):
    review = re.sub('^a-zA-Z0-9',' ',text)
    review = review.lower()
    review = review.split()
    review = [data for data in review if data not in stopwords.words('english')]
    review = [lm.lemmatize(data) for data in review]
    review = " ".join(review)
    return [review]

In [32]:
user_data = df['Msg'][3]
print(user_data)
user_data = preprocess_data(user_data)
user_data

u dun say early hor... u c already say...


['u dun say early hor... u c already say...']

In [33]:
model.predict(user_data)[0]

'ham'

In [34]:
class prediction:
    
    def __init__(self,data):
        self.data = data
        
    def user_data_preprocessing(self):
        lm = WordNetLemmatizer()
        review = re.sub('^a-zA-Z0-9',' ',self.data)
        review = review.lower()
        review = review.split()
        review = [data for data in review if data not in stopwords.words('english')]
        review = [lm.lemmatize(data) for data in review]
        review = " ".join(review)
        return [review]
    
    def user_data_prediction(self):
        preprocess_data = self.user_data_preprocessing()
        
        if model.predict(preprocess_data)[0] == 'spam':
            return 'This Message is Spam'
            
        else:
            return 'This Message is Ham' 

In [35]:
df.head()

Unnamed: 0,Label,Msg
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


In [36]:
user_data = df['Msg'][2]
print(user_data)
prediction(user_data).user_data_prediction()

free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's


'This Message is Spam'

In [37]:
user_data = df['Msg'][3]
print(user_data)
prediction(user_data).user_data_prediction()

u dun say early hor... u c already say...


'This Message is Ham'

In [39]:
user_data = str(input("Please enter the SMS: "))
print(user_data)
prediction(user_data).user_data_prediction()

Please enter the SMS:  india won the trophy 


india won the trophy 


'This Message is Ham'

In [40]:
user_data = df['Msg'][3]
print(user_data)
prediction(user_data).user_data_prediction()

u dun say early hor... u c already say...


'This Message is Ham'