## NLP-Based Spam Filter

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import re
import nltk

In [2]:
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
# or 
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv("mail_data.csv")
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [6]:
df['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
df.loc[df['Category']=='spam','Category']=0
df.loc[df['Category']=='ham','Category']=1


In [8]:
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


### Text Preprocessing and Normalization Pipeline (Stemming)

In [9]:
clean_text=[]
# lemmatizer=WordNetLemmatizer()
ps=PorterStemmer()
for text in df['Message']:
    text = str(text).lower()                      # lowercase the text
    text = re.sub(r"http\S+|www\S+", " ", text)   # remove links
    text = re.sub(r"[^a-z0-9\s]", " ", text)      # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()      # Deals with the spaces
    text=text.split()
    text=[ps.stem(word) for word in text if not word in stopwords.words('english')]
    # text=[lemmatizer.lemmatize(word,pos='n') for word in text ]
    text=' '.join(text)
    clean_text.append(text)

In [10]:
df['Message']=clean_text 
df.head()

Unnamed: 0,Category,Message
0,1,go jurong point crazi avail bugi n great world...
1,1,ok lar joke wif u oni
2,0,free entri 2 wkli comp win fa cup final tkt 21...
3,1,u dun say earli hor u c alreadi say
4,1,nah think goe usf live around though


In [11]:
x=df["Message"]
y=df["Category"]

In [12]:
x ,y

(0       go jurong point crazi avail bugi n great world...
 1                                   ok lar joke wif u oni
 2       free entri 2 wkli comp win fa cup final tkt 21...
 3                     u dun say earli hor u c alreadi say
 4                    nah think goe usf live around though
                               ...                        
 5567    2nd time tri 2 contact u u 750 pound prize 2 c...
 5568                                b go esplanad fr home
 5569                                    piti mood suggest
 5570    guy bitch act like interest buy someth els nex...
 5571                                       rofl true name
 Name: Message, Length: 5572, dtype: object,
 0       1
 1       1
 2       0
 3       1
 4       1
        ..
 5567    0
 5568    1
 5569    1
 5570    1
 5571    1
 Name: Category, Length: 5572, dtype: object)

### Train-Test Split

In [13]:
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [14]:

X_train.shape, X_test.shape



((4457,), (1115,))

### TF-IDF Vectorization for Feature Extraction

In [15]:
tfidf_vect=TfidfVectorizer(max_features=5000,ngram_range=(1, 2))
X_train_vect=tfidf_vect.fit_transform(X_train)
X_test_vect=tfidf_vect.transform(X_test)

In [16]:
Y_train.info()

<class 'pandas.core.series.Series'>
Index: 4457 entries, 1978 to 860
Series name: Category
Non-Null Count  Dtype 
--------------  ----- 
4457 non-null   object
dtypes: object(1)
memory usage: 69.6+ KB


In [17]:
# the data type is object so we convert it into integer
Y_train=Y_train.astype("int")
Y_test=Y_test.astype("int")

In [18]:
Y_test.info() , Y_test.info()

<class 'pandas.core.series.Series'>
Index: 1115 entries, 3245 to 4293
Series name: Category
Non-Null Count  Dtype
--------------  -----
1115 non-null   int64
dtypes: int64(1)
memory usage: 17.4 KB
<class 'pandas.core.series.Series'>
Index: 1115 entries, 3245 to 4293
Series name: Category
Non-Null Count  Dtype
--------------  -----
1115 non-null   int64
dtypes: int64(1)
memory usage: 17.4 KB


(None, None)

In [19]:
print(X_train_vect)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 42567 stored elements and shape (4457, 5000)>
  Coords	Values
  (0, 3803)	0.1808953784554354
  (0, 4868)	0.2063940379087429
  (0, 73)	0.23083251037745958
  (0, 4823)	0.24740752938477936
  (0, 4920)	0.23437022534488225
  (0, 1191)	0.2943700110381919
  (0, 4002)	0.16820783810283962
  (0, 4260)	0.18285336991075488
  (0, 301)	0.3076532851499006
  (0, 1387)	0.21596990291289944
  (0, 4031)	0.20890142451053134
  (0, 3825)	0.3164625616243326
  (0, 4869)	0.2802891730567505
  (0, 77)	0.2890984495311825
  (0, 4013)	0.25495002116918614
  (0, 4262)	0.3076532851499006
  (1, 1875)	0.3065262404987398
  (1, 4185)	0.3508961320068731
  (1, 4531)	0.3316887877237347
  (1, 388)	0.2743156681165833
  (1, 1319)	0.24743531396135038
  (1, 3925)	0.4124577188266966
  (1, 1921)	0.2364749555069939
  (1, 1370)	0.3079763743197681
  (1, 2195)	0.26705408287533616
  :	:
  (4452, 3036)	0.4670752347285049
  (4453, 881)	0.29054125681050047
  (4453, 4175)	0.418226

In [20]:
# model= LogisticRegression()

## Model Training and Evaluation (Multinomial Naive Bayes)

In [21]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(X_train_vect,Y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [22]:
y_pred = model.predict(X_test_vect)
print("Accuracy:", accuracy_score(Y_test, y_pred))
print("\nClassification Report:\n", classification_report(Y_test, y_pred))

Accuracy: 0.9730941704035875

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89       149
           1       0.97      1.00      0.98       966

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



## Testing on manual data

In [27]:
input=["!!! **CONGRATULATIONS**!!! You have been SELECTED for a FREE $1000 Gift Card! Click HERE NOW to claim your PRIZE."]
x_vict=tfidf_vect.transform(input)
y_pred_t = model.predict(x_vict)

if (y_pred_t==0):
    print("This is a spam mail")
else :
    print("This is a genuine mail")

This is a spam mail


## Model loading using jobliob

In [24]:
model_data ={
    "model": model,
    "vect": tfidf_vect
}
joblib.dump(model_data,"Model.joblib")

['Model.joblib']