In [5]:
#basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [6]:
#import nlp libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
# Fetch the NLTK resources this notebook relies on. The tokenizer models
# (punkt / punkt_tab) are presumably what word_tokenize needs, and wordnet is
# the corpus behind WordNetLemmatizer.
# NOTE(review): the stopwords corpus is downloaded (and imported above) but no
# visible cell uses it.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
#import model libraries
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [10]:
# Read only the two columns of interest (v1, v2) from the spam CSV and relabel
# them positionally as label / message in a single expression.
df = pd.read_csv(
    '/content/spam.csv',
    encoding='latin-1',
    usecols=['v1', 'v2'],
).set_axis(['label', 'message'], axis='columns')

In [11]:
# Preview the first rows to confirm the label/message columns loaded as expected.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Check row count, dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [13]:
# Class balance expressed as fractions (normalize=True) rather than raw counts.
df['label'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
ham,0.865937
spam,0.134063


The classes are highly imbalanced: roughly 87% of the messages are ham and only 13% are spam.

In [14]:
### Plan
### 1. Clean each message: replace non-letter characters using re, then lowercase
### 2. Tokenize the messages
### 3. Apply lemmatization
### 4. Train/test split the data
### 5. Apply the TF-IDF vectorizer
### 6. Check the data
### 7. Train a Multinomial Naive Bayes classifier

In [18]:
lemmatizer = WordNetLemmatizer()

# Compiled once instead of being re-specified for every message.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def clean_message(text):
    """Normalise one raw message into a space-separated string of lemmas.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the result,
      3. split it into tokens with ``word_tokenize``,
      4. lemmatize each token (no part-of-speech tag is passed to the
         lemmatizer),
      5. join the lemmas back together with single spaces.

    Parameters
    ----------
    text : str
        The raw message text.

    Returns
    -------
    str
        The cleaned message; an empty string if no letters remain.
    """
    tokens = word_tokenize(_NON_LETTERS.sub(' ', text).lower())
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Iterate over the column's values directly rather than indexing with
# df['message'][i]: that lookup is label-based, so a positional counter only
# works while the index happens to be 0..n-1 (e.g. it breaks after rows are
# filtered or dropped without a reset_index).
corpus = [clean_message(message) for message in df['message']]

In [22]:
# Show a small sample only; echoing the full list would dump one string per
# message (thousands of lines) into the notebook output.
corpus[:10]

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [34]:
# Features are the cleaned messages; the target is the label column encoded
# as ham -> 0, spam -> 1 (a plain dict mapping, not sklearn's LabelEncoder).
x = corpus

label_ids = {"ham": 0, "spam": 1}

# Series.map turns every value that is missing from the dict into NaN, so
# blindly re-running this cell on already-encoded labels would silently wipe
# the whole column. Encode only when the column is not already 0/1, and fail
# loudly if anything other than 'ham'/'spam' is found.
if not df["label"].isin(list(label_ids.values())).all():
    encoded_labels = df["label"].map(label_ids)
    if encoded_labels.isna().any():
        raise ValueError(
            "df['label'] contains values other than 'ham'/'spam'; "
            "reload the data before encoding the labels"
        )
    df["label"] = encoded_labels

y = df["label"]

In [41]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Split the raw text first (same 80/20 split and seed as before), keeping the
# text halves under their own names.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the vocabulary / IDF weights from the training text only, then apply
# the fitted vectorizer to the held-out text. Note that x_train / x_test are
# rebound here from lists of strings to TF-IDF matrices.
tf = TfidfVectorizer(max_features=3000)
x_train = tf.fit_transform(x_train_text)
x_test = tf.transform(x_test_text)

In [45]:
# Bundle the TF-IDF vectorizer and the multinomial Naive Bayes classifier so
# they are always fit and applied together on raw text.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(max_features=3000)),
    ("model", MultinomialNB(alpha=1.0)),
])

# fit() returns the pipeline itself, so training and prediction can be chained.
y_pred = pipeline.fit(x_train_text, y_train).predict(x_test_text)

# Headline accuracy, then the per-class report and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


Accuracy: 0.9695067264573991
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

[[965   0]
 [ 34 116]]


In [47]:
import pickle

# Persist the fitted pipeline (vectorizer + classifier) so it can be reused
# outside this notebook. The context manager guarantees the file is flushed
# and closed as soon as the dump finishes, instead of leaving an anonymous
# open handle for the garbage collector to clean up.
# NOTE: only unpickle this file from a trusted source - pickle.load can
# execute arbitrary code.
with open("spam_model.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)

In [51]:
from sklearn.metrics import accuracy_score

# Score the fitted pipeline on both the text it was trained on and the
# held-out text; the difference between the two accuracies is reported as
# the train/test gap.
y_train_pred, y_test_pred = (
    pipeline.predict(texts) for texts in (x_train_text, x_test_text)
)

train_acc, test_acc = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

print("Train Accuracy:", train_acc)
print("Test Accuracy :", test_acc)
print("Gap           :", train_acc - test_acc)


Train Accuracy: 0.9816019744222572
Test Accuracy : 0.9695067264573991
Gap           : 0.012095247964858058
