In [1]:
import numpy as np
import pandas as pd
# Load the raw ham/spam message dataset from the working directory and preview the first rows.
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# (rows, columns) of the raw frame.
df.shape

(5572, 5)

In [3]:
# Column dtypes and non-null counts, used to spot mostly-empty columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
# Drop the three overflow columns, which df.info() shows are almost entirely empty
# (50, 12 and 6 non-null values out of 5572 rows).
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell safe to
# re-run: a second execution no longer raises KeyError for already-removed columns.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [5]:
# Give the two remaining columns descriptive names.
# Reassigning instead of mutating in place keeps the cell's effect explicit;
# rename ignores keys that are absent, so re-running the cell is harmless.
df = df.rename(columns={'v1': 'label', 'v2': 'text'})

In [6]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels with integer codes; the preview in the next cell
# shows 'ham' encoded as 0 and 'spam' encoded as 1.
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

In [7]:
# Confirm the renamed columns and the encoded labels.
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Missing values per column.
df.isnull().sum()

label    0
text     0
dtype: int64

In [9]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

403

In [10]:
# Remove the repeated rows counted above, keeping the first occurrence of each.
# The original index labels are kept (no reset), so later outputs show gaps in the index.
df.drop_duplicates(inplace=True)

In [11]:
# Shape after removing duplicates.
df.shape

(5169, 2)

In [12]:
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance shared by every transform_text call.
ps = PorterStemmer()

In [13]:
# Lazily-filled cache of NLTK's English stop words; populated by _english_stopwords().
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text,
      2. tokenise it with ``nltk.word_tokenize``,
      3. keep only tokens for which ``str.isalnum()`` is true,
      4. drop English stop words,
      5. stem each remaining token with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' if nothing survives).
    """
    # The stop-word list used to be re-read from the corpus for every single
    # token; it is loop-invariant, so fetch it once (as a set, for O(1) lookup).
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())

    # Tokens that pass isalnum() are non-empty and contain no punctuation
    # characters, so the former `not in string.punctuation` check could never
    # reject anything and is omitted.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)

In [14]:
# Normalise every message once up front; the model is trained on this column.
df['transformed_text'] = df['text'].apply(transform_text)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the messages for evaluation; random_state fixes the shuffle for reproducibility.
# NOTE(review): the split is not stratified and the classes are imbalanced (the report
# below shows 127 spam vs 907 ham in the test set) — consider stratify=df['label'].
X_train,X_test,y_train,y_test = train_test_split(df['transformed_text'],df['label'],test_size=0.2,random_state= 125 )

In [18]:
# Inspect the held-out (already transformed) messages.
X_test

2201                               boy late 2 home father
2698                                      88066 lost help
205        ìï say like dat dun buy ericsson oso oredi lar
245                         late said websit dont slipper
3387                              kalli dismissi 2nd test
                              ...                        
5015                 hey gal u wan na meet 4 dinner nìâte
1063    new local date area lot new peopl regist area ...
623                         sorri got thing may pub later
2683                           okay chase dream good next
4600    freemsg txt call 86888 claim reward 3 hour tal...
Name: transformed_text, Length: 1034, dtype: object

In [19]:
from sklearn.pipeline import make_pipeline
# TF-IDF features (max_features=3000) feeding a multinomial Naive Bayes classifier,
# bundled so vectorising and classifying happen in one predict() call.
pipe = make_pipeline(TfidfVectorizer(max_features=3000), MultinomialNB())
pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(max_features=3000)),
                ('multinomialnb', MultinomialNB())])

In [20]:
# Predicted labels for the held-out messages.
pred = pipe.predict(X_test)

In [21]:
# Held-out evaluation, one result per line: accuracy, confusion matrix,
# precision for the positive class, then the full per-class report.
print(
    accuracy_score(y_test, pred),
    confusion_matrix(y_test, pred),
    precision_score(y_test, pred),
    classification_report(y_test, pred),
    sep='\n',
)

0.9835589941972921
[[907   0]
 [ 17 110]]
1.0
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       907
           1       1.00      0.87      0.93       127

    accuracy                           0.98      1034
   macro avg       0.99      0.93      0.96      1034
weighted avg       0.98      0.98      0.98      1034



In [22]:
# Pipeline score on the training split, for comparison with the held-out accuracy above.
pipe.score(X_train,y_train)

0.9804111245465538

In [23]:
import pickle
# Persist the fitted pipeline. The context manager guarantees the file is flushed
# and closed, which the bare open(...) passed straight to pickle.dump did not.
# NOTE: only unpickle model.pkl from a trusted source — loading a pickle can run arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [24]:
# Spot check on a hand-written prize / gift-card message; the recorded output is 1 (the spam code).
pipe.predict([transform_text('You’ve won a prize! Go to [link] to claim your $500 Amazon gift card.')])

array([1])

In [25]:
# Spot check on a regulatory-notice style message.
# NOTE(review): the recorded output is 1 (the spam code), which looks like a false positive — confirm.
pipe.predict([transform_text('Dear Investor,With reference to NSE circular NSE/INSP/46704 dated December 17, 2020 and NSE/INSP/46960 dated January 08, 2021, Stock Brokers are required to upload clients fund balance and securities balance on weekly basis.')])

array([1])

In [26]:
# Hand-written package-delivery sample message used for a spot check.
# The literal previously opened with four quotes, which left a stray apostrophe
# as the first character of the string.
txt = ''' [Name], we came across a package from [a recent month] pending for you. Kindly claim ownership and confirm for delivery here: [link]. '''

In [27]:
# Spot check on the package-delivery sample defined in the previous cell.
# NOTE(review): the recorded output is 0 (the ham code); if this text is meant as a scam
# example, that is a false negative — confirm.
pipe.predict([transform_text(txt)])

array([0])