In [2]:
import pandas as pd

messages = pd.read_csv('SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [3]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
wordnet = WordNetLemmatizer()
corpus = []
for i in range(0,len(messages)):
  review = re.sub('^[a-zA-z]'," ",messages["message"][i])
  review = review.lower()
  review = review.split()
  review = [wordnet.lemmatize(word) for word in review if not word in stopwords.words("english")]
  review = " ".join(review)
  corpus.append(review) #cleaning,lower,split,lemmatization

In [7]:
#bag of word Dictornary of words w.r. to words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer() # words from aall sentences
X = cv.fit_transform(corpus).toarray()

In [8]:
X.shape,X

((5572, 8827), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [9]:
corpus,messages.shape

(['jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...',
  'k lar... joking wif u oni...',
  "ree entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question(std txt rate)t&c's apply 08452810075over18's",
  'dun say early hor... u c already say...',
  'ah think go usf, life around though',
  "reemsg hey darling 3 week's word back! i'd like fun still? tb ok! xxx std chgs send, £1.50 rcv",
  'ven brother like speak me. treat like aid patent.',
  "per request 'melle melle (oru minnaminunginte nurungu vettam)' set callertune callers. press *9 copy friend callertune",
  'inner!! valued network customer selected receivea £900 prize reward! claim call 09061701461. claim code kl341. valid 12 hour only.',
  'ad mobile 11 month more? u r entitled update latest colour mobile camera free! call mobile update co free 08002986030',
  "'m gonna home soon want talk stuff anymore tonight, k? i've cried enough today.",
  'ix chance win 

In [10]:
y = pd.get_dummies(messages['label'])
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [11]:
y = y.iloc[:,1].values
y #considering 1 col from ham scam  which we pick 1 col which specify all  

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [12]:
(y == 1).sum(),(y == 0).sum()

(747, 4825)

In [13]:
!pip install imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:

import imblearn
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)

In [15]:
X_over.shape, y_over.shape,(y_over == 1).sum(),(y_over == 0).sum()

((9650, 8827), (9650,), 4825, 4825)

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_over,y_over,test_size=0.1,random_state  = 0,stratify = y_over )

In [17]:
from sklearn.naive_bayes import MultinomialNB
spam_detector_model = MultinomialNB(alpha=0.1).fit(X_train,y_train)
y_pred = spam_detector_model.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix,accuracy_score
confusion_m = confusion_matrix(y_test,y_pred)
print(confusion_m)
accuracy  = accuracy_score(y_test,y_pred)
print(accuracy)

[[475   8]
 [  8 474]]
0.983419689119171


In [19]:
#with tf-idf insted of bag of words
#now applying TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()

In [20]:
import imblearn
from imblearn.over_sampling import RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_over,y_over,test_size=0.1,random_state  = 0,stratify = y_over )

In [22]:
from sklearn.naive_bayes import MultinomialNB
spam_detector_model = MultinomialNB(alpha=0.1).fit(X_train,y_train)
y_pred = spam_detector_model.predict(X_test)

In [23]:
from sklearn.metrics import confusion_matrix,accuracy_score
confusion_m = confusion_matrix(y_test,y_pred)
print(confusion_m)
accuracy  = accuracy_score(y_test,y_pred)
print(accuracy)

[[474   9]
 [  4 478]]
0.9865284974093265


In [25]:
import pickle
pickle.dump(spam_detector_model, open('model.pkl','wb'))
appp=pickle.load(open('model.pkl','rb'))