In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

sms_spam = \
pd.read_csv('SMSSpamCollection', sep='\t', header=None, \
            names=['Label', 'SMS'])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
sms_spam['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

In [3]:
# label spam mail as 0;  ham mail as 1;

sms_spam.loc[sms_spam['Label'] == 'spam', 'Label'] = 1
sms_spam.loc[sms_spam['Label'] == 'ham', 'Label'] = 0

sms_spam

Unnamed: 0,Label,SMS
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [4]:
sms_spam["SMS_Copy"] = sms_spam["SMS"]
sms_spam.head()

Unnamed: 0,Label,SMS,SMS_Copy
0,0,"Go until jurong point, crazy.. Available only ...","Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro...","Nah I don't think he goes to usf, he lives aro..."


In [5]:
import nltk

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem(txt):
    lst = []
    
    for i in txt.split():
        lst.append(ps.stem(i))
        
    return " ".join(lst)

sms_spam["SMS"] = sms_spam["SMS"].apply(stem)

In [6]:
sms_spam

Unnamed: 0,Label,SMS,SMS_Copy
0,0,"go until jurong point, crazy.. avail onli in b...","Go until jurong point, crazy.. Available only ..."
1,0,ok lar... joke wif u oni...,Ok lar... Joking wif u oni...
2,1,free entri in 2 a wkli comp to win fa cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,u dun say so earli hor... u c alreadi then say...,U dun say so early hor... U c already then say...
4,0,"nah i don't think he goe to usf, he live aroun...","Nah I don't think he goes to usf, he lives aro..."
...,...,...,...
5567,1,thi is the 2nd time we have tri 2 contact u. u...,This is the 2nd time we have tried 2 contact u...
5568,0,will ü b go to esplanad fr home?,Will ü b going to esplanade fr home?
5569,0,"pity, * wa in mood for that. so...ani other su...","Pity, * was in mood for that. So...any other s..."
5570,0,the guy did some bitch but i act like i'd be i...,The guy did some bitching but I acted like i'd...


In [7]:
# separating the data as texts and label

X = sms_spam['SMS']

Y = sms_spam['Label']

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# Transform the text data to bag-of-words feature vectors
feature_extraction = CountVectorizer(stop_words='english')

X_train_features_bow = feature_extraction.fit_transform(X_train)
X_test_features_bow = feature_extraction.transform(X_test)

# Convert Y_train and Y_test values to integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [36]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


In [11]:
X_train_features_bow.toarray().shape

(4457, 7087)

In [39]:
X_train_features_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
model = LogisticRegression()

In [13]:
# training the Logistic Regression model with the training data
model.fit(X_train_features_bow, Y_train)

Evaluating the trained model

In [14]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features_bow)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [15]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9964101413506843


In [16]:
sms_spam["SMS"][2]

"free entri in 2 a wkli comp to win fa cup final tkt 21st may 2005. text fa to 87121 to receiv entri question(std txt rate)t&c' appli 08452810075over18'"

In [17]:
input_mail = [sms_spam["SMS"][2]]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
      print('Spam SMS')

else:
      print('Ham SMS')

[1]
Spam SMS


In [18]:
# Recomm

In [19]:
spam_only = sms_spam[sms_spam["Label"] == 1].reset_index(drop =True)
spam_only.head()

Unnamed: 0,Label,SMS,SMS_Copy
0,1,free entri in 2 a wkli comp to win fa cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
1,1,freemsg hey there darl it' been 3 week' now an...,FreeMsg Hey there darling it's been 3 week's n...
2,1,winner!! as a valu network custom you have bee...,WINNER!! As a valued network customer you have...
3,1,had your mobil 11 month or more? u r entitl to...,Had your mobile 11 months or more? U R entitle...
4,1,"six chanc to win cash! from 100 to 20,000 poun...","SIX chances to win CASH! From 100 to 20,000 po..."


In [20]:
cv = CountVectorizer(stop_words='english')

In [21]:
vector = cv.fit_transform(spam_only['SMS']).toarray()
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
vector.shape

(747, 2639)

In [23]:
list(cv.get_feature_names_out())[500:510]

['440',
 '4403ldnw1a7rw18',
 '44345',
 '447797706009',
 '447801259231',
 '448712404000',
 '449050000301',
 '449071512431',
 '45',
 '450']

In [24]:
# Calculate vectors

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
similarity = cosine_similarity(vector)

In [27]:
similarity.shape

(747, 747)

In [28]:
similarity

array([[1.        , 0.05063697, 0.        , ..., 0.1315587 , 0.10127394,
        0.04499213],
       [0.05063697, 1.        , 0.        , ..., 0.        , 0.06666667,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.05923489,
        0.15789474],
       ...,
       [0.1315587 , 0.        , 0.        , ..., 1.        , 0.17320508,
        0.        ],
       [0.10127394, 0.06666667, 0.05923489, ..., 0.17320508, 1.        ,
        0.11846978],
       [0.04499213, 0.        , 0.15789474, ..., 0.        , 0.11846978,
        1.        ]])

In [29]:
msg = "you win 1000 cash prize"

query_vec = cv.transform([msg])
word_similarity = cosine_similarity(query_vec, vector)[0]
word_similarity.shape

(747,)

In [30]:
most_similar = sorted(list(enumerate(word_similarity)), reverse=True, key = lambda x:x[-1])[:10]
for i in most_similar:
    print(spam_only.iloc[i[0],2])

Win a £1000 cash prize or a prize worth £5000
You have WON a guaranteed £1000 cash or a £2000 prize.To claim yr prize call our customer service representative on
You have WON a guaranteed £1000 cash or a £2000 prize. To claim yr prize call our customer service representative on 08714712394 between 10am-7pm
This is the 2nd attempt to contract U, you have won this weeks top prize of either £1000 cash or £200 prize. Just call 09066361921
You have WON a guaranteed £1000 cash or a £2000 prize. To claim yr prize call our customer service representative on 08714712379 between 10am-7pm Cost 10p
You have WON a guaranteed £1000 cash or a £2000 prize. To claim yr prize call our customer service representative on 08714712412 between 10am-7pm Cost 10p
Congratulations YOU'VE Won. You're a Winner in our August £1000 Prize Draw. Call 09066660100 NOW. Prize Code 2309.
Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed £1000 cash or £5000 prize

In [31]:
def type_of_msgs(msg):
    query_vec = cv.transform([msg])
    word_similarity = cosine_similarity(query_vec, vector)[0]
    most_similar = sorted(list(enumerate(word_similarity)), reverse=True, key = lambda x:x[-1])[:10]
    for i in most_similar:
        print(spam_only.iloc[i[0],2])

In [32]:
type_of_msgs("You are great! You have won a car. Please call for more details")

LIFE has never been this much fun and great until you came in. You made it truly special for me. I won't forget you! enjoy @ one gbp/sms
449050000301 You have won a £2,000 price! To claim, call 09050000301.
Money i have won wining number 946 wot do i do next
Claim a 200 shopping spree, just call 08717895698 now! Have you won! MobStoreQuiz10ppm
You have won ?1,000 cash or a ?2,000 prize! To claim, call09050000327
You've won tkts to the EURO2004 CUP FINAL or £800 CASH, to collect CALL 09058099801 b4190604, POBOX 7876150ppm
You've won tkts to the EURO2004 CUP FINAL or £800 CASH, to collect CALL 09058099801 b4190604, POBOX 7876150ppm
our mobile number has won £5000, to claim calls us back or ring the claims hot line on 09050005321.
You have won ?1,000 cash or a ?2,000 prize! To claim, call09050000327. T&C: RSTM, SW7 3SS. 150ppm
Great News! Call FREEFONE 08006344447 to claim your guaranteed £1000 CASH or £2000 gift. Speak to a live operator NOW!


In [33]:
sms_spam["SMS_Copy"][5567]

'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

In [34]:
sms_spam

Unnamed: 0,Label,SMS,SMS_Copy
0,0,"go until jurong point, crazy.. avail onli in b...","Go until jurong point, crazy.. Available only ..."
1,0,ok lar... joke wif u oni...,Ok lar... Joking wif u oni...
2,1,free entri in 2 a wkli comp to win fa cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,u dun say so earli hor... u c alreadi then say...,U dun say so early hor... U c already then say...
4,0,"nah i don't think he goe to usf, he live aroun...","Nah I don't think he goes to usf, he lives aro..."
...,...,...,...
5567,1,thi is the 2nd time we have tri 2 contact u. u...,This is the 2nd time we have tried 2 contact u...
5568,0,will ü b go to esplanad fr home?,Will ü b going to esplanade fr home?
5569,0,"pity, * wa in mood for that. so...ani other su...","Pity, * was in mood for that. So...any other s..."
5570,0,the guy did some bitch but i act like i'd be i...,The guy did some bitching but I acted like i'd...


In [43]:
sms_spam["SMS_Copy"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'