### Spam or Ham Classification

In [1]:
# importing the dataset
import pandas as pd
messages=pd.read_csv('SMSSpamCollection.txt',sep='\t', names=["label","message"])
# sep='\t' -- separator is tab here 

In [3]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# pick up any record
messages['message'].loc[451]

'hanks lotsly!'

In [7]:
# Data Cleaning and Preprocessing - tokenization, stopwords, stemming and lemmatization
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vartikathapa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [13]:
corpus=[]
for i in range(len(messages)):
    review=re.sub('[^a-zA-Z0-9]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)
    

In [15]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [17]:
# Convert text to vectors
# Creating Bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,binary=True,ngram_range=(2,2))
# max_features=2500 -- to 2500 features based on frequency
X=cv.fit_transform(corpus).toarray()
# cv is a model

In [19]:
X


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [21]:
X.shape

(5572, 2500)

In [23]:
# label encoding for spam and ham
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values

In [25]:
y

array([False, False,  True, ..., False, False, False])

In [27]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)

In [29]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [31]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [33]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [35]:
# Prediction
y_pred=spam_detect_model.predict(X_test)

In [37]:
y_pred

array([False,  True, False, ..., False,  True, False])

In [39]:
from sklearn.metrics import accuracy_score,classification_report

In [41]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9721973094170404


In [43]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.97      1.00      0.98       955
        True       1.00      0.81      0.89       160

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [45]:
# Creating TF-IDF Model
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X=tv.fit_transform(corpus).toarray()

In [47]:
# Train Test Split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [49]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [51]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [53]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9811659192825112


In [55]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       976
        True       0.87      1.00      0.93       139

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [57]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

In [58]:
y_pred=classifier.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

0.9838565022421525
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       955
        True       1.00      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [61]:
# Bag of Words model can also be applied after train ,test,split


### Word2Vec Implementation


In [64]:
!pip install gensim



In [66]:
# Creating model from scratch

In [68]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [70]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [71]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [74]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [76]:
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))
# simple_preprocess --- converts a document into a list of lowercase tokens


In [78]:
words

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'selected',
  'receivea',
 

In [80]:
import gensim

In [82]:
# Lets train Word2Vec model from scratch
model=gensim.models.Word2Vec(words,window=5,min_count=2)
# vector_size --- to give dimensions ;by default its is 100
# window -- for context 
# min_count=2 --- ignores words with total frequency lower than this 


In [84]:
model.wv.index_to_key
# displays vocabulary

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need',
 'one',
 'txt',
 'today',
 'going',
 'stop',
 'home',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'think',
 'tell',
 'week',
 'hi',
 'phone',
 'new',
 'later',
 'please',
 'pls',
 'co',
 'msg',
 'min',
 'make',
 'night',
 'dear',
 'message',
 'well',
 'say',
 'thing',
 'much',
 'oh',
 'hope',
 'claim',
 'great',
 'hey',
 'give',
 'number',
 'happy',
 'wat',
 'friend',
 'work',
 'way',
 'yes',
 'www',
 'prize',
 'let',
 'right',
 'tomorrow',
 'already',
 'tone',
 'ask',
 'win',
 'said',
 'life',
 'cash',
 'amp',
 'yeah',
 'im',
 'really',
 'meet',
 'babe',
 'find',
 'miss',
 'morning',
 'thanks',
 'last',
 'uk',
 'service',
 'year',
 'anything',
 'care',
 'would',
 'com',
 'also',
 'lol',
 'nokia',
 'feel',
 'every',
 'keep',
 'sure',
 'pick',
 'urgent',
 'sent',
 'contact',


In [86]:
model.corpus_count
# gives vocabulary count

5564

In [88]:
model.epochs

5

In [90]:
model.wv.similar_by_word('kid')
# to find out similar words from the dataset itself

[('work', 0.9977803230285645),
 ('much', 0.997776448726654),
 ('went', 0.9976928234100342),
 ('money', 0.9976789951324463),
 ('said', 0.9976316690444946),
 ('love', 0.9976289868354797),
 ('going', 0.9976272583007812),
 ('day', 0.9976223707199097),
 ('make', 0.9976174235343933),
 ('babe', 0.997617244720459)]

In [92]:
model.wv['kid'].shape

(100,)

In [94]:
import numpy as np
# Each word is represented by 100 dimension vector
def avg_word2vec(doc):
     # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    # return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
     #or [np.zeros(len(model.wv.index_to_key))], axis=0)
    vecs = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    if len(vecs) == 0:
        return np.zeros(model.vector_size)  # fallback: zero vector if nothing found
    return np.mean(vecs, axis=0)

In [96]:
!pip install tqdm 



In [98]:
from tqdm import tqdm

In [100]:
words[73]   #matrix (sentence,word)

['performed']

In [102]:
type(model.wv.index_to_key)

list

In [104]:
import numpy as np
#apply for the entire sentences
X=[]
valid_indices = []
for i in tqdm(range(len(words))):
    # print("Hello",i)
    # X.append(avg_word2vec(words[i]))
    vec = avg_word2vec(words[i])
    if vec is not None and not np.isnan(vec).any():
        X.append(vec)
        valid_indices.append(i)

100%|████████████████████████████████████| 5564/5564 [00:00<00:00, 13118.04it/s]


In [106]:
type(X)

list

In [108]:
X_new=np.array(X,dtype='float32')

In [110]:
X_new[0]

array([-0.10880505,  0.27740976,  0.13930932,  0.03647664,  0.0617937 ,
       -0.31541035,  0.07010501,  0.521972  , -0.15534362, -0.12523372,
       -0.14666188, -0.36398092, -0.022359  ,  0.13044451,  0.10534834,
       -0.268351  ,  0.01075804, -0.35647205,  0.00682838, -0.43245542,
        0.10826083,  0.15827677,  0.10367743, -0.1207981 , -0.10501587,
        0.0162136 , -0.19733544, -0.1626835 , -0.25172034,  0.03314524,
        0.29723397,  0.04954095,  0.14742942, -0.24211968, -0.13988516,
        0.27327913,  0.01157146, -0.21898073, -0.1680045 , -0.38461646,
        0.03229516, -0.22309512, -0.07871716,  0.07192492,  0.24881399,
       -0.12554364, -0.18871297,  0.01841715,  0.14145313,  0.22391765,
        0.15972622, -0.29443854, -0.03624031, -0.02666925, -0.14720272,
        0.19378023,  0.17512693, -0.05659993, -0.28508458,  0.03679183,
        0.08871748,  0.1440564 , -0.13791238,  0.02223338, -0.26754758,
        0.16819234,  0.0883702 ,  0.21039023, -0.26558873,  0.32

In [112]:
y_trimmed = [y[i] for i in valid_indices]

In [114]:
# Train Test Split
X_train,X_test,y_train,y_test=train_test_split(X_new,y_trimmed,test_size=0.2,random_state=0)

In [116]:
from sklearn.ensemble import RandomForestClassifier

In [118]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [120]:
print("🔢 Shape of X_new:", X_new.shape)
print("🔢 Length of y:", len(y))


🔢 Shape of X_new: (5564, 100)
🔢 Length of y: 5572


In [130]:
print(len(X_test), len(y_test))

1113 1113


In [132]:
y_pred_rf = rf_model.predict(X_test)

In [136]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(accuracy_score(y_test, y_pred_rf))

0.8463611859838275
