In [1]:
from nltk.corpus import stopwords
import pandas as pd
from nltk.stem import PorterStemmer
import re
import numpy as np

In [2]:
data = pd.read_csv('D://ml_datasets//NLP-Live-main//smsspamcollection//SMSSpamCollection', sep='\t', names=["label", "message"])

In [3]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#used for stemming
ps = PorterStemmer()

In [5]:
corpus = []
for i in range(0, len(data)):
    message = re.sub('[^a-zA-Z0-9]', ' ', data['message'][i])
    message = message.lower()
    message = message.split()
    # stem each word if not a stopword
    message = [ps.stem(word) for word in message if not word in stopwords.words('english')]
    message = ' '.join(message)
    corpus.append(message)
        

In [6]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl 3 week word back like fun still tb ok xxx std chg send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6day 16 tsandc appli repli hl 4 info',
 'urgent 1 week free mem

In [7]:
#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
# max features means vocabulary range ngram(1,2) uni gram and bi gram 
cv = CountVectorizer(max_features=2500, ngram_range=(1,2), binary=True)
X = cv.fit_transform(corpus)

In [8]:
y = pd.get_dummies(data['label'])
y = y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
# random forest 
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier().fit(X_train,y_train)

In [11]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_pred=y_pred, y_true=y_test)
accuracy

0.9847533632286996

In [12]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       972
           1       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



In [13]:
#w2v
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

words = []
for i in range(0,len(data)):
    tokens = sent_tokenize(corpus[i])
    for token in tokens:
        words.append(simple_preprocess(token))


In [14]:
words

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli',
  'over'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
  'prize',
  'r

In [15]:
from gensim.models import Word2Vec
w2v = Word2Vec(words, window=5, min_count=2)

In [16]:
#total vocabulary
len(w2v.wv.index2word)

3262

In [17]:
w2v.wv['embarass']

array([ 3.37784849e-02, -4.83742654e-02,  1.08175613e-02, -4.76229750e-03,
       -8.03529192e-03,  2.89301574e-02, -2.06888895e-02,  5.10035977e-02,
       -3.78306326e-03, -3.41841672e-03, -4.92423624e-02,  1.32310186e-02,
        4.63477522e-02, -3.00128311e-02, -7.30480775e-02, -7.95302540e-03,
       -2.78253499e-02, -3.33677977e-02,  1.05850911e-02, -1.25318579e-02,
        3.93839292e-02,  2.43021511e-02,  3.99931259e-02, -8.43819976e-03,
       -4.93680965e-03, -5.64196296e-02,  5.31646609e-02,  4.77422960e-02,
        2.83279479e-03, -5.05278297e-02, -7.03219250e-02,  2.52771545e-02,
       -1.23778463e-03, -6.78396504e-03,  1.25453901e-02,  1.85143836e-02,
       -1.18241515e-02,  9.79369786e-03, -4.96764630e-02,  3.05847377e-02,
        4.63703647e-02,  3.74353003e-05,  1.64784323e-02, -6.17898926e-02,
        5.80048412e-02, -5.86369298e-02,  1.95320267e-02, -1.98374782e-02,
       -6.37475476e-02,  5.02897464e-02,  9.84280556e-02, -9.53250658e-03,
        2.35697273e-02, -

In [18]:
def avg_word2vec(sentence):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    doc = sentence.split()
    return np.mean([w2v.wv[word] for word in doc if word in w2v.wv.index2word],axis=0)

In [28]:
X_w2v = []

for i in range(len(data)):
    X_w2v.append(avg_word2vec(data['message'][i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [42]:
len(X_w2v)

5572

In [30]:
#some array are empty remove them
def is_non_empty_iterable(x):
    return isinstance(x, (tuple, list, np.ndarray)) and len(x) > 0

In [44]:
filtered_X = []
filtered_y = []
for x, y in zip(X_w2v, y):
    # Check if x is a tuple/list/array and not empty
    if isinstance(x, (tuple, list, np.ndarray)) and len(x) > 0:
        filtered_X.append(x)
        filtered_y.append(y)


In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(filtered_X, filtered_y, test_size=0.2, random_state=0)

In [50]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier().fit(X_train,y_train)

In [52]:
y_pred = classifier.predict(X_test)

In [53]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_pred,y_test))

0.9252988047808764


In [54]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.92      0.96       890
           1       0.61      0.94      0.74       114

    accuracy                           0.93      1004
   macro avg       0.80      0.93      0.85      1004
weighted avg       0.95      0.93      0.93      1004

