# Implementation of Word2Vec and AvgWord2Vec: SMS Spam/Ham identifier

In [30]:
import pandas as pd
import nltk
import numpy as np

df = pd.read_csv("resources/6.sms_spam.csv", encoding='latin-1')
df.drop(["Unnamed: 2",	"Unnamed: 3",	"Unnamed: 4"], axis=1, inplace=True)
df.rename({"v1":"label", "v2": "message"}, axis=1, inplace=True)
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [18]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sumeetjadhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
corpus = []

for message in df.message:
  reviews = re.sub('[^a-zA-Z]',' ', message)
  reviews = reviews.lower()
  reviews = reviews.split() ## converting sentence to words
  reviews = [lemmatizer.lemmatize(review) for review in reviews if review not in set(stopwords.words('english'))]
  reviews = (' ').join(reviews)
  corpus.append(reviews)

or 1 more method for data cleaning

In [20]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

words = []
for sent in df.message:
  sentance_tokenize = sent_tokenize(sent)
  for word in sentance_tokenize:
    words.append(simple_preprocess(word))

In [21]:
words

[['go', 'until', 'jurong', 'point', 'crazy'],
 ['available',
  'only',
  'in',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar'],
 ['joking', 'wif', 'oni'],
 ['free',
  'entry',
  'in',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may'],
 ['text',
  'fa',
  'to',
  'to',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'apply',
  'over'],
 ['dun', 'say', 'so', 'early', 'hor', 'already', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'goes',
  'to',
  'usf',
  'he',
  'lives',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back'],
 ['like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still'],
 ['tb', 'ok'],
 ['xxx', 'std', 'chgs', 'to', 'send', 'to', 'rcv'],
 ['even', 'my', 'brother', 'is', 'not', 'like', 'to', 'speak', 'with', 'me'],
 ['they', 'treat', '

In [22]:
import gensim

### Train Word2Vec from Scratch

In [23]:
model = gensim.models.Word2Vec(words) ## we can try different combinations like epochs, vector_size etc

#### To get All the Vocabulary

In [24]:
model.wv.index_to_key

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'call',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'at',
 'get',
 'be',
 'if',
 'ur',
 'will',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'how',
 'up',
 'when',
 'ok',
 'what',
 'free',
 'go',
 'from',
 'all',
 'out',
 'll',
 'know',
 'like',
 'good',
 'then',
 'am',
 'got',
 'day',
 'there',
 'was',
 'come',
 'he',
 'its',
 'time',
 'only',
 'love',
 'send',
 'want',
 'text',
 'txt',
 'as',
 'one',
 'going',
 'by',
 'need',
 'home',
 'about',
 'she',
 'stop',
 'don',
 'lor',
 'today',
 'sorry',
 'see',
 'still',
 'back',
 'da',
 'our',
 'reply',
 'dont',
 'mobile',
 'take',
 'tell',
 'hi',
 'new',
 'they',
 'later',
 'pls',
 'any',
 'her',
 'please',
 'think',
 'did',
 'been',
 'some',
 'phone',
 'week',
 'dear',
 'here',
 'ì_',
 'where',
 'who',
 'well',
 'has',
 're',
 'much',
 'an',
 'night',
 'great',
 'oh',
 'msg',
 'hope',
 'claim',


In [25]:
model.corpus_count ## number of words in vocabulary

11124

In [26]:
model.epochs ## How many epoch i.e. iterations done || more the epoch more the accuracy

5

In [27]:
model.wv.similar_by_word('good')

[('great', 0.998639702796936),
 ('last', 0.9985666275024414),
 ('day', 0.9984936118125916),
 ('as', 0.9984835982322693),
 ('love', 0.9984461069107056),
 ('amp', 0.9984456300735474),
 ('well', 0.9984280467033386),
 ('night', 0.9984150528907776),
 ('god', 0.9983620643615723),
 ('its', 0.9983481764793396)]

In [28]:
model.wv['good'].shape ## means 100 dimesion is created

(100,)

Average Word2VEc

In [31]:
def avg_word2vec(doc):
  return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key])

In [32]:
from tqdm import tqdm

In [33]:
X=[]

for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 11124/11124 [00:00<00:00, 35577.56it/s]


In [34]:
X

[0.0029670596,
 0.0035939803,
 0.0034165138,
 0.001519075,
 0.0024457625,
 0.0032480755,
 0.0043663243,
 0.0040844055,
 0.004245602,
 0.005944816,
 0.0039488976,
 0.0035773434,
 0.004366,
 0.0041299476,
 0.0035804391,
 0.0026352867,
 0.0008866529,
 0.0041739247,
 0.0036268607,
 0.0013421496,
 0.002722503,
 0.0038887493,
 0.0033202828,
 0.0030766246,
 0.004434997,
 0.004118367,
 0.0036363753,
 0.003967396,
 0.0024494939,
 0.0020448577,
 0.004323588,
 0.0031462153,
 0.00457726,
 0.004001688,
 0.0051878546,
 0.0038878333,
 nan,
 0.0030456856,
 0.004736719,
 0.0046992567,
 0.0028664565,
 0.003813619,
 0.0032762093,
 0.0040629,
 0.0035326723,
 0.0029777323,
 0.0027065226,
 0.0045021195,
 0.0033510923,
 0.0040988987,
 0.004278135,
 0.0032145146,
 0.003874075,
 nan,
 0.0053758863,
 0.0034538326,
 0.0031501662,
 0.0029560202,
 0.0021515114,
 0.0033491135,
 0.0031624981,
 0.0040537124,
 0.004044743,
 0.005609955,
 0.00458437,
 0.0041164756,
 0.006243728,
 0.0054213754,
 nan,
 0.005553129,
 0.00

In [35]:
## independent feature
X_new =np.array(X)

X_new.shape

(11124,)

In [36]:
## dependent Features
## output features
y = pd.get_dummies(df['label'])
y = y.iloc[:,0].values

In [37]:
y.shape

(5572,)