#### Import Trained Word2Vector model

In [1]:
from gensim.models import Word2Vec, KeyedVectors

In [4]:
wv=KeyedVectors.load_word2vec_format("data/GoogleNews-vectors-negative300.bin", binary=True)

In [6]:
vec_king = wv['King']

In [8]:
vec_king

array([-0.00350952,  0.01623535, -0.08154297,  0.12792969,  0.11230469,
       -0.00534058,  0.03063965,  0.04931641,  0.22070312,  0.07373047,
       -0.13769531,  0.16210938,  0.02148438, -0.09375   , -0.12792969,
       -0.12402344, -0.11132812,  0.11816406, -0.07861328,  0.25390625,
        0.01794434,  0.14160156,  0.0612793 , -0.08691406,  0.07763672,
        0.05175781, -0.24609375, -0.17578125,  0.14746094,  0.06640625,
       -0.03833008, -0.09033203, -0.07226562,  0.09375   , -0.18847656,
        0.06347656,  0.24121094,  0.00714111, -0.30273438, -0.02478027,
       -0.09619141, -0.30859375, -0.06054688,  0.22167969,  0.07763672,
        0.05834961,  0.15527344, -0.13476562, -0.00341797, -0.13964844,
       -0.02905273,  0.03833008, -0.15332031, -0.20996094,  0.21679688,
        0.01171875, -0.078125  ,  0.00402832, -0.23535156, -0.10400391,
        0.08837891,  0.25976562,  0.02709961,  0.01123047,  0.12988281,
       -0.11914062, -0.07861328, -0.04736328, -0.06591797,  0.07

In [10]:
wv.most_similar('football')

[('soccer', 0.7313548922538757),
 ('fooball', 0.7139959335327148),
 ('Football', 0.7124834060668945),
 ('basketball', 0.668246865272522),
 ('footbal', 0.6649289727210999),
 ('athletics', 0.6265192627906799),
 ('gridiron', 0.6191604733467102),
 ('baseball', 0.6162001490592957),
 ('footballl', 0.6069177389144897),
 ('sports', 0.5927178859710693)]

In [12]:
wv.most_similar('Python')

[('Jython', 0.6152505874633789),
 ('Perl_Python', 0.5710949897766113),
 ('IronPython', 0.5704678297042847),
 ('scripting_languages', 0.5695092082023621),
 ('PHP_Perl', 0.5687724947929382),
 ('Java_Python', 0.5681070685386658),
 ('PHP', 0.5660915970802307),
 ('Python_Ruby', 0.5632462501525879),
 ('Visual_Basic', 0.5603480339050293),
 ('Perl', 0.5530890822410583)]

In [16]:
wv.similarity('Java', 'Python')

0.4618971

#### Create a bag of word model from spam dataset

In [47]:
import pandas as pd
spam_df = pd.read_csv('data/spam.csv', usecols=['v1', 'v2'], encoding='latin1')
spam_df.rename(columns={'v1': 'Label', 'v2': 'Message'}, inplace=True)

In [51]:
spam_df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [63]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\Uthanda
[nltk_data]     Ramu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [65]:
ps = PorterStemmer()

In [67]:
corpus = []
for i in range(len(spam_df)):
    review = re.sub('[^a-zA-Z]', ' ', spam_df["Message"][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)

#### Using CounterVectorizer and TfidfVectorizer for tokenization

In [54]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer(max_features=2500, ngram_range=(1,2))
tfidf = TfidfVectorizer(max_features=2500, ngram_range=(1,2))

In [71]:
x_cv = cv.fit_transform(corpus).toarray()
x_tfidf = tfidf.fit_transform(corpus).toarray()

In [77]:
y = pd.get_dummies(spam_df['Label'])
y = y.iloc[:, 0].values

In [79]:
y.shape

(5572,)

#### Using CounterVectorizer and TfidfVectorizer tokens for training

In [87]:
from sklearn.model_selection import train_test_split
x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(x_cv, y, test_size=0.2, random_state=21)
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(x_tfidf, y, test_size=0.2, random_state=23)

In [91]:
#Training
from sklearn.naive_bayes import MultinomialNB
spam_detect_model_cv = MultinomialNB().fit(x_train_cv, y_train_cv)
spam_detect_model_tfidf = MultinomialNB().fit(x_train_tfidf, y_train_tfidf)

In [93]:
#Prediction
y_pred_cv = spam_detect_model_cv.predict(x_test_cv)
y_pred_tfidf = spam_detect_model_tfidf.predict(x_test_tfidf)

In [101]:
#Metrics evaluation on cv tokens
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test_cv, y_pred_cv))
print(classification_report(y_test_cv, y_pred_cv))

0.9847533632286996
              precision    recall  f1-score   support

       False       0.94      0.94      0.94       140
        True       0.99      0.99      0.99       975

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [103]:
#Metrics evaluation on tfidf tokens
print(accuracy_score(y_test_tfidf, y_pred_tfidf))
print(classification_report(y_test_tfidf, y_pred_tfidf))

0.9802690582959641
              precision    recall  f1-score   support

       False       0.99      0.85      0.91       136
        True       0.98      1.00      0.99       979

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

