## Spam Classification

In [34]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
messages = pd.read_csv('dataset/smsspamcollection/SMSSpamCollection', sep='\t', names=['label', 'message'])

In [7]:
len(messages)

5572

In [10]:
messages['message'].loc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [3]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Remove unnecessary words

In [35]:
import re
lemmatizer = WordNetLemmatizer()

In [39]:
corpus = []

for i in range(len(messages)):
    message = re.sub('[^a-zA-Z]', ' ', messages['message'].iloc[i])
    message = message.lower().strip()
    words = nltk.word_tokenize(message)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    message = ' '.join(words)
    corpus.append(message)
    

In [43]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

## Text Vectorization Using Bag Of Words

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [66]:
y=pd.get_dummies(messages['label'])
y=y.iloc[:,1].values
y

array([False, False,  True, ..., False, False, False])

In [67]:
# Train, Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state= 0)

In [68]:
# Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [69]:
# Prediction
y_pred = spam_detect_model.predict(X_test)
y_pred

array([False,  True, False, ..., False,  True, False])

In [70]:
# Model Accuracy
from sklearn.metrics import accuracy_score, classification_report
score = accuracy_score(y_test, y_pred)
score

0.9838565022421525

In [72]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       957
        True       0.94      0.95      0.94       158

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



## Text Vectorization Using TFIDF

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()

In [74]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state= 0)

In [75]:
# Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [76]:
# Prediction
y_pred = spam_detect_model.predict(X_test)

In [77]:
# Accuracy
score = accuracy_score(y_test, y_pred)
score

0.9721973094170404

In [78]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       False       1.00      0.97      0.98       986
        True       0.81      1.00      0.89       129

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115



## Text Vectorization Using word2Vec

In [80]:
from nltk import sent_tokenize
import gensim
from gensim.utils import simple_preprocess

In [81]:
words = []
for message in corpus:
    sentences = sent_tokenize(message)
    for sentence in sentences:
        words.append(simple_preprocess(sentence))

In [83]:
# Gensim word2Vec Model
model = gensim.models.Word2Vec(
    window=5,
    min_count=2,
    vector_size=300
)

In [84]:
model.build_vocab(words)

In [87]:
print(model.epochs)
print(model.corpus_count)

5
5564


In [88]:
model.train(words, total_examples=model.corpus_count, epochs=model.epochs)

(203067, 238325)

In [89]:
model.wv.similar_by_word('kid')

[('box', 0.9986541271209717),
 ('like', 0.9986386299133301),
 ('today', 0.9986283779144287),
 ('co', 0.9986222982406616),
 ('ask', 0.9986222386360168),
 ('ur', 0.9986168742179871),
 ('work', 0.9986156225204468),
 ('day', 0.9986129403114319),
 ('start', 0.998611330986023),
 ('hav', 0.9986103177070618)]