In [None]:
# For installing nltk and spacy with correct numpy version
# !pip install --upgrade pip
# !pip install --upgrade --force-reinstall numpy
# !pip install --upgrade --force-reinstall scikit-learn nltk spacy
# !python -m spacy download en_core_web_sm
# !pip install gensim nltk


In [6]:
import nltk
import spacy
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [7]:
df = pd.read_csv("/kaggle/input/sms-spam-classification-using-word2vec/spam.csv", encoding = "latin1", header = 0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [8]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis =  1, inplace= True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [10]:
df.rename({'v1': 'labels', 'v2': 'message'}, axis = 1, inplace = True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   labels   5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [12]:
y = np.where(df['labels'] == 'spam', 1, 0)

In [13]:
y.shape

(5572,)

In [14]:
nlp = spacy.load('en_core_web_sm')

def smart_case(sentence):
    doc = nlp(sentence)
    processed = []

    for token in doc:
        if token.ent_type_ or token.pos_ in ['PROPN'] or token.text.isupper(): # Always use '_'.
            processed.append(token.text)
        else:
            processed.append(token.text.lower())

    return ' '.join(processed)
    

In [15]:
import re

lemmatizer = WordNetLemmatizer() # Using this with Word2Vec is only applicabel if the dataset is small
corpus = []
final_corpus = []

for i in range(len(df['message'])):
    message = df['message'][i]
    message = re.sub('[^a-zA-Z0-9\s!$#%@=&]', ' ', message)
    message = smart_case(message)
    tokens = nltk.word_tokenize(message)
    message = [lemmatizer.lemmatize(word) for word in tokens if word not in set(stopwords.words('english'))]
    corpus.append(' '.join(message))


final_corpus = [nltk.word_tokenize(doc) for doc in corpus] # Tokenizing again since we need a list of words for each sentence.

final_corpus[:5]
    
    
    

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'Cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  '2',
  'wkly',
  'comp',
  'win',
  'FA',
  'Cup',
  'final',
  'tkts',
  '21st',
  'May',
  '2005',
  'text',
  'FA',
  '87121',
  'receive',
  'entry',
  'question',
  'std',
  'txt',
  'rate',
  'T',
  '&',
  'C',
  'apply',
  '08452810075over18'],
 ['U', 'dun', 'say', 'early', 'hor', 'U', 'c', 'already', 'say'],
 ['nah', 'I', 'think', 'go', 'usf', 'life', 'around', 'though']]

In [16]:
w2v_model = Word2Vec(final_corpus, window = 5, vector_size = 100, min_count = 2, workers = 4) # use 50-100 for <10k, 100-200 for 10k-1M, and 200-300 for wikipedia, blogs.

def get_average_vectors(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis = 0) if len(vectors) > 0 else np.zeros(model.vector_size) # If the length of the vectors is , 0 then it means the words in the sentence are not understood by the w2v_model or those words may be rare which may not be usefull(noise). That's why I am filling the vector with zeros in order to context it as 'no information' to the classifier_model.

X = np.array([get_average_vectors(doc, w2v_model) for doc in final_corpus]) # This will take the average arrays of all the dimension of all words which will happen for each sentence.
   


# Model Training

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42, test_size = 0.2)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(f"Classification Report:\n   {classification_report(y_test, y_pred)}\n\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("Confusion Matrix:\n    ", confusion_matrix(y_test, y_pred))

  The metric, modifier and average arguments are used only for determining
  The metric, modifier and average arguments are used only for determining
  The metric, modifier and average arguments are used only for determining


Classification Report:
                 precision    recall  f1-score   support

           0       0.87      1.00      0.93       965
           1       0.00      0.00      0.00       150

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115


Accuracy: 0.8654708520179372
Confusion Matrix:
     [[965   0]
 [150   0]]


## Reasons Behind Bad Results:
- ### We are getting bad results for both balanced and imbalanced data when compared to using TF-IDF(in previous project) because, the averaging of words in a sentence distorts the meaning of rare words.

- ### Eg., "Win a free iphone now." is a sentence and words like "win", "free" are rare/unique. When the model averages these words vector, it losts the meaning/context of rare words, causing lack of lack of contextual understanding in model.

## Improvements:
- ### W2V works well in large dataset.