# Word2Vector Embeddings

Using Word2Vector embeddings from the word2vec-google-news-300 to find a representative word embedding of trivial and non-trivial messages (from the training sentences). Classify each new sentence based on the Euclidean distance from the representative embeddings.

In [5]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# word2vec-google-news-300 model used for embeddings
wv = api.load('word2vec-google-news-300')

In [7]:
checkmate_messages_df = pd.read_csv('../../src/data/CheckMate_Messages_Table.csv')

In [8]:
# Converting a sentence to a 300-dimensional vector 
# Each word -> 300-dimensional vector -> average all vectors in a sentence

# Note: Ignore any words not in corpus
# Note: Return None if no matching words in sentence 

def sentence_2_vector(sentence):
    message_vectors = []
    is_na = True
    for i in range(len(sentence)):
        if wv.__contains__(sentence[i]):
            message_vectors.append(wv.word_vec(sentence[i]))
            is_na = False
        else:
            pass
    if not is_na:
        ave_vector = np.average(message_vectors, axis=0, keepdims=True)
        return ave_vector
    else:
        return None

In [9]:
# Prepping data
checkmate_messages_df = pd.read_csv('../../src/data/CheckMate_Messages_Table.csv')
checkmate_messages_df.dropna()
checkmate_messages_df['is_trivial'] = (checkmate_messages_df['taggedCategory']=='Trivial')
# Not all sentences are imported as strings
checkmate_messages_df['text'] = checkmate_messages_df['text'].astype('str')

In [10]:
# Only retrieving text and is_trivial parameters
df = checkmate_messages_df[['text','is_trivial']]

In [22]:
# Train test split at 50%
df_train, df_test= train_test_split(
        df, test_size=0.50, random_state=42, stratify=df['is_trivial'])

In [23]:
# Creating representative vector embeddings for is_trivial and not_trivial
df_train_is_trivial = df_train[df_train['is_trivial']==True]
df_train_not_trivial = df_train[df_train['is_trivial']==False]

vectors_train_is_trivial = df_train_is_trivial['text'].map(sentence_2_vector).dropna()
vectors_train_not_trivial = df_train_not_trivial['text'].map(sentence_2_vector).dropna()

# df_train_is_trivial.dropna(inplace=True)
# df_train_not_trivial.dropna(inplace=True)

  message_vectors.append(wv.word_vec(sentence[i]))


In [24]:
ave_vector_is_trivial = np.average(vectors_train_is_trivial)
ave_vector_not_trivial = np.average(vectors_train_not_trivial)

In [25]:
# Classify the sentences based on euclidean distances from the representative vectors
def classify_is_trivial(v):
    d_is_trivial = np.linalg.norm(ave_vector_is_trivial - v)
    d_not_trivial = np.linalg.norm(ave_vector_not_trivial - v)
    # print(f'distance from trivial:{d_is_trivial}, not trivial:{d_not_trivial}')
    if d_is_trivial<= d_not_trivial:
        return True
    else:
        return False

In [26]:
# Create a series containing the average vector represenations of each sentence
vectors_test = df_test['text'].map(sentence_2_vector).dropna()
print(vectors_test)

77    [[-0.12804633, 0.07846623, -0.0028197807, 0.12...
31    [[0.0596852, -0.13528802, 0.097473145, -0.0677...
61    [[-0.19146292, 0.111301966, 0.002633231, 0.150...
67    [[-0.15674461, 0.10766732, -0.0025885978, 0.13...
12    [[-0.17181522, 0.1035043, -0.0022740315, 0.133...
16    [[-0.1640625, 0.09762974, -0.023943044, 0.1348...
30    [[-0.11051596, 0.062171426, 0.02830754, 0.1142...
58    [[-0.16927543, 0.061089292, 0.0279886, 0.13594...
13    [[-0.17247473, 0.113895744, 0.0021641373, 0.12...
43    [[-0.25672743, 0.13237847, -0.003458659, 0.140...
17    [[-0.18310767, 0.080474116, 0.004465126, 0.141...
38    [[-0.14813232, 0.055023193, -0.03371175, 0.152...
32    [[-0.16450882, 0.095428996, 0.014264863, 0.129...
40    [[-0.21744792, 0.12727864, 0.009562175, 0.1368...
66    [[-0.16306363, 0.088267356, 0.013311978, 0.138...
68    [[-0.14635438, 0.09423035, 0.022300415, 0.1275...
44    [[-0.15293118, 0.09489967, 0.024162214, 0.1230...
27    [[-0.16092487, 0.10481708, 0.0009290307, 0

  message_vectors.append(wv.word_vec(sentence[i]))


In [27]:
# Create prediction for each sentence in series 
prediction = (vectors_test.map(classify_is_trivial))
print(prediction)

77    False
31    False
61    False
67    False
12    False
16    False
30    False
58    False
13    False
43    False
17    False
38    False
32    False
40    False
66    False
68    False
44    False
27    False
48    False
70     True
26    False
1     False
41     True
23    False
2     False
53    False
35    False
76    False
63    False
8     False
55    False
72    False
29    False
24    False
62    False
18    False
69     True
51    False
25    False
11    False
46     True
85    False
21    False
Name: text, dtype: bool


In [28]:
# Concat the prediction and true values for the comparison
predicted_vs_output = pd.concat([df_test['text'], df_test['is_trivial'], prediction], axis=1)
# predicted_vs_output = predicted_vs_output.rename(columns={'is_trivial':'target', 'text':'prediction'})
predicted_vs_output.columns = ['text', 'target','prediction']

print(predicted_vs_output)

                                                 text  target  prediction
77  Fast Cash Service                \n3k 268 x12m...   False       False
31  中国确实真的有5千年历史啊！这是对的啊！谁人的屁是香的啊！老外的屁也是臭味熏天的啊！他们还有...   False       False
61                               can u read pictures?    True       False
67  Hello, We have noticed your employment history...   False       False
12  Hey Jolyn! We haven't seen you in the studio y...   False       False
16  So...... Starts walking!!!\n\nhttps://www.thev...   False       False
30  Date: 24 March 2023 at 2:24:03 pm AEDT\nTo: \n...   False       False
58  Hi good evening. How u get my Number ? My side...   False       False
13  Hi all, especially for our younger colleagues ...   False       False
43                                     is this a scam    True       False
17  Suck lozenges n  rub tiger balm on nose , last...    True       False
38                             9/11 attack was a scam   False       False
32  <ADV> Singapore's medical cost is 

In [29]:
# print(predicted_vs_output)
print(predicted_vs_output[(predicted_vs_output['target']==False) & (predicted_vs_output['prediction']==True)])

                                                 text  target  prediction
41  Hello \nsorry to bother you, is this Kevin Wong ?   False        True


In [30]:
# Calculate the recall of this methodology
true_positive = len(predicted_vs_output[(predicted_vs_output['target']==True) & (predicted_vs_output['prediction']==True)])
true_negative = len(predicted_vs_output[(predicted_vs_output['target']==False) & (predicted_vs_output['prediction']==False)])

false_negative = len(predicted_vs_output[(predicted_vs_output['target']==True) & (predicted_vs_output['prediction']==False)])
false_positive = len(predicted_vs_output[(predicted_vs_output['target']==False) & (predicted_vs_output['prediction']==True)])

recall = true_positive/(true_positive+false_negative)
accuracy = (true_positive+true_negative)/(true_positive+true_negative+false_positive+false_negative)

In [31]:
print(recall)
print(accuracy)

print(true_positive)
print(true_negative)
print(false_negative)
print(false_positive)

0.25
0.7674418604651163
3
30
9
1


Results:
Recall = 66.6% 
Accuracy = 87.8%

In [33]:
# However in the context of this case we care about how many were classified correct over all classified correctly + those we flag as trivial but actually not trivial

recall_contextualised = true_negative/(true_negative+false_positive)
print(f'Contextualised recall:{recall_contextualised}')

Contextualised recall:0.967741935483871


Room for improvement:
1. Tokens are generated by a simple split(,) function. Using a tokeniser trained on the english might produce better results
2. Using a encoder like bert trained on a larger corpus could give better results than word2vec