# Word2Vector Embeddings

Using Word2Vector embeddings from the word2vec-google-news-300 to find a representative word embedding of trivial and non-trivial messages (from the training sentences). Classify each new sentence based on the Euclidean distance from the representative embeddings.

In [4]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim.downloader as api
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# word2vec-google-news-300 model used for embeddings
wv = api.load('word2vec-google-news-300')

In [None]:
checkmate_messages_df = pd.read_csv('../../src/data/CheckMate_Messages_Table.csv')

In [3]:
# Converting a sentence to a 300-dimensional vector 
# Each word -> 300-dimensional vector -> average all vectors in a sentence

# Note: Ignore any words not in corpus
# Note: Return None if no matching words in sentence 

def sentence_2_vector(sentence):
    message_vectors = []
    is_na = True
    for i in range(len(sentence)):
        if wv.__contains__(sentence[i]):
            message_vectors.append(wv.word_vec(sentence[i]))
            is_na = False
        else:
            pass
    if not is_na:
        ave_vector = np.average(message_vectors, axis=0, keepdims=True)
        return ave_vector
    else:
        return None

In [8]:
# Prepping data
checkmate_messages_df = pd.read_csv('../../src/data/CheckMate_Messages_Table.csv')
checkmate_messages_df.dropna()
checkmate_messages_df['is_trivial'] = (checkmate_messages_df['taggedCategory']=='Trivial')
# Not all sentences are imported as strings
checkmate_messages_df['text'] = checkmate_messages_df['text'].astype('str')

In [9]:
# Only retrieving text and is_trivial parameters
df = checkmate_messages_df[['text','is_trivial']]

In [10]:
# Train test split at 50%
df_train, df_test= train_test_split(
        df, test_size=0.50, random_state=42)

In [11]:
# Creating representative vector embeddings for is_trivial and not_trivial
df_train_is_trivial = df_train[df_train['is_trivial']==True]
df_train_not_trivial = df_train[df_train['is_trivial']==False]

vectors_train_is_trivial = df_train_is_trivial['text'].map(sentence_2_vector).dropna()
vectors_train_not_trivial = df_train_not_trivial['text'].map(sentence_2_vector).dropna()

# df_train_is_trivial.dropna(inplace=True)
# df_train_not_trivial.dropna(inplace=True)

  message_vectors.append(wv.word_vec(sentence[i]))


In [13]:
ave_vector_is_trivial = np.average(vectors_train_is_trivial)
ave_vector_not_trivial = np.average(vectors_train_not_trivial)

In [15]:
# Classify the sentences based on euclidean distances from the representative vectors
def classify_is_trivial(v):
    d_is_trivial = np.linalg.norm(ave_vector_is_trivial - v)
    d_not_trivial = np.linalg.norm(ave_vector_not_trivial - v)
    # print(f'distance from trivial:{d_is_trivial}, not trivial:{d_not_trivial}')
    if d_is_trivial<= d_not_trivial:
        return True
    else:
        return False

In [17]:
# Create a series containing the average vector represenations of each sentence
vectors_test = df_test['text'].map(sentence_2_vector).dropna()

  message_vectors.append(wv.word_vec(sentence[i]))


In [18]:
# Create prediction for each sentence in series 
prediction = (vectors_test.map(classify_is_trivial))

In [19]:
# Concat the prediction and true values for the comparison
predicted_vs_output = pd.concat([df_test['is_trivial'], prediction], axis=1)
predicted_vs_output = predicted_vs_output.rename(columns={'is_trivial':'target', 'text':'prediction'})

In [24]:
# Calculate the recall of this methodology
true_positive = len(predicted_vs_output[(predicted_vs_output['target']==True) & (predicted_vs_output['prediction']==True)])
false_negative = len(predicted_vs_output[(predicted_vs_output['target']==True) & (predicted_vs_output['prediction']==False)])
false_positive = len(predicted_vs_output[(predicted_vs_output['target']==False) & (predicted_vs_output['prediction']==True)])

recall = true_positive/(true_positive+false_negative)

In [26]:
print(recall)
print(true_positive)
print(false_negative)
print(false_positive)

0.6666666666666666
8
4
1
