# Custom Word Vectors

## Custom word vectors: sum and average

In [73]:
import json
import time
import re

import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec, KeyedVectors
from gensim.utils import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from spacy.tokenizer import Tokenizer

N_NEIGHBOURS = 100

In [74]:
def load_dataset(path: str) -> pd.DataFrame:
    """
    Loads dataset

    :param path:
        dataset path
    :return:
        dataframe
    """
    df = pd.read_csv(path, sep="\t")
    df.drop(columns=df.columns[0], axis=1, inplace=True)

    return df

In [75]:
def vectorize(
    wv: KeyedVectors, document: list[str], tfidf_vectorizer: TfidfVectorizer = None
) -> np.ndarray:
    """
    Transforms documents to vectors

    :param wv: KeyedVectors
        vectors of all words from vocabulary
    :param document:
        input document from corpus
    :param tfidf_vectorizer: TfidfVectorizer
        TF-IDF vectorizer
    :return:
        vector representation of question
    """
    if SENTENCE_VECTOR_WEIGHT == "idf":
        idf = np.asarray(
            [
                tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_[token]]
                for token in document
            ]
        )
        document = np.asarray([wv[token] for token in document])
        document = idf[:, np.newaxis] * document
    elif SENTENCE_VECTOR_WEIGHT == "pos":
        doc = nlp(" ".join(document))
        pos = np.asarray([POS.get(token.pos_, 1.0) for token in doc])

        document = np.asarray([wv[token] for token in document])
        document = pos[:, np.newaxis] * document
    elif SENTENCE_VECTOR_WEIGHT == "ner":
        doc = nlp(" ".join(document))
        ner = np.asarray([NER.get(token.ent_type_, 1.0) for token in doc])

        document = np.asarray([wv[token] for token in document])
        document = ner[:, np.newaxis] * document
    elif SENTENCE_VECTOR_WEIGHT == "pos+ner":
        doc = nlp(" ".join(document))
        pos = np.asarray([POS.get(token.pos_, 1.0) for token in doc])
        ner = np.asarray([NER.get(token.ent_type_, 1.0) for token in doc])
        pos_ner = pos + ner

        document = np.asarray([wv[token] for token in document])
        document = pos_ner[:, np.newaxis] * document
    else:
        document = np.asarray([wv[token] for token in document])

    if SENTENCE_VECTOR_STRATEGY == "sum":
        document = np.sum(document, axis=0)
    elif SENTENCE_VECTOR_STRATEGY == "average":
        document = np.mean(document, axis=0)
    else:
        raise ValueError(
            f"Strategy {SENTENCE_VECTOR_STRATEGY} is not supported. Try 'sum' or 'average'"
        )

    return document

In [76]:
def check_performance(
    wv: KeyedVectors,
    knn: NearestNeighbors,
    corpus: list[list[str]],
    tfidf_vectorizer: TfidfVectorizer = None
) -> float:
    """
    Calculate performance of finding similar questions

    :param wv: KeyedVectors
        vectors of all words from vocabulary
    :param knn: NearestNeighbors
        K-nearest neighbors
    :param corpus: list
        input corpus of documents
    :param tfidf_vectorizer: TfidfVectorizer
        TF-IDF vectorizer
    :return:
        score (lesser is better)
    """
    with open("../../../data/test_questions_json.json") as json_file:
        json_data = json.load(json_file)

    test_questions = json_data["question"]
    original = json_data["original"]

    test_questions = [list(tokenize(tq.lower())) for tq in test_questions]
    for i, tq in enumerate(test_questions):
        test_questions[i] = list(filter(lambda x: x in wv.index_to_key, tq))
    test_questions = np.asarray(
        [vectorize(wv, tq, tfidf_vectorizer) for tq in test_questions]
    )
    _, indices = knn.kneighbors(test_questions)

    original = [list(tokenize(o.lower())) for o in original]
    indices_original = np.asarray([corpus.index(o) for o in original])

    rank = np.where(indices == indices_original[:, None])[1]
    penalization = (indices_original.shape[0] - rank.shape[0]) * 2 * knn.n_neighbors
    score = (rank.sum() + penalization) / indices_original.shape[0]

    return score

In [77]:
df = load_dataset("../../../data/insurance_qna_dataset.csv")

questions = np.unique(df.iloc[:, 0].to_numpy())
questions = [list(tokenize(question.lower())) for question in questions]

wv = KeyedVectors.load("word2vec.wordvectors", mmap="r")

### Custom word vectors: sum

In [78]:
SENTENCE_VECTOR_STRATEGY = "sum"
SENTENCE_VECTOR_WEIGHT = "none"

vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 70.13%


### Custom word vectors: average

In [79]:
SENTENCE_VECTOR_STRATEGY = "average"
SENTENCE_VECTOR_WEIGHT = "none"

vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 70.13%


## Custom word vectors combined with IDF

In [80]:
tfidf_vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w+\\b")
tfidf_vectorizer.fit_transform([" ".join(question) for question in questions])

<16896x3631 sparse matrix of type '<class 'numpy.float64'>'
	with 122954 stored elements in Compressed Sparse Row format>

### Custom word vectors combined with IDF: sum

In [81]:
SENTENCE_VECTOR_STRATEGY = "sum"
SENTENCE_VECTOR_WEIGHT = "idf"

vectorized_questions = np.asarray([vectorize(wv, question, tfidf_vectorizer) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions, tfidf_vectorizer)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 69.44%


### Custom word vectors combined with IDF: average

In [82]:
SENTENCE_VECTOR_STRATEGY = "average"
SENTENCE_VECTOR_WEIGHT = "idf"

vectorized_questions = np.asarray([vectorize(wv, question, tfidf_vectorizer) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions, tfidf_vectorizer)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 69.44%


## Custom word vectors combined with POS/NER

In [83]:
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r"\S+").match)

### Custom word vectors combined with POS: sum

In [84]:
SENTENCE_VECTOR_STRATEGY = "sum"
SENTENCE_VECTOR_WEIGHT = "pos"

POS = {"NOUN": 5.0, "PROPN": 6.0, "VERB": 2.0, "ADJ": 4.0}
vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 70.30%


### Custom word vectors combined with POS: average

In [85]:
SENTENCE_VECTOR_STRATEGY = "average"
SENTENCE_VECTOR_WEIGHT = "pos"

POS = {"NOUN": 5.0, "PROPN": 6.0, "VERB": 2.0, "ADJ": 4.0}
vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 70.30%


### Custom word vectors combined with NER: sum

In [86]:
SENTENCE_VECTOR_STRATEGY = "sum"
SENTENCE_VECTOR_WEIGHT = "ner"

NER = {"MONEY": 6.0, "CARDINAL": 5.0, "DATE ": 4.0, "FAC ": 4.0}
vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 70.12%


### Custom word vectors combined with NER: average

In [87]:
SENTENCE_VECTOR_STRATEGY = "average"
SENTENCE_VECTOR_WEIGHT = "ner"

NER = {"MONEY": 6.0, "CARDINAL": 5.0, "DATE ": 4.0, "FAC ": 4.0}
vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 70.12%


### Custom word vectors combined with POS+NER: sum

In [88]:
SENTENCE_VECTOR_STRATEGY = "sum"
SENTENCE_VECTOR_WEIGHT = "pos+ner"

POS = {"NOUN": 5.0, "PROPN": 6.0, "VERB": 2.0, "ADJ": 4.0}
NER = {"MONEY": 6.0, "CARDINAL": 5.0, "DATE ": 4.0, "FAC ": 4.0}
vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 78.88%


### Custom word vectors combined with POS+NER: average

In [89]:
SENTENCE_VECTOR_STRATEGY = "average"
SENTENCE_VECTOR_WEIGHT = "pos+ner"

POS = {"NOUN": 5.0, "PROPN": 6.0, "VERB": 2.0, "ADJ": 4.0}
NER = {"MONEY": 6.0, "CARDINAL": 5.0, "DATE ": 4.0, "FAC ": 4.0}
vectorized_questions = np.asarray([vectorize(wv, question) for question in questions])
knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

score = check_performance(wv, knn, questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 78.88%
