# Term Frequency

## Term Frequency with different distance metrics

In [149]:
import json

import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

# Default number of neighbours requested from every NearestNeighbors model below;
# also used to scale the miss penalty and the printed percentage score.
N_NEIGHBOURS = 100

In [150]:
def check_performance(vectorizer: CountVectorizer, knn: NearestNeighbors, vectorized_questions: csr_matrix) -> float:
    """
    Calculate performance of finding similar questions

    The test set is read from ``../../data/test_questions_json.json``. Its
    ``"question"`` and ``"original"`` lists are assumed to be aligned by
    position, and every original is assumed to match exactly one row of
    ``vectorized_questions`` (TODO confirm against the data file).

    For each test question the 0-based position of its original among the
    neighbours returned by ``knn`` is added up; an original that is not among
    the neighbours is charged ``2 * knn.n_neighbors`` instead. The total is
    divided by the number of matched originals.

    :param vectorizer: CountVectorizer
        fitted term frequency vectorizer
    :param knn: NearestNeighbors
        K-nearest neighbors; must be fitted on the rows of ``vectorized_questions``
        because neighbour indices are compared with row numbers of that matrix
    :param vectorized_questions: csr_matrix
        input questions transformed with count vectorizer
    :return:
        score (lower is better)
    """
    with open("../../data/test_questions_json.json", encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    test_questions = json_data["question"]
    original = json_data["original"]

    test_vectors = vectorizer.transform(test_questions)
    _, indices = knn.kneighbors(test_vectors.toarray())

    original_vectors = vectorizer.transform(original).toarray()
    corpus = vectorized_questions.toarray()
    # Look up the corpus row of each original one at a time: comparing all
    # originals against all rows in a single broadcast would allocate an
    # (originals x questions x vocabulary) boolean array.
    matches = [np.flatnonzero((corpus == row).all(axis=1)) for row in original_vectors]
    indices_original = np.concatenate(matches) if matches else np.empty(0, dtype=np.intp)

    # Column of the hit inside each neighbour list, i.e. the 0-based rank.
    rank = np.where(indices == indices_original[:, None])[1]
    # Take the neighbour count from the model itself rather than from a global,
    # so the penalty always agrees with the number of neighbours searched.
    n_missed = indices_original.shape[0] - rank.shape[0]
    penalization = n_missed * 2 * knn.n_neighbors
    score = (rank.sum() + penalization) / indices_original.shape[0]

    return score

In [151]:
# Load the tab-separated Q&A dataset and discard its first column
# (presumably a saved row index - verify against the CSV).
df = pd.read_csv("../../data/insurance_qna_dataset.csv", sep="\t")
df.drop(columns=[df.columns[0]], inplace=True)

# Column 0 of the remaining frame is used as the question text; duplicates are
# removed (np.unique also sorts) before the vocabulary is built.
questions = np.unique(df.iloc[:, 0].to_numpy())

# Plain unigram term-frequency model fitted on the unique questions.
vectorizer = CountVectorizer(lowercase=True)
vectorized_questions = vectorizer.fit_transform(questions)

### Euclidean metric

In [152]:
# Euclidean distance on the dense term-count matrix.
knn = NearestNeighbors(metric="euclidean", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions.toarray())

# check_performance returns a mean rank (lower is better); the print rescales
# it by the maximum penalty 2 * N_NEIGHBOURS into a "higher is better" percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 78.09%


### Manhattan (cityblock) metric

In [153]:
# Manhattan (cityblock) distance on the dense term-count matrix.
knn = NearestNeighbors(metric="cityblock", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions.toarray())

# check_performance returns a mean rank (lower is better); the print rescales
# it by the maximum penalty 2 * N_NEIGHBOURS into a "higher is better" percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 77.89%


### Cosine metric

In [154]:
# Cosine distance on the dense term-count matrix.
knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions.toarray())

# check_performance returns a mean rank (lower is better); the print rescales
# it by the maximum penalty 2 * N_NEIGHBOURS into a "higher is better" percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 79.48%


## Comparison between different numbers of neighbours

In [155]:
N_NEIGHBOURS_GRID = (1, 5, 10, 25, 50, 100, 150, 200, 300, 400, 500, 10000, 16000)

# The dense matrix does not depend on the neighbour count, so build it once
# instead of re-densifying the whole corpus on every iteration.
dense_questions = vectorized_questions.toarray()

try:
    # The loop variable is the module-level N_NEIGHBOURS itself, so any code
    # that reads that global while scoring (and the print below) sees the
    # value currently under test.
    for N_NEIGHBOURS in N_NEIGHBOURS_GRID:
        knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(dense_questions)

        score = check_performance(vectorizer, knn, vectorized_questions)
        print(f"Number of neighbours: {N_NEIGHBOURS} | Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")
finally:
    # Put the default back even when a run fails or is interrupted, so later
    # cells are not silently evaluated with a leftover grid value.
    N_NEIGHBOURS = 100

Number of neighbours: 1 | Score: 45.00%
Number of neighbours: 5 | Score: 55.00%
Number of neighbours: 10 | Score: 61.83%
Number of neighbours: 25 | Score: 69.07%
Number of neighbours: 50 | Score: 73.43%
Number of neighbours: 100 | Score: 79.48%
Number of neighbours: 150 | Score: 82.63%
Number of neighbours: 200 | Score: 83.77%
Number of neighbours: 300 | Score: 84.48%
Number of neighbours: 400 | Score: 87.17%
Number of neighbours: 500 | Score: 88.67%
Number of neighbours: 10000 | Score: 99.47%
Number of neighbours: 16000 | Score: 99.67%


## Term Frequency with N-grams

In [156]:
# Rebuild the corpus of unique questions so this section is self-contained.
questions = np.unique(df.iloc[:, 0].to_numpy())

# Count unigrams and bigrams instead of single words only.
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 2))
vectorized_questions = vectorizer.fit_transform(questions)

knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions.toarray())

# Mean rank (lower is better) rescaled into a "higher is better" percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 78.88%


## Term Frequency with Lemmatization

In [157]:
def lemmatize_sentence(sentence: str) -> str:
    """
    Replace every token of the sentence by its WordNet lemma

    The sentence is split with ``word_tokenize``, each token is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the results
    are glued back together with single spaces.

    :param sentence: str
        sentence to be lemmatized
    :return:
        lemmatized sentence
    """
    wordnet = WordNetLemmatizer()
    words = word_tokenize(sentence)
    return " ".join(map(wordnet.lemmatize, words))

In [158]:
def check_performance(vectorizer: CountVectorizer, knn: NearestNeighbors, vectorized_questions: csr_matrix) -> float:
    """
    Calculate performance of finding similar questions (lemmatized input)

    The test set is read from ``../../data/test_questions_json.json``. Both the
    test questions and their originals are passed through ``lemmatize_sentence``
    before being vectorized. The ``"question"`` and ``"original"`` lists are
    assumed to be aligned by position, and every lemmatized original is assumed
    to match exactly one row of ``vectorized_questions`` (TODO confirm against
    the data file).

    For each test question the 0-based position of its original among the
    neighbours returned by ``knn`` is added up; an original that is not among
    the neighbours is charged ``2 * knn.n_neighbors`` instead. The total is
    divided by the number of matched originals.

    :param vectorizer: CountVectorizer
        fitted term frequency vectorizer
    :param knn: NearestNeighbors
        K-nearest neighbors; must be fitted on the rows of ``vectorized_questions``
        because neighbour indices are compared with row numbers of that matrix
    :param vectorized_questions: csr_matrix
        input questions transformed with count vectorizer
    :return:
        score (lower is better)
    """
    with open("../../data/test_questions_json.json", encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    test_questions = json_data["question"]
    original = json_data["original"]

    test_vectors = vectorizer.transform([lemmatize_sentence(test_question) for test_question in test_questions])
    _, indices = knn.kneighbors(test_vectors.toarray())

    original_vectors = vectorizer.transform([lemmatize_sentence(orig) for orig in original]).toarray()
    corpus = vectorized_questions.toarray()
    # Look up the corpus row of each original one at a time: comparing all
    # originals against all rows in a single broadcast would allocate an
    # (originals x questions x vocabulary) boolean array.
    matches = [np.flatnonzero((corpus == row).all(axis=1)) for row in original_vectors]
    indices_original = np.concatenate(matches) if matches else np.empty(0, dtype=np.intp)

    # Column of the hit inside each neighbour list, i.e. the 0-based rank.
    rank = np.where(indices == indices_original[:, None])[1]
    # Take the neighbour count from the model itself rather than from a global,
    # so the penalty always agrees with the number of neighbours searched.
    n_missed = indices_original.shape[0] - rank.shape[0]
    penalization = n_missed * 2 * knn.n_neighbors
    score = (rank.sum() + penalization) / indices_original.shape[0]

    return score

In [159]:
# Lemmatize the unique questions first so the vocabulary is built from lemmas.
questions = np.asarray([lemmatize_sentence(question) for question in np.unique(df.iloc[:, 0].to_numpy())])

# Unigram term-frequency model on the lemmatized corpus.
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 1))
vectorized_questions = vectorizer.fit_transform(questions)

knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions.toarray())

# Mean rank (lower is better) rescaled into a "higher is better" percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 76.04%


## Term Frequency with Stemming

In [160]:
def stem_sentence(sentence: str) -> str:
    """
    Replace every token of the sentence by its Porter stem

    The sentence is split with ``word_tokenize``, each token is passed through
    ``PorterStemmer.stem`` with its default arguments, and the results are
    glued back together with single spaces.

    :param sentence: str
        sentence to be stemmed
    :return:
        stemmed sentence
    """
    porter = PorterStemmer()  # TODO: try other stemmer types
    words = word_tokenize(sentence)
    return " ".join(map(porter.stem, words))

In [161]:
def check_performance(vectorizer: CountVectorizer, knn: NearestNeighbors, vectorized_questions: csr_matrix) -> float:
    """
    Calculate performance of finding similar questions (stemmed input)

    The test set is read from ``../../data/test_questions_json.json``. Both the
    test questions and their originals are passed through ``stem_sentence``
    before being vectorized. The ``"question"`` and ``"original"`` lists are
    assumed to be aligned by position, and every stemmed original is assumed to
    match exactly one row of ``vectorized_questions`` (TODO confirm against the
    data file).

    For each test question the 0-based position of its original among the
    neighbours returned by ``knn`` is added up; an original that is not among
    the neighbours is charged ``2 * knn.n_neighbors`` instead. The total is
    divided by the number of matched originals.

    :param vectorizer: CountVectorizer
        fitted term frequency vectorizer
    :param knn: NearestNeighbors
        K-nearest neighbors; must be fitted on the rows of ``vectorized_questions``
        because neighbour indices are compared with row numbers of that matrix
    :param vectorized_questions: csr_matrix
        input questions transformed with count vectorizer
    :return:
        score (lower is better)
    """
    with open("../../data/test_questions_json.json", encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    test_questions = json_data["question"]
    original = json_data["original"]

    test_vectors = vectorizer.transform([stem_sentence(test_question) for test_question in test_questions])
    _, indices = knn.kneighbors(test_vectors.toarray())

    original_vectors = vectorizer.transform([stem_sentence(orig) for orig in original]).toarray()
    corpus = vectorized_questions.toarray()
    # Look up the corpus row of each original one at a time: comparing all
    # originals against all rows in a single broadcast would allocate an
    # (originals x questions x vocabulary) boolean array.
    matches = [np.flatnonzero((corpus == row).all(axis=1)) for row in original_vectors]
    indices_original = np.concatenate(matches) if matches else np.empty(0, dtype=np.intp)

    # Column of the hit inside each neighbour list, i.e. the 0-based rank.
    rank = np.where(indices == indices_original[:, None])[1]
    # Take the neighbour count from the model itself rather than from a global,
    # so the penalty always agrees with the number of neighbours searched.
    n_missed = indices_original.shape[0] - rank.shape[0]
    penalization = n_missed * 2 * knn.n_neighbors
    score = (rank.sum() + penalization) / indices_original.shape[0]

    return score

In [162]:
# Build a fresh unigram vectorizer here, as every other section does, instead of
# silently reusing whatever `vectorizer` an earlier cell left behind; the
# parameters match the vectorizer of the lemmatization section.
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 1))

# Stem the unique questions first so the vocabulary is built from stems.
questions = np.unique(df.iloc[:, 0].to_numpy())
questions = np.asarray([stem_sentence(question) for question in questions])
vectorized_questions = vectorizer.fit_transform(questions)

knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions.toarray())

# Mean rank (lower is better) rescaled into a "higher is better" percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 82.58%
