# Term Frequency

## Term Frequency with different distance metrics

In [1]:
import json
from collections.abc import Iterable

import nltk
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

# Default number of neighbours requested from each NearestNeighbors model; the
# experiment cells also use it to rescale the raw score into a percentage.
N_NEIGHBOURS = 100

In [2]:
def preprocess_documents(
    documents: Iterable[str], preprocessing: str, stemmer: str = "porter"
) -> Iterable[str]:
    """
    Run the requested text normalisation over every document

    :param documents: Iterable[str]
        documents to normalise
    :param preprocessing: str
        "stemming" or "lemmatizing"; any other value (including None)
        leaves the documents untouched
    :param stemmer: str
        stemmer type forwarded to stem_document, only used for "stemming"
    :return:
        numpy array of processed documents, or the very same input object
        when no preprocessing was applied
    """
    if preprocessing == "stemming":
        stemmed = [stem_document(text, stemmer) for text in documents]
        return np.asarray(stemmed)

    if preprocessing == "lemmatizing":
        lemmatized = [lemmatize_document(text) for text in documents]
        return np.asarray(lemmatized)

    # Unrecognised or missing mode: hand the input back unchanged.
    return documents

In [3]:
def check_performance(
    vectorizer: CountVectorizer,
    knn: NearestNeighbors,
    vectorized_questions: np.ndarray,
    preprocessing: str = None,
    stemmer: str = "porter",
) -> float:
    """
    Calculate performance of finding similar questions

    The test file provides, for every test question, the "original" question
    it should be matched to. Each test question is vectorized and its nearest
    neighbours are looked up; the position of the original among those
    neighbours is the rank of that test question. Test questions whose
    original is not among the neighbours are penalized with
    ``2 * knn.n_neighbors``. The score is the mean over all test questions.

    :param vectorizer: CountVectorizer
        fitted term frequency vectorizer
    :param knn: NearestNeighbors
        K-nearest neighbors model; its neighbour indices are compared with row
        indices of ``vectorized_questions``, so it is expected to be fitted on
        that same matrix
    :param vectorized_questions: np.ndarray
        input questions transformed with count vectorizer
    :param preprocessing: str
        represents type of preprocessing applied to documents
        (None applies no preprocessing)
    :param stemmer: str
        stemmer type
    :return:
        score (lesser is better)
    :raises ValueError:
        if an original question has no row in ``vectorized_questions`` with
        the same set of terms
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open("../../../data/test_questions_json.json", encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    test_questions = preprocess_documents(
        json_data["question"], preprocessing, stemmer
    )
    test_questions = vectorizer.transform(test_questions)
    _, indices = knn.kneighbors(test_questions.toarray())

    original = preprocess_documents(json_data["original"], preprocessing, stemmer)
    original = vectorizer.transform(original)
    original_terms = [
        frozenset(terms) for terms in vectorizer.inverse_transform(original)
    ]

    # Map each distinct term set to the first row that carries it, so every
    # original is resolved with one dictionary lookup instead of a linear scan.
    first_row_by_terms = {}
    for row, terms in enumerate(vectorizer.inverse_transform(vectorized_questions)):
        first_row_by_terms.setdefault(frozenset(terms), row)

    missing = [terms for terms in original_terms if terms not in first_row_by_terms]
    if missing:
        raise ValueError(
            f"{len(missing)} original question(s) have no matching row in "
            f"vectorized_questions, e.g. terms {sorted(missing[0])}"
        )

    indices_original = np.asarray(
        [first_row_by_terms[terms] for terms in original_terms]
    )

    # Column position of the original inside each neighbour list; rows without
    # a hit contribute nothing here and are penalized below.
    rank = np.where(indices == indices_original[:, None])[1]
    penalization = (indices_original.shape[0] - rank.shape[0]) * 2 * knn.n_neighbors
    score = (rank.sum() + penalization) / indices_original.shape[0]

    return score

In [4]:
# Load the tab-separated Q&A dataset and discard its first column
# (presumably a saved index -- confirm against the CSV).
df = pd.read_csv("../../../data/traveling_qna_dataset.csv", sep="\t")
df = df.drop(columns=df.columns[0])

# Term-frequency vectors for the questions held in the first remaining column.
questions = df.iloc[:, 0].to_numpy()
vectorizer = CountVectorizer(lowercase=True)

# Densify and collapse duplicate rows; np.unique returns the rows sorted, so
# row order no longer follows the order of `questions`.
vectorized_questions = np.unique(
    vectorizer.fit_transform(questions).toarray(), axis=0
)

### Euclidean metric

In [5]:
# Neighbour search over the question vectors using Euclidean distance.
knn = NearestNeighbors(metric="euclidean", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions)

# The raw score is rescaled by its worst case (2 * N_NEIGHBOURS) so the
# printed percentage is higher-is-better.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 92.43%


### Manhattan (cityblock) metric

In [6]:
# Neighbour search over the question vectors using Manhattan (cityblock) distance.
knn = NearestNeighbors(metric="cityblock", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions)

# Rescale the raw score by 2 * N_NEIGHBOURS to report a higher-is-better percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 92.19%


### Cosine metric

In [7]:
# Neighbour search over the question vectors using cosine distance.
knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions)

# Rescale the raw score by 2 * N_NEIGHBOURS to report a higher-is-better percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 96.45%


## Comparison between different numbers of neighbours

In [9]:
# Neighbour counts to compare with the cosine metric.
N_NEIGHBOURS_GRID = (1, 5, 10, 25, 50, 100, 150, 200, 300, 400, 500, 1000, 2000, 3000)

# A dedicated loop variable is used so the notebook-wide N_NEIGHBOURS constant
# is never overwritten, even if one of the iterations fails part-way through.
for n_neighbours in N_NEIGHBOURS_GRID:
    knn = NearestNeighbors(n_neighbors=n_neighbours, metric="cosine").fit(vectorized_questions)

    score = check_performance(vectorizer, knn, vectorized_questions)
    # Each score is rescaled by the worst case for its own neighbour count.
    print(f"Number of neighbours: {n_neighbours} | Score: {100 - score / (2 * n_neighbours) * 100:.2f}%")

Number of neighbours: 1 | Score: 93.33%
Number of neighbours: 5 | Score: 94.83%
Number of neighbours: 10 | Score: 94.92%
Number of neighbours: 25 | Score: 94.97%
Number of neighbours: 50 | Score: 96.20%
Number of neighbours: 100 | Score: 96.45%
Number of neighbours: 150 | Score: 97.36%
Number of neighbours: 200 | Score: 97.59%
Number of neighbours: 300 | Score: 97.83%
Number of neighbours: 400 | Score: 98.90%
Number of neighbours: 500 | Score: 99.15%
Number of neighbours: 1000 | Score: 99.57%
Number of neighbours: 2000 | Score: 99.79%
Number of neighbours: 3000 | Score: 99.86%


## Term Frequency with Stop-Words

In [10]:
# Same term-frequency pipeline, with the vectorizer's built-in English
# stop-word list enabled.
questions = df.iloc[:, 0].to_numpy()
vectorizer = CountVectorizer(stop_words="english", lowercase=True)

# Densify and keep one copy of each distinct count vector.
vectorized_questions = np.unique(
    vectorizer.fit_transform(questions).toarray(), axis=0
)

knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions)

# Rescale the raw score by 2 * N_NEIGHBOURS to report a higher-is-better percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 96.67%


## Term Frequency with N-grams

In [11]:
# Count unigrams and bigrams, with the vocabulary capped at 10000 features.
questions = df.iloc[:, 0].to_numpy()
vectorizer = CountVectorizer(max_features=10000, ngram_range=(1, 2), lowercase=True)

# Densify and keep one copy of each distinct count vector.
vectorized_questions = np.unique(
    vectorizer.fit_transform(questions).toarray(), axis=0
)

knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions)

# Rescale the raw score by 2 * N_NEIGHBOURS to report a higher-is-better percentage.
score = check_performance(vectorizer, knn, vectorized_questions)
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 96.27%


## Term Frequency with Lemmatization

In [12]:
def lemmatize_document(document: str) -> str:
    """
    Replace every token of the document with its WordNet lemma

    The document is split with word_tokenize, each token is lemmatized
    without an explicit part-of-speech tag, and the results are joined back
    together with single spaces.

    :param document: str
        document to be lemmatized
    :return:
        lemmatized document
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(word) for word in word_tokenize(document))

In [13]:
# Lemmatize the questions before counting terms; the same preprocessing mode
# is passed to check_performance so the test questions are treated alike.
questions = preprocess_documents(df.iloc[:, 0].to_numpy(), "lemmatizing")
vectorizer = CountVectorizer(lowercase=True)

# Densify and keep one copy of each distinct count vector.
vectorized_questions = np.unique(
    vectorizer.fit_transform(questions).toarray(), axis=0
)

knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions)

# Rescale the raw score by 2 * N_NEIGHBOURS to report a higher-is-better percentage.
score = check_performance(vectorizer, knn, vectorized_questions, "lemmatizing")
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 96.42%


## Term Frequency with Stemming

In [14]:
# Supported stemmer names mapped to zero-argument factories. The lambdas defer
# construction until a stemmer is first requested.
_STEMMER_FACTORIES = {
    "porter": lambda: PorterStemmer(),
    "snowball": lambda: SnowballStemmer("english"),
    "lancaster": lambda: LancasterStemmer(),
}

# One stemmer instance per name, built on first use and then reused, so that
# stemming a whole corpus does not construct a new stemmer for every document.
_STEMMER_CACHE = {}


def _get_stemmer(name):
    """Return the cached stemmer for ``name``; raise ValueError if unsupported."""
    factory = _STEMMER_FACTORIES.get(name) if isinstance(name, str) else None
    if factory is None:
        raise ValueError(
            f"Stemmer type '{name}' is not supported. Try with 'porter', 'snowball' or 'lancaster'."
        )
    if name not in _STEMMER_CACHE:
        _STEMMER_CACHE[name] = factory()
    return _STEMMER_CACHE[name]


def stem_document(document: str, stemmer: str = "porter") -> str:
    """
    Stem the input document and return processed document

    The document is split with word_tokenize, every token is stemmed and the
    stems are joined back together with single spaces.

    :param document: str
        document to be stemmed
    :param stemmer: str
        stemmer type: 'porter', 'snowball' or 'lancaster'
    :return:
        stemmed document
    :raises ValueError:
        if the stemmer type is not supported
    """
    # Resolve the stemmer first so an unsupported name fails before tokenizing.
    stemmer_instance = _get_stemmer(stemmer)
    return " ".join(stemmer_instance.stem(token) for token in word_tokenize(document))

### Porter stemmer

In [15]:
# Build a fresh vectorizer for this experiment instead of reusing whichever
# one an earlier cell left in the notebook namespace, so the result does not
# depend on cell execution order.
vectorizer = CountVectorizer(lowercase=True)

# Stem the questions with the Porter stemmer before counting terms.
questions = df.iloc[:, 0].to_numpy()
questions = preprocess_documents(questions, "stemming", "porter")
vectorized_questions = vectorizer.fit_transform(questions)
# Densify and keep one copy of each distinct count vector.
vectorized_questions = np.unique(vectorized_questions.toarray(), axis=0)

knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

# The test questions are stemmed the same way inside check_performance.
score = check_performance(vectorizer, knn, vectorized_questions, "stemming", "porter")
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 97.78%


### Snowball stemmer

In [16]:
# Build a fresh vectorizer for this experiment instead of reusing whichever
# one an earlier cell left in the notebook namespace, so the result does not
# depend on cell execution order.
vectorizer = CountVectorizer(lowercase=True)

# Stem the questions with the Snowball stemmer before counting terms.
questions = df.iloc[:, 0].to_numpy()
questions = preprocess_documents(questions, "stemming", "snowball")
vectorized_questions = vectorizer.fit_transform(questions)
# Densify and keep one copy of each distinct count vector.
vectorized_questions = np.unique(vectorized_questions.toarray(), axis=0)

knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

# The test questions are stemmed the same way inside check_performance.
score = check_performance(vectorizer, knn, vectorized_questions, "stemming", "snowball")
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 97.78%


### Lancaster stemmer

In [17]:
# Build a fresh vectorizer for this experiment instead of reusing whichever
# one an earlier cell left in the notebook namespace, so the result does not
# depend on cell execution order.
vectorizer = CountVectorizer(lowercase=True)

# Stem the questions with the Lancaster stemmer before counting terms.
questions = df.iloc[:, 0].to_numpy()
questions = preprocess_documents(questions, "stemming", "lancaster")
vectorized_questions = vectorizer.fit_transform(questions)
# Densify and keep one copy of each distinct count vector.
vectorized_questions = np.unique(vectorized_questions.toarray(), axis=0)

knn = NearestNeighbors(n_neighbors=N_NEIGHBOURS, metric="cosine").fit(vectorized_questions)

# The test questions are stemmed the same way inside check_performance.
score = check_performance(vectorizer, knn, vectorized_questions, "stemming", "lancaster")
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 97.81%


## Term Frequency with Stemming and Stop-Words

In [18]:
# Combine Snowball stemming with the vectorizer's English stop-word list.
# NOTE(review): the stop-word filter runs on already-stemmed tokens, so stop
# words whose stem differs from the listed form may not be removed -- confirm
# this is intended.
questions = preprocess_documents(df.iloc[:, 0].to_numpy(), "stemming", "snowball")
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 1), lowercase=True)

# Densify and keep one copy of each distinct count vector.
vectorized_questions = np.unique(
    vectorizer.fit_transform(questions).toarray(), axis=0
)

knn = NearestNeighbors(metric="cosine", n_neighbors=N_NEIGHBOURS)
knn.fit(vectorized_questions)

# Rescale the raw score by 2 * N_NEIGHBOURS to report a higher-is-better percentage.
score = check_performance(vectorizer, knn, vectorized_questions, "stemming", "snowball")
print(f"Score: {100 - score / (2 * N_NEIGHBOURS) * 100:.2f}%")

Score: 98.31%
