In [7]:
import pandas as pd
import numpy as np
from datasketch import MinHash, MinHashLSH
from itertools import zip_longest
from time import time
from collections import Counter
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords

# import nltk
# nltk.download("stopwords")
# Part 2: Nearest Neighbor Search with Locality Sensitive Hashing (LSH)
def set_to_list(list):
    s=set()
    for item in list:
        s.add(item)
    return s

def preprocess_text(text):
    """
    Preprocess the input text by lowercasing and removing punctuation.
    :param text: The input text string.
    :return: Preprocessed text string.
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w]', '', text)  # Remove punctuation
    return text

def text_to_shingles(text, k=4):
    """
    Convert text into shingles (sets of overlapping word sequences of size k).
    :param text: Input string to be converted into shingles.
    :param k: Shingle size (default is 5).
    :return: A set of k-length shingles.
    """
    text = preprocess_text(text)  # Preprocess the text
    # word_list = text.split()
    # shingles = zip_longest(*[words[i:] for i in range(k)], fillvalue='')
    # Filter out incomplete shingles containing the fillvalue
    # text= re.sub(r'[,;.:?!@*&#-_|\s]','',text)
    shingles = []
    for i in range(0, len(text) - k):
        shingles.append(text[i:i + k])
    return set(shingles)
    # return [word for word in word_list if word not in stopwords.words('english')]
    # return {shingle for shingle in shingles if '' not in shingle}

def create_minhash_vectorized(text_series, num_perm):
    """
    Create MinHash objects for a series of texts using shingles.
    :param text_series: Pandas Series of input strings to hash.
    :param num_perm: Number of permutations for MinHash.
    :return: List of MinHash objects.
    """
    minhash_list = []
    for text in text_series:
        m = MinHash(num_perm=num_perm)
        for shingle in text_to_shingles(text):
            m.update(''.join(shingle).encode('utf-8'))
        minhash_list.append(m)
    return minhash_list

def jaccard_distance(vector1, vector2):
    """
    Compute Jaccard distance between two sparse vectors/matrices.
    :vector1: First sparse vector, containing float numbers.
    :vector2: Second sparse vector, containing float numbers
    :return: Jaccard distance score(1-Jaccard similarity).
    """
    # we make the vectors/matrices dense so that we can work on them
    v1dense=vector1.todense()
    v2dense=vector2.todense()
    
    # we turn them into boolean vectors, in which every value is either True or False
    v1bool=v1dense.astype(bool)
    v2bool=v2dense.astype(bool)

    # The intersection of the two boolean vectors is the result of the logical AND
    intersection = np.logical_and(v1bool,v2bool) 
    
    # The union of the two boolean vectors is the result of the logical OR
    union = np.logical_or(v1bool,v2bool)
    
    # The size of the intersection is the number of all the non-Zero(non-False) items
    intersection_size=np.count_nonzero(intersection)
    # The size of the union is the number of all the non-Zero(non-False) items
    union_size=np.count_nonzero(union)

    return (1- intersection_size / union_size) if union_size > 0 else 0


def jaccard_similarity(doc1: set,doc2:set):
    intersection = doc1.intersection(doc2)
    union = doc1.union(doc2)

    return 1 - len(intersection)/len(union)


def create_minhash(vector, num_perm):
    m = MinHash(num_perm=num_perm)
    # print("Create minhash vector",vector)
    for idx in vector:
        print(idx)
        m.update(str(idx).encode('utf8'))
    return m

def nearest_neighbor_search():
    # Load datasets
    print("Loading the training and the test datasets...")
    train_ds_comma=pd.read_csv("train.csv")
    test_ds_comma=pd.read_csv("test_without_labels.csv")


    fraction=0.01 # the fraction of the datasets, used to create smaller, faster to work on datasets

    # Split the training dataset into a smaller one
    print("Splitting the training dataset to a smaller one ...")
    train_subset, _ = train_test_split(
        train_ds_comma, 
        train_size=fraction, 
        stratify=train_ds_comma['Label'],  # Ensure stratified sampling
        random_state=42 
    )

    train_subset.to_csv('train_subset.csv', index=False)

    # Split the testing dataset into a smaller one
    print("Splitting the testing dataset to a smaller one ...")
    test_subset=test_ds_comma.sample(frac=fraction,random_state=42)
    test_subset.to_csv('test_subset.csv',index=False)

    # the two sets that we are going to work on
    train_subset=pd.read_csv('train_subset.csv')
    test_subset=pd.read_csv('test_subset.csv')  

    train_df = train_subset
    test_df = test_subset

    # Example dataset
    train_docs = train_df['Content'] + train_df['Title']
    test_docs = test_df['Content'] + test_df['Title']

    # Preprocessing of the text data
    print("Performing preprocessing and shingling of the data...")

    print("Shingling the training set...")
    list_of_train_shingle=[]
    for tr_doc in train_docs:
        # print("OG text is:",tr_doc)
        sh_doc=text_to_shingles(tr_doc)
        # print("Shingled  text is:",set(sh_doc))
        list_of_train_shingle.append(sh_doc)

    print("Shingling the test set...")
    list_of_test_shingle=[]
    for te_doc in test_docs:
        # print("OG text is:",tr_doc)
        sh_doc=text_to_shingles(te_doc)
        # print("Shingled  text is:",set(sh_doc))
        list_of_test_shingle.append(sh_doc)

    # for i in range(len(list_of_train_shingle)): 
    #     print(list_of_train_shingle[i])

    # for i in range(len(list_of_test_shingle)): 
    #     print(list_of_test_shingle[i])
    

    # Compute K=7 nearest neighbors for each test document
    K = 7
    nearest_neighbors = []
    
    # Brute-force K-NN
    print("Performing Brute-Force K-NN...")
    start_time = time() # we start the clock
    
    for teindex in range(len(list_of_test_shingle)):
        similarities = []
        
        for traindex in range(len(list_of_train_shingle)):
            sim = jaccard_similarity(list_of_test_shingle[teindex], list_of_train_shingle[traindex])
            similarities.append((traindex, sim)) # we keep indexes of train sets
        
        # print("Similarities are",similarities)
        sorted_sim=sorted(similarities,key=lambda x : x[1])
        # print("Sorted similiraties",sorted_sim)
        top_k_neighbors = [idx for idx, sim in sorted_sim[:K]]
        
        nearest_neighbors.append(top_k_neighbors)
    
    brute_force_time = time() - start_time # we stop the clock

    # Print the K=7 nearest neighbors for each test document
    # for test_doc, neighbors in enumerate(nearest_neighbors):
    #     print(f"Test Document {test_doc}: Nearest Neighbors (Train Indexes): {neighbors}")



    print(f"Brute-Force K-NN completed in {brute_force_time:.2f} seconds.")

        # test_df['Predicted_BruteForce'] = brute_force_predictions
        # test_df[['Id', 'Predicted_BruteForce']].to_csv('brute_force_knn_predictions.csv', index=False)

    return
    # LSH-based K-NN
    print("Performing LSH-based K-NN...")
    for num_perm in [16,32,64]:
        start_time = time()
        lsh = MinHashLSH(threshold=0.1, num_perm=num_perm)
        train_minhashes = [create_minhash(X_train[i], num_perm) for i in range(X_train.shape[0])]
        
        for i, minhash in enumerate(train_minhashes):
            lsh.insert(str(i), minhash)
        
        build_time = time() - start_time
        start_time = time()
        lsh_results = []
        
        for i in range(X_test.shape[0]):
            test_minhash = create_minhash(X_test[i], num_perm)
            candidates = lsh.query(test_minhash)
            #print(candidates)
            lsh_results.append([int(c) for c in candidates])
        
        query_time = time() - start_time
        
        matched_fractions = []
        for brute_neighbors, lsh_neighbors in zip(nearest_neighbors, lsh_results):
            matched_count = len(set(brute_neighbors) & set(lsh_neighbors))
            matched_fraction = matched_count / len(brute_neighbors) if brute_neighbors else 0
            matched_fractions.append(matched_fraction)
        
        average_fraction = np.mean(matched_fractions) if matched_fractions else 0
        
        print(f"LSH Results (num_perm={num_perm}):")
        print(f"  Build Time: {build_time:.2f} seconds")
        print(f"  Query Time: {query_time:.2f} seconds")
        print(f"  Fraction Matched: {average_fraction:.2f}")
if __name__ == "__main__":
    nearest_neighbor_search()


Loading the training and the test datasets...
Splitting the training dataset to a smaller one ...
Splitting the testing dataset to a smaller one ...
Performing preprocessing and shingling of the data...
Shingling the training set...
Shingling the test set...
Performing Brute-Force K-NN...
Brute-Force K-NN completed in 71.98 seconds.
