In [10]:
import pandas as pd
import numpy as np
from datasketch import MinHash, MinHashLSH
from itertools import zip_longest
from time import time
from collections import Counter
from sklearn.model_selection import train_test_split
import re

# Part 2: Nearest Neighbor Search with Locality Sensitive Hashing (LSH)

def preprocess_text(text):
    """
    Normalize a text string for shingling.

    The string is lowercased, then every character that is neither a word
    character (``\\w``: letters, digits, underscore) nor whitespace is removed.
    Whitespace itself is left untouched.
    :param text: The input text string.
    :return: The lowercased string with punctuation stripped.
    """
    return re.sub(r'[^\w\s]', '', text.lower())

def text_to_shingles(text, k=5):
    """
    Build the set of word-level k-shingles of a text.

    The text is preprocessed, split on whitespace, and every run of k
    consecutive words becomes one tuple. Texts with fewer than k words
    produce an empty set.
    :param text: Input string to be converted into shingles.
    :param k: Shingle size in words (default is 5).
    :return: A set of k-length tuples of words.
    """
    tokens = preprocess_text(text).split()
    # Zipping k staggered views of the token list stops at the shortest view,
    # so only complete k-word windows are produced.
    staggered = [tokens[offset:] for offset in range(k)]
    return set(zip(*staggered))

def create_minhash_vectorized(text_series, num_perm):
    """
    Create MinHash objects for a series of texts using shingles.

    Each text is shingled with ``text_to_shingles`` (default shingle size) and
    every shingle is fed to a fresh ``MinHash`` as UTF-8 bytes.
    :param text_series: Pandas Series (or any iterable) of input strings to hash.
    :param num_perm: Number of permutations for MinHash.
    :return: List of MinHash objects, one per input text, in input order.
    """
    minhash_list = []
    for text in text_series:
        m = MinHash(num_perm=num_perm)
        for shingle in text_to_shingles(text):
            # Join with a space so word boundaries survive serialization.
            # Joining with '' would make distinct shingles such as
            # ('ab', 'c') and ('a', 'bc') hash identically and inflate the
            # estimated similarity. Words never contain whitespace (they come
            # from str.split()), so this encoding is unambiguous.
            m.update(' '.join(shingle).encode('utf-8'))
        minhash_list.append(m)
    return minhash_list

def jaccard_similarity_vectorized(shingles1, shingles2):
    """
    Compute element-wise Jaccard similarity between two sequences of sets.

    The sequences are paired positionally; pairing stops at the shorter one.
    A pair in which either set is empty scores 0.
    :param shingles1: Iterable of sets (first set of shingles).
    :param shingles2: Iterable of sets (second set of shingles).
    :return: Numpy array with one Jaccard similarity score per pair.
    """
    scores = []
    for left, right in zip(shingles1, shingles2):
        if not left or not right:
            # An empty side means there is nothing to compare.
            scores.append(0)
            continue
        shared = len(left & right)
        combined = len(left | right)
        scores.append(shared / combined)
    return np.array(scores)

def _majority_label(labels):
    """Return the most frequent value among the given neighbor labels."""
    return Counter(labels).most_common(1)[0][0]


def nearest_neighbor_search():
    """
    Compare brute-force Jaccard K-NN against MinHash-LSH K-NN.

    Steps performed:
      1. Read ``train.csv`` and ``test_without_labels.csv``, keep a 1% sample
         of each (the training sample is stratified on ``Label``) and write
         the samples to ``train_subset.csv`` / ``test_subset.csv``.
      2. Shingle the ``Content`` column of both subsets.
      3. Brute force: compute the full test-by-train Jaccard matrix, take the
         7 most similar training rows per test row, predict by majority vote
         and write ``brute_force_knn_predictions.csv``.
      4. For ``num_perm`` in 16, 32, 64: build a MinHashLSH index
         (threshold 0.5), re-rank the returned candidates by exact Jaccard
         similarity, predict by majority vote ("Unknown" when the index
         returns no candidate), report build/query time and the average
         fraction of brute-force neighbors recovered, and write
         ``lsh_knn_predictions_<num_perm>.csv``.
    :return: None. Results are printed and written to CSV files.
    """
    # Load datasets
    train_ds_comma = pd.read_csv("train.csv")
    test_ds_comma = pd.read_csv("test_without_labels.csv")

    # The fraction of the datasets, used to create smaller, faster to work on datasets
    fraction = 0.01

    # Split the training dataset into a smaller one
    train_subset, _ = train_test_split(
        train_ds_comma,
        train_size=fraction,
        stratify=train_ds_comma['Label'],  # Ensure stratified sampling
        random_state=42,
    )
    train_subset.to_csv('train_subset.csv', index=False)

    # Split the testing dataset into a smaller one
    test_subset = test_ds_comma.sample(frac=fraction, random_state=42)
    test_subset.to_csv('test_subset.csv', index=False)

    # The two sets that we are going to work on. They are re-read from disk,
    # which also gives both frames a fresh default index.
    train_df = pd.read_csv('train_subset.csv')
    test_df = pd.read_csv('test_subset.csv')

    # Precompute shingles for training and test datasets
    train_shingles = train_df['Content'].apply(text_to_shingles)
    print(train_shingles.iloc[0])
    test_shingles = test_df['Content'].apply(text_to_shingles)

    # Brute-force K-NN
    print("Performing Brute-Force K-NN...")
    start_time = time()
    k = 7  # Number of nearest neighbors

    # Compute pairwise Jaccard similarity. No per-row debug output here: this
    # loop is inside the timed section and printing would distort the timing.
    similarities = np.zeros((len(test_shingles), len(train_shingles)))
    for i, test_doc in enumerate(test_shingles):
        similarities[i, :] = jaccard_similarity_vectorized(
            [test_doc] * len(train_shingles), train_shingles
        )

    print(similarities)
    # Get top-k nearest neighbors and predictions. Negating the matrix makes
    # argsort return the most similar training rows first.
    brute_force_results = np.argsort(-similarities, axis=1)[:, :k]
    brute_force_predictions = [
        _majority_label(train_df.iloc[neighbors]['Label'])
        for neighbors in brute_force_results
    ]

    brute_force_time = time() - start_time
    print(f"Brute-Force K-NN completed in {brute_force_time:.2f} seconds.")

    test_df['Predicted_BruteForce'] = brute_force_predictions
    test_df[['Id', 'Predicted_BruteForce']].to_csv('brute_force_knn_predictions.csv', index=False)

    # LSH-based K-NN
    print("Performing LSH-based K-NN...")
    for num_perm in [16, 32, 64]:  # Different configurations of permutations
        start_time = time()

        # Initialize LSH with a threshold
        lsh = MinHashLSH(threshold=0.5, num_perm=num_perm)

        # Insert training documents into the LSH index, keyed by row position
        train_minhashes = create_minhash_vectorized(train_df['Content'], num_perm)
        for i, minhash in enumerate(train_minhashes):
            lsh.insert(i, minhash)

        build_time = time() - start_time

        start_time = time()
        lsh_results = []
        lsh_predictions = []

        test_minhashes = create_minhash_vectorized(test_df['Content'], num_perm)
        for test_idx, minhash_test in enumerate(test_minhashes):
            # Query LSH for candidates
            candidates = lsh.query(minhash_test)

            if candidates:
                # Compute exact Jaccard similarity for candidates. The LSH keys
                # and test_idx are row positions, so look them up with .iloc
                # rather than relying on the index labels matching positions.
                candidate_shingles = [train_shingles.iloc[idx] for idx in candidates]
                candidate_similarities = jaccard_similarity_vectorized(
                    [test_shingles.iloc[test_idx]] * len(candidates),
                    candidate_shingles,
                )
                top_candidates = np.argsort(-candidate_similarities)[:k]
                top_indices = [candidates[idx] for idx in top_candidates]
                lsh_results.append(top_indices)

                # Perform majority voting for classification
                neighbor_labels = train_df.iloc[top_indices]['Label']
                lsh_predictions.append(_majority_label(neighbor_labels))
            else:
                # No candidates found
                lsh_results.append([])
                lsh_predictions.append("Unknown")

        query_time = time() - start_time

        # Calculate fraction of true K-nearest neighbors returned
        matched_fractions = []
        for brute_force_neighbors, lsh_neighbors in zip(brute_force_results, lsh_results):
            if len(brute_force_neighbors) > 0:
                matched_count = len(set(brute_force_neighbors) & set(lsh_neighbors))
                matched_fraction = matched_count / len(brute_force_neighbors)
                matched_fractions.append(matched_fraction)
        average_fraction = np.mean(matched_fractions) if matched_fractions else 0

        print(f"LSH Results (num_perm={num_perm}):")
        print(f"  Build Time: {build_time:.2f} seconds")
        print(f"  Query Time: {query_time:.2f} seconds")
        print(f"  Fraction Matched: {average_fraction:.2f}")

        # Save LSH predictions to file
        test_df[f'Predicted_LSH_{num_perm}'] = lsh_predictions
        test_df[['Id', f'Predicted_LSH_{num_perm}']].to_csv(f'lsh_knn_predictions_{num_perm}.csv', index=False)

# Script entry point: run the experiment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    nearest_neighbor_search()


{('need', 'to', 'know', 'from', 'friday'), ('scandals', 'over', 'the', 'companys', 'ability'), ('this', 'reinforces', 'this', 'mature', 'trend'), ('jcar015', 'after', 'two', 'people', 'in'), ('charles', 'schwab', 'told', 'business', 'insider'), ('to', 'retool', 'the', 'treatment', 'and'), ('to', 'the', 'races', 'again', 'but'), ('i', 'dont', 'think', 'this', 'suggests'), ('economy', 'added', '287000', 'jobs', 'in'), ('its', 'own', 'version', 'of', 'the'), ('the', 'fastest', 'pace', 'since', 'the'), ('not', 'heading', 'into', 'recessiontype', 'conditions'), ('hairs', 'breadth', 'of', 'an', 'alltime'), ('us', 'added', 'more', 'oil', 'rigs'), ('medicare', 'and', 'medicaid', 'services', 'barred'), ('the', 'week', 'the', 'sp', '500'), ('month', 'of', 'june', 'well', 'above'), ('canada', 'was', 'not', 'so', 'lucky'), ('212990', '3200', '153', 'nasdaq', '495676'), ('june', 'well', 'below', 'the', '5000'), ('yield', '13660', '151', 'ftse', '250'), ('seek', 'approval', 'in', '2018', 'the'), ('e