In [10]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datasketch import MinHash, MinHashLSH
from time import time
from itertools import zip_longest
from collections import Counter



# def text_to_shingles(text, k=5):
#     text=text.lower()
#     text= re.sub(r'[,;.:?!@*&#-_|\s]','',text)
#     shingles = []
#     for i in range(0, len(text) - k):
#         shingles.append(text[i:i + k])
#     return set(shingles)


# def create_minhash(text, num_perm):
#     m = MinHash(num_perm=num_perm)
#     for shingle in text_to_shingles(text):
#         m.update(shingle.encode('utf-8'))
#     return m

# def text_to_shingles(text,k=2):
#     words=text.lower()
#     words=re.split(r'[,.\s!"?;_*@)(]',words)

#     # shingles=set()
#     # for i in range(0,len(words) - k):
#     #     if '' not in words[i:i + k]:
#     #         shingles.add(' '.join(words[i:i + k]))

#     return set(words)

# group C
def text_to_shingles(text, k=4):
    text=text.lower()
    words = re.split(r'[,.\s!"?;_*@]',text)
    shingles = set(zip_longest(*[words[i:] for i in range(k)], fillvalue=''))
    # Filter out incomplete shingles containing the fillvalue
    return set(shingle for shingle in shingles if '' not in shingle)


def jaccard_similarity(set1, set2):
    return len(set1 & set2) / len(set1 | set2)

# group C
# Create MinHash signatures
def create_minhash(text, num_perm):
    m = MinHash(num_perm=num_perm)
    for shingle in text_to_shingles(text):
        m.update(''.join(shingle).encode('utf-8'))
    return m

# Part 2: Nearest Neighbor Search with LSH
def nearest_neighbor_search():
# we read the train and the test datasets from the respective files
    train_ds_comma=pd.read_csv("train.csv")
    test_ds_comma=pd.read_csv("test_without_labels.csv")


    fraction=0.05 # the fraction of the datasets, used to create smaller, faster to work on datasets

# Split the training dataset into a smaller one
    train_subset, _ = train_test_split(
        train_ds_comma, 
        train_size=fraction, 
        stratify=train_ds_comma['Label'],  # Ensure stratified sampling
        random_state=42 
    )

    train_subset.to_csv('train_subset.csv', index=False)

# Split the testing dataset into a smaller one
    test_subset=test_ds_comma.sample(frac=fraction,random_state=42)
    test_subset.to_csv('test_subset.csv',index=False)

# the two sets that we are going to work on
    train_subset=pd.read_csv('train_subset.csv')
    test_subset=pd.read_csv('test_subset.csv')  

    train_df = train_subset
    test_df = test_subset

    # train_df = train_ds_comma
    # test_df = test_ds_comma

    train_shingles = (train_df['Title']+train_df['Content']).apply(text_to_shingles)
    print(train_shingles)
    test_shingles = (test_df['Title']+test_df['Content']).apply(text_to_shingles)

    # Brute-force K-NN
    print("Performing Brute-Force K-NN...")
    start_time = time()
    k = 7  # Number of nearest neighbors
    bf_results = []
    bf_predictions = []

    for test_doc in test_shingles:
        # Calculate Jaccard similarity with all training documents
        # print(test_doc)
        similarities = [(i, jaccard_similarity(test_doc, train_doc)) for i, train_doc in enumerate(train_shingles)]
        similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]  # Get top-k
        print(similarities)
        # Save the indices of nearest neighbors
        bf_results.append([idx for idx, _ in similarities])

        # Perform majority voting for classification
        neighbor_labels = [train_df.iloc[idx]['Label'] for idx, _ in similarities]
        majority_label = Counter(neighbor_labels).most_common(1)[0][0]
        bf_predictions.append(majority_label)

    brute_force_time = time() - start_time
    print(f"Brute-Force K-NN completed in {brute_force_time:.2f} seconds.")

    test_df['Predicted_BruteForce'] = bf_predictions
    test_df[['Id', 'Predicted_BruteForce']].to_csv('brute_force_knn_predictions.csv', index=False)

    # LSH-based K-NN
    print("Performing LSH-based K-NN...")
    for perm in [16, 32, 64]:  # Different configurations of permutations
        start_time = time()

        b = int(8 * perm/16)
        r = int(perm / b)

        # Initialize LSH with a threshold
        lsh = MinHashLSH(threshold=0.2, num_perm=perm,params=(b,r))


        # Insert training documents into the LSH index
        
        tr_docs=enumerate(train_df['Title']+train_df['Content'])
        for i, text in tr_docs:
            minhash_train = create_minhash(text, perm)
            lsh.insert(i, minhash_train)


        build_time = time() - start_time

        start_time = time()
        lsh_results = []
        lsh_predictions = []

        te_docs=enumerate(test_df['Title']+test_df['Content'])
        for test_idx, test_text in te_docs:
            # Create MinHash for the test document
            minhash_test = create_minhash(test_text, perm)

            # print(f"MinHash Signature for Test Doc {test_idx}: {minhash_test.hashvalues[:5]}")
            # Query LSH for candidates
            candidates = lsh.query(minhash_test)

            # print(f"Test Doc {test_idx}: Found {len(candidates)} candidates.")

            if candidates:
                # Compute Jaccard similarity for candidates
                similarities = [(idx, jaccard_similarity(train_shingles[idx], text_to_shingles(test_text))) for idx in candidates]
                similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]
                lsh_results.append([idx for idx, _ in similarities])

                # Perform majority voting for classification
                neighbor_labels = [train_df.iloc[idx]['Label'] for idx, _ in similarities]
                majority_label = Counter(neighbor_labels).most_common(1)[0][0]
                lsh_predictions.append(majority_label)
            else:
                # No candidates found
                lsh_results.append([])
                lsh_predictions.append("Unknown")

        query_time = time() - start_time

        # Calculate fraction of true K-nearest neighbors returned
        matched_fractions = []
        for brute_force_neighbors, lsh_neighbors in zip(bf_results, lsh_results):
            if brute_force_neighbors:
                matched_count = len(set(brute_force_neighbors) & set(lsh_neighbors))
                matched_fraction = matched_count / len(brute_force_neighbors)
                matched_fractions.append(matched_fraction)
        average_fraction = np.mean(matched_fractions) if matched_fractions else 0

        print(f"LSH Results (num_perm={perm}):")
        print(f"  Build Time: {build_time:.2f} seconds")
        print(f"  Query Time: {query_time:.2f} seconds")
        print(f"  Fraction Matched: {average_fraction:.2f}")

        # Save LSH predictions to file
        test_df[f'Predicted_LSH_{perm}'] = lsh_predictions
        test_df[['Id', f'Predicted_LSH_{perm}']].to_csv(f'lsh_knn_predictions_{perm}.csv', index=False)


# # Part 3: Dynamic Time Warping
# def dtw_distance(seq_a, seq_b):
#     n, m = len(seq_a), len(seq_b)
#     dtw_matrix = np.full((n + 1, m + 1), float('inf'))
#     dtw_matrix[0, 0] = 0

#     for i in range(1, n + 1):
#         for j in range(1, m + 1):
#             cost = abs(seq_a[i - 1] - seq_b[j - 1])
#             dtw_matrix[i, j] = cost + min(dtw_matrix[i - 1, j], dtw_matrix[i, j - 1], dtw_matrix[i - 1, j - 1])

#     return dtw_matrix[n, m]

# def dynamic_time_warping():
#     data = pd.read_csv('time_series.csv')
#     results = []

#     start_time = time()
#     for idx, row in data.iterrows():
#         seq_a = np.array(eval(row['seq_a']))
#         seq_b = np.array(eval(row['seq_b']))
#         dtw_dist = dtw_distance(seq_a, seq_b)
#         results.append({'id': row['id'], 'DTW distance': dtw_dist})
#     total_time = time() - start_time

#     results_df = pd.DataFrame(results)
#     results_df.to_csv('dtw_test.csv', index=False)
#     print(f"Total DTW Time: {total_time}")

# Execute tasks
if __name__ == "__main__":
    nearest_neighbor_search()
    # dynamic_time_warping()


0       {(dongle, when, it, comes), (streaming, stick,...
1       {(bank, atms, vulnerable, to), (atms, from, po...
2       {(to, make, the, high-tech), (ray-ban, and, oa...
3       {(no, indication, that, the), (westbound, lane...
4       {(black-coloured, dummy, of, the), (by, curved...
                              ...                        
1112    {(month-on-month, change, in, most), (fell, by...
1113    {(comics, space, rise, in), (day, as, the, pri...
1114    {(has, fallen, by, 15), (it, could, create, a)...
1115    {(hard, to, keep, her), (these, old, buildings...
1116    {(because, i, think, she), (one, most-liked, p...
Length: 1117, dtype: object
Performing Brute-Force K-NN...
[(32, 0.0020242914979757085), (660, 0.001053740779768177), (0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0)]
[(134, 0.0038022813688212928), (634, 0.0018115942028985507), (5, 0.0006743088334457181), (0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0)]
[(202, 0.011029411764705883), (228, 0.008771929824561403), (274, 