# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datasketch import MinHash, MinHashLSH
from time import time
from itertools import zip_longest
from collections import Counter


# Part 2: Nearest Neighbor Search with LSH
def nearest_neighbor_search():
    """Compare exact Jaccard K-NN with MinHash-LSH candidate search.

    Reads ``train_set.csv`` and ``test_set.csv`` (both ``|``-separated). The
    training set must provide ``Content`` and ``Label`` columns, the test set
    ``Id`` and ``Content`` columns.

    Steps:

    1. Brute force: every test document is compared with every training
       document using the Jaccard similarity of their word 5-shingle sets.
       The 7 most similar training documents vote on the label. Predictions
       are written to ``brute_force_knn_predictions.csv``.
    2. LSH: for 16, 32 and 64 permutations a ``MinHashLSH`` index
       (threshold 0.9) is built over the training set. Each test document
       queries the index, the returned candidates are re-ranked with the
       exact Jaccard similarity, and the top 7 vote on the label
       (``"Unknown"`` when there is no candidate). Predictions are written to
       ``lsh_knn_predictions_<num_perm>.csv``. Build time, query time and
       the average fraction of the brute-force neighbours that LSH also
       returned are printed.

    Training documents are identified by their 0-based row position
    everywhere (LSH keys, neighbour lists, label lookup).
    """
    train_df = pd.read_csv('train_set.csv', sep='|')
    test_df = pd.read_csv('test_set.csv', sep='|')

    k = 7  # Number of nearest neighbors

    # Plain list so label lookup is positional and cheap (no per-row iloc).
    train_labels = train_df['Label'].tolist()

    # Convert text to shingles (sets of word k-tuples)
    def text_to_shingles(text, k=5):
        # A missing cell is read by pandas as NaN (a float); treat it as an
        # empty document instead of crashing on ``.split()``.
        words = text.split() if isinstance(text, str) else []
        # zip_longest pads the trailing windows with '' so that documents
        # shorter than k words still produce at least one shingle.
        return set(zip_longest(*[words[i:] for i in range(k)], fillvalue=''))

    # Create MinHash signatures
    def create_minhash(text, num_perm):
        m = MinHash(num_perm=num_perm)
        for shingle in text_to_shingles(text):
            m.update(' '.join(shingle).encode('utf-8'))
        return m

    def jaccard_similarity(set1, set2):
        union_size = len(set1 | set2)
        if not union_size:
            # Both documents are empty: the ratio is 0/0. Treat two empty
            # documents as identical rather than raising ZeroDivisionError.
            return 1.0
        return len(set1 & set2) / union_size

    def top_k_neighbors(scored):
        """Return the ids of the k highest-scoring (id, similarity) pairs."""
        # sorted() is stable, so ties keep their original relative order.
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)[:k]
        return [idx for idx, _ in ranked]

    def vote(neighbor_ids):
        """Return the most common label among the neighbours, or "Unknown"."""
        if not neighbor_ids:
            return "Unknown"  # Handle cases where no neighbors are found
        labels = Counter(train_labels[idx] for idx in neighbor_ids)
        return labels.most_common(1)[0][0]

    # Brute-force K-NN using Jaccard similarity
    start_time = time()
    train_shingles = [text_to_shingles(text) for text in train_df['Content']]
    test_shingles = [text_to_shingles(text) for text in test_df['Content']]

    brute_force_results = []
    brute_force_predictions = []
    for test_doc in test_shingles:
        scored = [
            (i, jaccard_similarity(test_doc, train_doc))
            for i, train_doc in enumerate(train_shingles)
        ]
        neighbors = top_k_neighbors(scored)
        brute_force_results.append(neighbors)

        # Majority voting for classification
        brute_force_predictions.append(vote(neighbors))

    brute_force_time = time() - start_time
    print(f"Brute-Force K-NN Time: {brute_force_time}")

    test_df['Predicted_BruteForce'] = brute_force_predictions
    test_df[['Id', 'Predicted_BruteForce']].to_csv('brute_force_knn_predictions.csv', index=False)

    # LSH with Min-Hashing
    for num_perm in [16, 32, 64]:
        start_time = time()
        lsh = MinHashLSH(threshold=0.9, num_perm=num_perm)

        # Keys are row positions, matching train_shingles / train_labels.
        for i, text in enumerate(train_df['Content']):
            lsh.insert(i, create_minhash(text, num_perm))

        build_time = time() - start_time

        start_time = time()
        lsh_results = []
        lsh_predictions = []
        for test_text, test_doc in zip(test_df['Content'], test_shingles):
            candidates = lsh.query(create_minhash(test_text, num_perm))
            # Re-rank the candidates with the exact similarity, reusing the
            # shingle sets computed above instead of re-shingling both
            # documents for every candidate.
            scored = [
                (idx, jaccard_similarity(train_shingles[idx], test_doc))
                for idx in candidates
            ]
            neighbors = top_k_neighbors(scored)
            lsh_results.append(neighbors)
            lsh_predictions.append(vote(neighbors))

        query_time = time() - start_time

        # Calculate fraction of true K-most similar documents returned
        matched_fractions = [
            len(set(exact) & set(approx)) / len(exact)
            for exact, approx in zip(brute_force_results, lsh_results)
            if exact
        ]
        average_fraction = np.mean(matched_fractions) if matched_fractions else 0

        print(f"LSH Results (Permutations={num_perm}):")
        print(f"Build Time: {build_time}, Query Time: {query_time}, Fraction Matched: {average_fraction:.2f}")

        test_df[f'Predicted_LSH_{num_perm}'] = lsh_predictions
        test_df[["Id", f"Predicted_LSH_{num_perm}"]].to_csv(f'lsh_knn_predictions_{num_perm}.csv', index=False)

# Part 3: Dynamic Time Warping
def dtw_distance(seq_a, seq_b):
    """Return the dynamic-time-warping distance between two sequences.

    The local cost of aligning two elements is their absolute difference.
    A ``(len(seq_a) + 1) x (len(seq_b) + 1)`` table of accumulated costs is
    filled row by row; the answer is its bottom-right cell.

    Returns 0 when both sequences are empty and ``inf`` when exactly one of
    them is empty (no alignment exists).
    """
    rows = len(seq_a)
    cols = len(seq_b)

    # Row 0 / column 0 form an infinite-cost border so that every warping
    # path is forced to start at cell (0, 0).
    acc = np.full((rows + 1, cols + 1), float('inf'))
    acc[0, 0] = 0

    for r, a_val in enumerate(seq_a, start=1):
        for c, b_val in enumerate(seq_b, start=1):
            step_cost = abs(a_val - b_val)
            # Cheapest way to arrive: from above, from the left, or diagonally.
            cheapest_prev = min(acc[r - 1, c], acc[r, c - 1], acc[r - 1, c - 1])
            acc[r, c] = step_cost + cheapest_prev

    return acc[rows, cols]

def dynamic_time_warping():
    """Compute the DTW distance for every sequence pair in ``time_series.csv``.

    The input file must provide the columns ``id``, ``seq_a`` and ``seq_b``,
    where the two sequence columns hold Python list literals as text.
    One row per input row is written to ``dtw.csv`` with the columns ``id``
    and ``DTW distance``, and the total computation time is printed.
    """
    # Function-scope import keeps this block self-contained; the module's
    # top-level imports do not include ``ast``.
    from ast import literal_eval

    data = pd.read_csv('time_series.csv')
    results = []

    start_time = time()
    for row_id, raw_a, raw_b in zip(data['id'], data['seq_a'], data['seq_b']):
        # NOTE(security): this used to call eval() on the cell text, which
        # executes arbitrary code embedded in the CSV. literal_eval only
        # accepts Python literals (lists, numbers, ...). If the data ever
        # contains non-literal expressions they will now raise ValueError.
        seq_a = np.array(literal_eval(raw_a))
        seq_b = np.array(literal_eval(raw_b))
        results.append({'id': row_id, 'DTW distance': dtw_distance(seq_a, seq_b)})
    total_time = time() - start_time

    # Explicit columns so that an empty input still produces a header row.
    results_df = pd.DataFrame(results, columns=['id', 'DTW distance'])
    results_df.to_csv('dtw.csv', index=False)
    print(f"Total DTW Time: {total_time}")

# Execute tasks
# Runs the parts of the assignment in order when the file is executed as a
# script (nothing runs on import).
if __name__ == "__main__":
    # NOTE(review): text_classification is not defined in the visible part of
    # this file. Confirm it exists elsewhere (presumably Part 1); otherwise
    # this line raises NameError before Parts 2 and 3 get to run.
    text_classification()
    # Part 2: brute-force vs. LSH nearest-neighbour search.
    nearest_neighbor_search()
    # Part 3: DTW distances for the pairs in time_series.csv.
    dynamic_time_warping()
