In [1]:
! pip install swifter numpy pandas datasketch scikit-learn matplotlib

You should consider upgrading via the '/home/vangelis/.virtualenvs/big-data-analytics/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import time
import swifter
import pandas as pd
import numpy as np

from collections import defaultdict
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import jaccard_distance
from datasketch import MinHashLSH, MinHash

THRESHOLD = 0.8

# Read data
train_df = pd.read_csv('../assist_material/datasets/extracted/datasets2020/datasets/q2a/corpusTrain.csv', sep=',')
train_df.columns = ['id', 'content']

test_df = pd.read_csv('../assist_material/datasets/extracted/datasets2020/datasets/q2a/corpusTest.csv', sep=',')
test_df.columns = ['id', 'content']

# Preprocess
def preprocess(df):
    """
    Cleans up a given dataframe
    :param df: The dataframe to clean
    :return: The cleaned dataframe
    """

    _df = df.copy(deep=True)
    _df['content'] = _df['content'].str.lower()
    _df['content'] = _df['content'].swifter.apply(lambda row: row.encode('ascii',errors='ignore').decode())
    _df = _df[_df['content'].str.split().str.len().gt(2)]
    _df['content'] = _df['content'].str.replace(r'[\n\'\"?\.,:\(\)]', '', regex=True)
    _df.reset_index(drop=True, inplace=True)
    return _df

train_df = preprocess(train_df)
test_df = preprocess(test_df)

# Vectorize texts
vectorizer = TfidfVectorizer()

train_tfidf = vectorizer.fit_transform(train_df['content'])


# Exact Cosine
# cosine_query_start_time = time.time()
# cosine_duplicates = 0


# for query in test_df.itertuples(index=False):
#     query_tfidf = vectorizer.transform([query.content])

#     similarities = cosine_similarity(query_tfidf, train_tfidf).flatten()
#     if np.any(similarities > THRESHOLD):
#         cosine_duplicates += 1

#     indexes = np.where(similarities > THRESHOLD)
#     print(f"Query: {query}")
#     for index in indexes:
#         print(train_df['content'][index])

# cosine_query_end_time = time.time()
# print(f'Total duplicates found: {cosine_duplicates}')
# print(f'Cosine Similarity query time {(cosine_query_end_time - cosine_query_start_time):.2f} seconds')


# Exact Jaccard
def jaccard_similarity(query, document):
    """
    This method estimates the Jaccard similarity between two documents

    :param query: The first document
    :param document: The second document
    :return: The similarity between the two documents
    """
    query_list = query.split()
    document_list = document.split()
    s1, s2 = set(list1), set(list2)
    return len(s1 & s2) / len(s1 | s2)

jaccard_query_start_time = time.time()
jaccard_duplicates = 0

for query in test_df.itertuples(index=False):
    similarities = train_df['content'].swifter.progress_bar(False).apply(
        lambda row: jaccard_similarity(row, query.content))
    if np.any(similarities > THRESHOLD):
        jaccard_duplicates += 1

    indexes = np.where(similarities > THRESHOLD)
    print(f'Query: {query}')
    for index in indexes:
        print(train_df['content'][index])


jaccard_query_end_time = time.time()
print(f'Total duplicates found: {jaccard_duplicates}')
print(f'Jaccard Similarity query time {(jaccard_query_end_time - jaccard_query_start_time):.2f} seconds')


# Random Projection LSH with Cosine Similarity

def random_vectors_generator(dimension, n_vectors):
    """
    Generates a collection of random vectors from the standard Gaussian distribution.
    :param dimension: The dimension of the vector
    :param n_vectors: The number of vectors
    :return: Array with random vectors
    """
    return np.random.randn(dimension, n_vectors)


def train_lsh(X_tfidf, n_vectors):
    """
    Method that creates a LSH model given the TFIDF vector to train
    :param X_tfidf: The TFIDF vector to train the model
    :param n_vectors: The number of random vectors to generate
    :return: The model
    """
    np.random.seed(0)
    dim = X_tfidf.shape[1]
    random_vectors = random_vectors_generator(dim, n_vectors)

    # Partition data points into bins and encode bin index bits into integers
    bin_indices_bits = X_tfidf.dot(random_vectors) >= 0
    # Compute the dot product between the document vector and the vector consisting of powers of 2
    # x << y is the same as multiplying x by 2 ** y
    powers_of_two = 1 << np.arange(n_vectors - 1, -1, step=-1)
    # Final integer representation of individual bins
    bin_indices = bin_indices_bits.dot(powers_of_two)

    # Update `table` so that `table[i]` is the list of document ids with bin index equal to i
    table = defaultdict(list)
    for idx, bin_index in enumerate(bin_indices):
        table[bin_index].append(idx)

    model = {'table': table,
             'random_vectors': random_vectors,
             'bin_indices': bin_indices,
             'bin_indices_bits': bin_indices_bits}
    return model


def search_nearby_bins(query_bin_bits, table, search_radius, candidate_set):
    """
    For a given query vector and trained LSH model's table
    return all candidate neighbors with the specified search radius.

    Example
    -------
    model = train_lsh(X_tfidf, n_vectors=16)
    query = model['bin_index_bits'][0]  # vector for the first document
    candidates = search_nearby_bins(query, model['table'])
    :param query_bin_bits: The binary representation of the query document
    :param table: The trained hash table
    :param search_radius: The nearby bins to search
    :param candidate_set: The set that holds the candidate neighbours
    :return: The set that holds the candidate neighbours
    """
    if candidate_set is None:
        candidate_set = set()

    n_vectors = query_bin_bits.shape[0]
    powers_of_two = 1 << np.arange(n_vectors - 1, -1, step=-1)

    for different_bits in combinations(range(n_vectors), search_radius):
        # Flip the bits (n_1, n_2, ..., n_r) of the query bin to produce a new bit vector
        index = list(different_bits)
        alternate_bits = query_bin_bits.copy()
        alternate_bits[index] = np.logical_not(alternate_bits[index])

        # Convert the new bit vector to an integer index
        nearby_bin = alternate_bits.dot(powers_of_two)

        # fetch the list of documents belonging to
        # the bin indexed by the new bit vector,
        # then add those documents to candidate_set;
        # make sure that the bin exists in the table
        if nearby_bin in table:
            candidate_set.update(table[nearby_bin])

    return candidate_set


def get_nearest_neighbors(X_tfidf, query_vector, model, max_search_radius=3):
    """
    Business method that returns the approximate nearest neighbors of a given document. Here it is calculated the bit
    index of the document to search and using the methods above we extract the neighbors along with the cosine
    similarity with descending order.
    :param X_tfidf: The train TFIDF
    :param query_vector: The document to search TFIDF
    :param model: The LSH model
    :param max_search_radius: The nearby bins to search
    :return: List with nearest neighbours with cosine similarities
    """
    table = model['table']
    random_vectors = model['random_vectors']

    # Compute bin index for the query vector, in bit representation.
    bin_index_bits = np.ravel(query_vector.dot(random_vectors) >= 0)

    # Search nearby bins and collect candidates
    candidate_set = set()
    for search_radius in range(max_search_radius + 1):
        candidate_set = search_nearby_bins(bin_index_bits, table, search_radius, candidate_set)

    # Sort candidates by their true distances from the query
    candidate_list = list(candidate_set)
    candidates = X_tfidf[candidate_list]
    similarities = cosine_similarity(candidates, query_vector).flatten()

    similarities_col = 'similarities'
    nearest_neighbors = pd.DataFrame({
        'id': candidate_list, similarities_col: similarities
    }).sort_values(similarities_col, ascending=False).reset_index(drop=True)
    return nearest_neighbors


# lsh_cosine_build_start_time = time.time()
# lsh_model = train_lsh(train_tfidf, 16)
# lsh_cosine_build_end_time = time.time()
#
# lsh_query_times = []
# lsh_duplicates = []
# for k in range (1, 10 + 1):
#     lsh_cosine_query_start_time = time.time()
#     duplicates = 0
#     for query in test_df.itertuples(index=False):
#         query_tfidf = vectorizer.transform([query.content])
#         nearest_neighbors = get_nearest_neighbors(train_tfidf, query_tfidf, lsh_model, max_search_radius=k)
#         # If the max value is above 0.8 consider it duplicate
#         if nearest_neighbors['similarities'][0] > THRESHOLD:
#             duplicates += 1
#
#         indexes = nearest_neighbors[nearest_neighbors['similarities'] > THRESHOLD]
#         print(f'Query: {query}')
#         for index in indexes.itertuples(index=False):
#             print(train_df['content'][index.id])
#
#     lsh_cosine_query_end_time = time.time()
#     lsh_query_times.append(lsh_cosine_query_end_time - lsh_cosine_query_start_time)
#     print(f'Total duplicates found: {duplicates} for k = {k}')
#     print(f'LSH projection Cosine Similarity query time {(lsh_cosine_query_end_time - lsh_cosine_query_start_time):.2f} '
#           f'seconds')

# Min-Hash LSH Jaccard with Similarity

def train_min_lsh(X, n_permutations):
    """
    Method that creates a Min-LSH model given the training dataframe
    :param X: The train dataframe
    :param n_permutations: The number of permutations to apply
    :return: The model
    """
    model = MinHashLSH(threshold=THRESHOLD, num_perm=n_permutations)
    for entry in X.itertuples(index=False):
        min_hash = MinHash(num_perm=n_permutations)
        set_text = set(entry.content.split())
        for d in set_text:
            min_hash.update(d.encode('utf8'))

        model.insert(entry.id, min_hash)

    return model


# lsh_model = train_min_lsh(train_df, 16)

# for query in test_df.itertuples(index=False):
#     min_hash = MinHash(num_perm=16)
#     set_text = set(query.content.split())
#     for d in set_text:
#         min_hash.update(d.encode('utf8'))
#     result = lsh_model.query(min_hash)
#     print('Query', query)
#     for index in result:
#         print(train_df.loc[train_df['id'] == index])

Pandas Apply:   0%|          | 0/531990 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/5374 [00:00<?, ?it/s]

Query: Pandas(id=0, content='how many people are going towards using phones to search for local businesses')
Series([], Name: content, dtype: object)
Query: Pandas(id=1, content='can an android app use sms only to communicate questions to an intelligence engine without wifi and get back info')
Series([], Name: content, dtype: object)
Query: Pandas(id=2, content='what small detail from an indian movie do you love')
Series([], Name: content, dtype: object)
Query: Pandas(id=3, content='why can not hindu women be the soldier of hinduism why cant she give birth to a hindu kid even if she marries a non-hindu')
Series([], Name: content, dtype: object)
Query: Pandas(id=4, content='how would you write out twelve lakh twelve thousand twelve hundred and twelve numerically')
Series([], Name: content, dtype: object)
Query: Pandas(id=5, content='what are the rto formalities to transfer a car from pune mh 12 passing to bangalore')
Series([], Name: content, dtype: object)
Query: Pandas(id=6, content='

KeyboardInterrupt: 