In [72]:
import pandas as pd
import time
import random
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import math

In [73]:
df = pd.read_excel("Polarity.xlsx")

# Data Processing

## Stop Words Removal

In [74]:
nltk.download('stopwords')

stop_words = stopwords.words('english')
stop_words.append('nan')
stop_words.append('removed')
stop_words.remove('not')
stop_words.remove('no')
stop_words[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ragne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [75]:
df['Data'] = df['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

## Tokenise Words & Put into Matrix



In [76]:
# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Preparing the data to fit_tranformation
listofData  = list(df["Data"].array)

# Create the Matrix
matrix = vectorizer.fit_transform(listofData)
matrix = matrix.toarray()

# Add into new column in df
df["Vector"] = [row.tolist() for row in matrix]

## LSH **Calculations**


In [77]:
# k = 20 #TODO: find out optimal value
# vector_dimensions = len(df['Vector'][0])
# non_zero_values = len(df['Vector'][0]) #TODO: find out optimal value

def generate_hash_function(k,vector_dimensions,non_zero_values):
    hash_functions_list = []

    for i in range(k):

        hash_functions = np.zeros((1,vector_dimensions))[0]
        selected_indexes = random.sample(range(len(hash_functions)),non_zero_values)

        for index in selected_indexes:
            random_vector = np.random.randn(1)
            hash_functions[index] = random_vector

        hash_functions_list.append(hash_functions)
    
    return hash_functions_list

# random_hash_functions = generate_hash_function(k,vector_dimensions,non_zero_values)

In [78]:
# Perform dot product and get binary vectors
def generate_binary_vectors(input_vector,random_hash_functions):
  binary_vector = []
  for i in range(len(random_hash_functions)):
    dot_product = np.dot(input_vector,random_hash_functions[i])
    # result = 0 if dot_product <= 0 else 1
    result = np.where(dot_product <= 0, 0, 1)
    binary_vector.append(result)
  return binary_vector

# Make a new column called Hashed for all the binary vectors
# df['Hashed'] = df['Vector'].apply( lambda x : one_hashing_function(x,random_hash_functions))


In [79]:
#Calculate K Value Time Comparison
# k_time_dict = {}
# vector_dimensions = len(df['Vector'][0])
# non_zero_values = len(df['Vector'][0]) #TODO: find out optimal value

# for k in range(10,100,10):
#     start = time.time()

#     random_hash_functions = generate_hash_function(k,vector_dimensions,non_zero_values)
#     binary_vectors_list = df['Vector'].apply( lambda x : one_hashing_function(x,random_hash_functions))

#     end = time.time()
#     results_time = end - start
#     k_time_dict[k] = results_time
#     k_df = pd.DataFrame(list(k_time_dict.items()), columns=['K Value', 'Time'])
#     k_df.to_csv(f'LSH Search Results/K_Value_Time.csv', index=False)


In [80]:
# Perform linear search on binary vector using hamming distance Dh
def hamming_distance(query_vector,database_sample):
  mismatched_bits = 0
  for i in range(len(query_vector)):
    if query_vector[i] != database_sample[i]:
      mismatched_bits += 1
  return mismatched_bits

# query_vector = df.loc[0]['Hashed']
# # print(query_vector)

# mismatched_list = []
# for i, row in df.iterrows():
#   mismatched_list.append(hamming_distance(query_vector,row['Hashed']))

# df['Distance'] = mismatched_list

In [81]:
# Calculate 10 different queries accuracies
import os

query_dict = {'Q1': 'Looking for newest Fragrant Dior perfume ', 
              'Q2': 'Rolex submariner on sale', 
              'Q3': 'Hottest Gucci trends', 
              'Q4': 'Louis Vuitton black bag', 
              'Q5': 'Polo ralph lauren for fathers day', 
              'Q6': 'Tiffany And Co Bracelet',
              'Q7': 'Balenciaga latest products',
              'Q8': 'Chanel pink bag',
              'Q9': 'Prada leather handbags',
              'Q10': 'London gucci shoes'}

query_df = pd.DataFrame.from_dict(query_dict, orient='index', columns=['Query Text'])

# Combine the data from the DataFrame and query_dict into a single list
all_data = list(df["Data"].array) + list(query_dict.values())

matrix = vectorizer.fit_transform(all_data)
matrix = matrix.toarray()

# Separate the vectors for the DataFrame and query_dict
df_vectors = matrix[:len(df)]
query_dict_vectors = matrix[len(df):]

df["Vector"] = [row.tolist() for row in df_vectors]

# Variables for LSH
k = 20
vector_dimensions = len(df['Vector'][0])
non_zero_values = len(df['Vector'][0])
random_hash_functions = generate_hash_function(k,vector_dimensions,non_zero_values)

# # Do LSH Calculations for database

df['Binary Vectors'] = df['Vector'].apply( lambda x : generate_binary_vectors(x,random_hash_functions))

# Do LSH Calculations for query vectors
query_dict_binary_vectors = [generate_binary_vectors(vector, random_hash_functions) for vector in query_dict_vectors]

query_df['Vectors'] = [row.tolist() for row in query_dict_vectors]
query_df['Binary Vectors'] = query_dict_binary_vectors

  hash_functions[index] = random_vector


In [83]:
# Specify the folder path
folder_path = 'LSH Search Results'

# Define the Excel file path
excel_file_path = os.path.join(folder_path, 'LSH_Search_Results.xlsx')

with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:     
    
    for query_name,query_row in query_df.iterrows():
        # print('i',query_name)
        # print('row',query_row['Binary Vectors'])

        mismatched_list = []

        for i, row in df.iterrows():
            mismatched_list.append(hamming_distance(query_row['Binary Vectors'],row['Binary Vectors'])) 
        
        df['Distance'] = mismatched_list

        # Get only data and distance from query_name
        query_result_df = df[['Data', 'Distance']].copy()

        # Get the top 50 results based on the 'Distance' column
        query_result_top = query_result_df.nsmallest(50, 'Distance') 
    
        # Loop through each unique query and create separate sheets
        query_result_top.to_excel(writer, sheet_name=f'Results_{query_name}', index=False)