In [None]:
import mmh3 #library for Minhashing
import numpy as np
import pandas as pd
#import nltk
#from nltk.tokenize import word_tokenize
import mmh3   # Fast, non-cryptographic hash
#import math
#nltk.download('punkt')




In [None]:
# Load the reviews CSV into a DataFrame.
# NOTE(review): the file name suggests the text was already cleaned for MinHashing;
# this is an absolute, machine-specific path and must exist for the notebook to run.
reviews_csv_path = "/Users/sararedaelli/Desktop/AMAZON REVIEWS/Books_rating_cleaned_minhashing.csv"
df = pd.read_csv(filepath_or_buffer=reviews_csv_path)

In [None]:
# -------- MinHash configuration --------
q = 1                       # shingle width in words (1 means single-word shingles)
num_hashes = 100            # how many hash functions are drawn, i.e. the signature length
max_hash = (1 << 32) - 1    # modulus of the hash family: 2**32 - 1
np.random.seed(42)          # fix NumPy's global seed so the coefficients below are repeatable

# -------- Coefficients of the hash family --------
# The i-th hash function is h_i(x) = (a_i * x + b_i) mod max_hash.
# The multipliers are drawn before the offsets; that draw order determines the values.
a = np.random.randint(low=1, high=max_hash, size=num_hashes, dtype=np.uint64)  # multipliers in [1, max_hash)
b = np.random.randint(low=0, high=max_hash, size=num_hashes, dtype=np.uint64)  # offsets in [0, max_hash)

def get_word_shingles(text, q=1):
    """Build the set of q-word shingles of a text.

    The text is tokenised with a plain whitespace ``split()``; no further
    cleaning is done here.

    Parameters
    ----------
    text : str or any
        Text to shingle. A non-string value is replaced by ``""`` when
        ``pd.isna`` reports it as missing, and by ``str(text)`` otherwise.
    q : int, default 1
        Number of consecutive words per shingle.

    Returns
    -------
    set of str
        Every window of ``q`` consecutive words, joined with single spaces.
        When the text has fewer than ``q`` words the set holds one shingle:
        all the words joined together (the empty string for an empty text).
    """
    # Normalise the input so that the split below always works on a string.
    if isinstance(text, str):
        normalised = text
    elif pd.isna(text):
        normalised = ""
    else:
        normalised = str(text)

    tokens = normalised.split()

    # Not enough words for a full window: fall back to one shingle with everything.
    if len(tokens) < q:
        return {" ".join(tokens)}

    shingle_set = set()
    last_start = len(tokens) - q
    for start in range(last_start + 1):
        window = tokens[start:start + q]
        shingle_set.add(" ".join(window))
    return shingle_set

# -------- Hashing utilities --------
def base_hash(shingle):
    """Turn a shingle into a number with MurmurHash3.

    ``mmh3.hash`` is called with ``signed=False`` and its result is wrapped
    in ``np.uint32``.
    """
    unsigned_value = mmh3.hash(shingle, signed=False)
    return np.uint32(unsigned_value)

# Creating the signature
def minhash_signature(shingles):
    """Compute the MinHash signature of a collection of shingles.

    Each shingle is first mapped to an integer with ``base_hash``. Every
    function of the family ``h_i(x) = (a_i * x + b_i) mod max_hash`` is then
    applied to all of those integers, and the minimum per function is kept.
    The coefficients come from the module-level arrays ``a`` and ``b``; only
    their first ``num_hashes`` entries are used.

    Parameters
    ----------
    shingles : iterable
        Shingles to hash; each element is passed to ``base_hash``.

    Returns
    -------
    numpy.ndarray
        1-D ``uint32`` array with one minimum per hash function.
        For an empty ``shingles`` there is nothing to take a minimum over, so
        an array of ``num_hashes`` entries all equal to the largest ``uint32``
        value is returned instead of failing inside NumPy's reduction. With
        ``max_hash = 2**32 - 1`` that value cannot be produced by
        ``... % max_hash``, so it never collides with a real minimum.
    """
    # One base hash per shingle, widened to uint64 so the products below fit.
    hash_values = np.array([base_hash(s) for s in shingles], dtype=np.uint64)

    if hash_values.size == 0:
        return np.full(num_hashes, np.iinfo(np.uint32).max, dtype=np.uint32)

    # Column vectors of coefficients broadcast against the row of base hashes,
    # giving a (hash functions x shingles) table in a single vectorised step.
    multipliers = a[:num_hashes, np.newaxis]
    offsets = b[:num_hashes, np.newaxis]
    combined = (multipliers * hash_values + offsets) % max_hash

    # Row-wise minimum = the MinHash value of each hash function.
    # Values are below max_hash, so the uint32 cast is lossless when max_hash <= 2**32.
    return combined.min(axis=1).astype(np.uint32)

In [None]:
# Compute one MinHash signature per review and keep each review's rating next to it.
# Only two columns are needed, so they are read directly instead of going through
# DataFrame.iterrows(), which builds a full Series for every row.
ratings = df['review/score'].tolist()

# One signature (NumPy array) per review, in the same row order as `ratings`.
signatures = [
    minhash_signature(get_word_shingles(review_text, q=q))
    for review_text in df['cleanText']
]

# Combine signatures and ratings into a DataFrame: one row per review, with the
# signature stored as a plain Python list in a single column and the rating beside it.
result_df = pd.DataFrame({
    'signature': [sig_vector.tolist() for sig_vector in signatures],
    'rating': ratings
})
#result_df.to_csv("reviews_minhash_single_column.csv", index=False)