Original dataset: Opinosis Opinion/Review Data Set — https://archive.ics.uci.edu/ml/datasets/Opinosis+Opinion+%26frasl%3B+Review

In [48]:
# encoding: utf-8

import numpy as np
import pandas as pd
import string
import itertools
import hashlib
import random

In [49]:
# The 8 dataset files used in this notebook (base names only)
file_list = [
    'battery-life_amazon_kindle',
    'battery-life_ipod_nano_8gb',
    'location_bestwestern_hotel_sfo',
    'location_holiday_inn_london',
    'price_amazon_kindle',
    'room_holiday_inn_london',
    'rooms_bestwestern_hotel_sfo',
    'screen_ipod_nano_8gb',
]

In [50]:
# I/O helper
def import_dataset(file_name):
    """
    Read one dataset file and return its lines.

    input: file_name, a base name from file_list; the file read is
            './dataset/<file_name>.txt.data'.
    output: list of str, one per line of the file, with surrounding
            whitespace stripped (on the raw bytes) and then decoded as UTF-8.
    """
    path = ''.join(['./dataset/', str(file_name), '.txt.data'])
    with open(path, 'rb') as handle:
        # iterating a binary file yields the same lines as readlines()
        return [raw_line.strip().decode('utf-8') for raw_line in handle]

In [51]:
# Load every file named in file_list
print('Importing dataset')

# one entry per file, in file_list order; each entry is that file's list of lines
dataset = [import_dataset(name) for name in file_list]

print('document number:' + str(len(dataset)))

Importing dataset
document number:8


1. A class Shingling that constructs k-shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle, and represents the document in the form of an ordered set of its hashed k-shingles.

In [52]:
# function constructs k-shingles from a given text
def shingling(text, k=5):
    """
    Build the sorted, de-duplicated character k-shingles of a document.

    Every review is stripped of ASCII punctuation and lower-cased; the
    remaining characters of all reviews are concatenated (no separator is
    inserted between reviews) and every window of k consecutive characters
    becomes one shingle.

    input: text, an iterable of review strings (an entry of dataset).
            k, shingle size in characters; must be >= 1.
    output: list of the unique shingles in ascending order. Each shingle is
            its k characters joined by single spaces, e.g. 'a b c d e'.
            Empty when fewer than k characters remain.
    raises: ValueError if k < 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))

    # build the punctuation-deletion table once instead of once per review
    strip_punctuation = str.maketrans('', '', string.punctuation)

    characters = []
    for review in text:
        characters.extend(review.translate(strip_punctuation).lower())

    # A set removes duplicates. Every shingle has the same layout (character,
    # space, character, ...), so sorting the joined strings gives the same
    # order as sorting the underlying character lists.
    unique_shingles = {
        ' '.join(characters[start:start + k])
        for start in range(len(characters) - k + 1)
    }
    return sorted(unique_shingles)

In [53]:
# Shingle each of the 8 texts and report how many unique shingles each one has
print('Shingling')

shingle_texts = [shingling(document) for document in dataset]

for i, document_shingles in enumerate(shingle_texts):
    num = len(document_shingles)
    print(f'number of shingles in document {i+1} is {num}')

Shingling
number of shingles in document 1 is 4838
number of shingles in document 2 is 3228
number of shingles in document 3 is 8507
number of shingles in document 4 is 9910
number of shingles in document 5 is 5360
number of shingles in document 6 is 17206
number of shingles in document 7 is 9323
number of shingles in document 8 is 3247


In [54]:
# Flatten the per-document shingle lists into one 1-D array.
# The documents have different numbers of shingles, and wrapping such a ragged
# list of lists in np.array(...) is rejected by recent NumPy releases, so the
# lists are chained in plain Python first and converted once.
flat_shingle_texts = np.array(list(itertools.chain.from_iterable(shingle_texts)))
print('number of shingles:',flat_shingle_texts.shape[0])

# get unique shingles (np.unique also returns them sorted)
unique_flat_shingle_texts = np.unique(flat_shingle_texts)
max_value = unique_flat_shingle_texts.shape[0] # number of distinct shingles over all documents
print('number of unique shingles:', max_value)

number of shingles: 61619
number of unique shingles: 30623


In [55]:
# hash unique shingles
# build dictionary with {unique shingle: hash value}
# The built-in hash() is salted per interpreter process for strings, so its
# values change after every kernel restart. A hashlib digest gives the same
# integer for the same shingle on every run.
def _stable_shingle_hash(shingle):
    """Return a deterministic unsigned 64-bit integer hash of a shingle string."""
    digest = hashlib.blake2b(shingle.encode('utf-8'), digest_size=8).digest()
    return int.from_bytes(digest, 'big')

shingle_dict = {
    shingle: _stable_shingle_hash(shingle)
    for shingle in unique_flat_shingle_texts
}

In [56]:
# represents the document in the form of an ordered set of its hashed k-shingles
# New lists are built on purpose: binding shingle_int to shingle_texts and
# overwriting the entries in place would replace the shingle strings with
# integers, so shingle_texts would be corrupted and re-running this cell would
# fail with KeyError.
# Each document becomes the ascending list of its distinct hashed shingles.
shingle_int = [
    sorted({shingle_dict[shingle] for shingle in document_shingles})
    for document_shingles in shingle_texts
]

2. A class CompareSets that computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.

In [57]:
# compute Jaccard similarity of two sets of hashed shingles
def jaccard_similarity(text1, text2):
    """
    Jaccard similarity |A & B| / |A | B| of two collections of hashable items.

    input: text1, text2 are entries in shingle_int. eg: text1 = shingle_int[0]
            (duplicates are ignored; both are converted to sets).
    output: float in [0, 1]. Two empty collections are treated as identical
            and give 1.0, which avoids dividing by zero.
    """
    set1 = set(text1)
    set2 = set(text2)
    union_size = len(set1 | set2)
    if union_size == 0:
        # both inputs are empty: the ratio is undefined, by convention 1.0
        return 1.0
    return len(set1 & set2) / union_size

In [58]:
# Spot check: Jaccard similarity of one pair of documents
i, j = 0, 1 # indices of doc1 and doc2
namei, namej = file_list[i], file_list[j] # doc1 and doc2 names
jaccard = jaccard_similarity(shingle_int[i], shingle_int[j])

print(f"Jaccard similarity for document '{namei}' and '{namej}' is {jaccard}.")

Jaccard similarity for document 'battery-life_amazon_kindle' and 'battery-life_ipod_nano_8gb' is 0.17323636363636363.


In [87]:
# Pairwise Jaccard similarities: entry [i, j] compares document i with document j
doc_count = len(file_list)
jaccard_similarity_matrix = np.zeros((doc_count, doc_count))
for i, j in itertools.product(range(doc_count), repeat=2):
    jaccard_similarity_matrix[i, j] = jaccard_similarity(shingle_int[i], shingle_int[j])

In [88]:
# Show the pairwise Jaccard similarity matrix as this cell's output.
jaccard_similarity_matrix

array([[1.        , 0.17323636, 0.14667469, 0.15643378, 0.19948247,
        0.14878316, 0.15092653, 0.14146548],
       [0.17323636, 1.        , 0.12253683, 0.12492508, 0.14858901,
        0.10843504, 0.12182696, 0.22493379],
       [0.14667469, 0.12253683, 1.        , 0.29051923, 0.16627418,
        0.23146552, 0.29381032, 0.11412322],
       [0.15643378, 0.12492508, 0.29051923, 1.        , 0.1744347 ,
        0.28572783, 0.26566202, 0.12165388],
       [0.19948247, 0.14858901, 0.16627418, 0.1744347 , 1.        ,
        0.16313592, 0.17238901, 0.14378738],
       [0.14878316, 0.10843504, 0.23146552, 0.28572783, 0.16313592,
        1.        , 0.28581815, 0.09991933],
       [0.15092653, 0.12182696, 0.29381032, 0.26566202, 0.17238901,
        0.28581815, 1.        , 0.11515259],
       [0.14146548, 0.22493379, 0.11412322, 0.12165388, 0.14378738,
        0.09991933, 0.11515259, 1.        ]])

3. A class MinHashing that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

In [59]:
# primality test by trial division
def isPrime(c):
    """
    input: c, an arbitrary number to test prime.
    output: True if c is prime, False otherwise (anything below 2 is not prime).
    """
    if c in (2, 3):
        return True
    if c < 2 or c % 2 == 0:
        return False
    # only odd divisors up to the square root need checking
    limit = int(c**0.5) + 1
    return all(c % divisor != 0 for divisor in range(3, limit, 2))

In [92]:
# generate n random hash functions in the form ax+b mod c
# where c is a prime number
def random_hash(n, max_value=100000):
    """
    Draw n distinct parameter triples [a, b, c] for hash functions
    h(x) = (a*x + b) mod c.

    input: argument n, number of hash functions.
            max_value, bound for the random parameters: a and b are drawn
            uniformly from [1, max_value], c uniformly from the primes
            below max_value.
    output: list of n lists [a, b, c] of ints; no triple occurs twice.

    Needs at least one prime below max_value whenever n > 0; otherwise
    random.choice raises IndexError on the empty candidate list.
    """
    # The candidate moduli depend only on max_value, so they are computed once
    # here rather than again for every draw.
    primes = [candidate for candidate in range(1, max_value) if isPrime(candidate)]

    rand_list = []
    seen = set() # triples already handed out, for constant-time duplicate checks
    while len(rand_list) < n:
        a = random.randint(1, max_value)
        b = random.randint(1, max_value)
        c = random.choice(primes)
        if (a, b, c) in seen:
            continue # duplicate triple: draw again
        seen.add((a, b, c))
        rand_list.append([a, b, c])

    return rand_list

In [93]:
# minhashing: build signature matrix
# SIG(i,c) is element for the ith hash function and column c
num_hash = 1000
random.seed(42) # fixed seed so the same hash functions are drawn on every run
rand_list = random_hash(num_hash) # each entry is a [a, b, c] list

# NOTE(review): the moduli c are primes below 100000, which is typically far
# smaller than the range of the hashed shingles, so distinct shingles can
# collide under a hash function. This may be why the signature estimates
# printed further down sit above the exact Jaccard values -- confirm.
signature_matrix = np.zeros((num_hash, len(file_list)))
for j, text in enumerate(shingle_int): # for each column/text
    for i, (a, b, c) in enumerate(rand_list): # for each row/hash function
        # SIG(i,c) is the smallest hi(r) = (a*r + b) mod c over the hashed
        # shingles r of the document; it stays inf for an empty document
        signature_matrix[i][j] = min(
            ((a * element + b) % c for element in text), default=np.inf)
print(f'shape of signature matrix: {signature_matrix.shape}')

shape of signature matrix: (1000, 8)


4. A class CompareSignatures that estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.

In [94]:
# fraction of positions at which two signature columns agree
def vector_similarity(vector1, vector2):
    """
    Share of the indices of vector1 at which vector1 and vector2 hold equal values.

    input: vector1, vector2 are indexable sequences; vector2 is read at every
            index of vector1.
    output: number of agreeing positions divided by len(vector1).
    """
    size = len(vector1)
    matches = sum(1 for position in range(size) if vector1[position] == vector2[position])
    return matches / size

In [95]:
# Pairwise signature agreement: each document is one column of signature_matrix,
# so the transpose holds one signature per row
trans_signature_matrix = signature_matrix.T
n_docs = len(file_list)
signature_similarity_matrix = np.zeros((n_docs, n_docs))
for i, j in itertools.product(range(n_docs), repeat=2):
    signature_similarity_matrix[i, j] = vector_similarity(
        trans_signature_matrix[i], trans_signature_matrix[j])

In [96]:
# Print the pairwise signature-similarity matrix.
print(signature_similarity_matrix)

[[1.    0.252 0.275 0.304 0.307 0.28  0.277 0.234]
 [0.252 1.    0.229 0.229 0.243 0.209 0.218 0.277]
 [0.275 0.229 1.    0.444 0.302 0.39  0.437 0.237]
 [0.304 0.229 0.444 1.    0.297 0.449 0.412 0.236]
 [0.307 0.243 0.302 0.297 1.    0.288 0.29  0.25 ]
 [0.28  0.209 0.39  0.449 0.288 1.    0.418 0.239]
 [0.277 0.218 0.437 0.412 0.29  0.418 1.    0.218]
 [0.234 0.277 0.237 0.236 0.25  0.239 0.218 1.   ]]


5. (Optional task for 2 extra bonus points) A class LSH that implements the LSH technique: given a collection of minhash signatures (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing) finds all candidate pairs of signatures that agree on at least a fraction t of their components.

In [None]:
# NOTE(review): stray call whose result is not used by any code shown here;
# presumably a leftover scratch cell -- the LSH task has no implementation yet.
random.randint(1,100)