Original dataset: Opinosis Opinion ⁄ Review Data Set (UCI Machine Learning Repository), available at https://archive.ics.uci.edu/ml/datasets/Opinosis+Opinion+%26frasl%3B+Review

In [69]:
# encoding: utf-8

import hashlib
import itertools
import os
import string

import numpy as np
import pandas as pd

In [70]:
# base names of the 8 review files used in this notebook
file_list = [
    'battery-life_amazon_kindle',
    'battery-life_ipod_nano_8gb',
    'location_bestwestern_hotel_sfo',
    'location_holiday_inn_london',
    'price_amazon_kindle',
    'room_holiday_inn_london',
    'rooms_bestwestern_hotel_sfo',
    'screen_ipod_nano_8gb',
]

In [71]:
# I/O function
def import_dataset(file_name, data_dir='./dataset/'):
    """
    Read one review file and return its lines.

    input: file_name in file_list (base name; '.txt.data' is appended here)
            data_dir, directory that holds the '.txt.data' files.
    output: list of str, one entry per line of the file. Each line has its
            leading/trailing ASCII whitespace removed and is decoded as UTF-8.
    raises: OSError if the file cannot be opened,
            UnicodeDecodeError if a line is not valid UTF-8.
    """

    dataFile = os.path.join(data_dir, str(file_name) + '.txt.data')
    # The file is read as bytes and each line is stripped before decoding, so
    # only ASCII whitespace is removed and lines are split on '\n' only.
    with open(dataFile, 'rb') as f:
        contents = [line.strip().decode('utf-8') for line in f]
    return contents

In [72]:
# load every file named in file_list
print('Importing dataset')

# 8-entry list: one list of review lines per file, in file_list order
dataset = [import_dataset(file_name) for file_name in file_list]

print('document number:' + str(len(dataset)))

Importing dataset
document number:8


1. A class Shingling that constructs k–shingles of a given length k (e.g., 10) from a given document, computes a hash value for each unique shingle, and represents the document in the form of an ordered set of its hashed k-shingles.

In [73]:
# function constructs k-shingles from a given text
def shingling(text, k=5):
    """
    Build the sorted, de-duplicated character k-shingles of one document.

    Each review has its ASCII punctuation removed and is lower-cased. The
    cleaned reviews are then concatenated without a separator, so a shingle
    may span the boundary between two consecutive reviews.

    input: text in dataset (iterable of review strings)
            k, shingle size in characters (must be >= 1).
    output: list of unique shingles in lexicographic order. Every shingle is
            a string made of its k characters joined by single spaces
            (the characters 'abc' become 'a b c'). The list is empty when
            fewer than k characters remain after cleaning.
    raises: ValueError if k < 1.
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got ' + repr(k))

    # translation table built once instead of one punctuation set per review
    drop_punctuation = str.maketrans('', '', string.punctuation)
    characters = ''.join(
        review.translate(drop_punctuation).lower() for review in text
    )

    # A set removes duplicates. Sorting the raw k-character substrings gives
    # the same order as sorting the space-joined strings, because the spaces
    # sit at the same positions in every shingle.
    unique_shingles = sorted(
        {characters[i:i + k] for i in range(len(characters) - k + 1)}
    )

    # ' '.join over a string puts one space between consecutive characters
    return [' '.join(shingle) for shingle in unique_shingles]

In [74]:
# run the shingling step on each of the 8 documents
print('Shingling')

shingle_texts = [shingling(text) for text in dataset]

# report how many unique shingles each document produced
for i, doc_shingles in enumerate(shingle_texts):
    num = len(doc_shingles)
    print(f'number of shingles in document {i+1} is {num}')

Shingling
number of shingles in document 1 is 4838
number of shingles in document 2 is 3228
number of shingles in document 3 is 8507
number of shingles in document 4 is 9910
number of shingles in document 5 is 5360
number of shingles in document 6 is 17206
number of shingles in document 7 is 9323
number of shingles in document 8 is 3247


In [75]:
# flatten the per-document shingle lists into one 1-D array
# The documents have different numbers of shingles, and np.array() on such a
# ragged list of lists raises ValueError on NumPy >= 1.24, so the lists are
# chained together before the array is built.
flat_shingle_texts = np.array(list(itertools.chain.from_iterable(shingle_texts)))
print('number of shingles:',flat_shingle_texts.shape[0])

# get unique shingles (np.unique also returns them sorted)
unique_flat_shingle_texts = np.unique(flat_shingle_texts)
max_value = unique_flat_shingle_texts.shape[0]
print('number of unique shingles:', max_value)

number of shingles: 61619
number of unique shingles: 30623


In [76]:
# hash unique shingles
# build dictionary with {unique shingle: hash value}
def _stable_shingle_hash(shingle):
    """
    Return a non-negative 63-bit integer hash of a shingle string.

    The built-in hash() is salted per Python process for str, so its values
    change on every kernel restart. A hashlib digest gives the same integer
    in every run, which keeps the hashed documents reproducible.
    """
    digest = hashlib.blake2b(str(shingle).encode('utf-8'), digest_size=8).digest()
    # drop one bit so the value also fits in a signed 64-bit integer
    return int.from_bytes(digest, 'big') >> 1


shingle_dict = {
    shingle: _stable_shingle_hash(shingle)
    for shingle in unique_flat_shingle_texts
}

In [77]:
# represents the document in the form of an ordered set of its hashed k-shingles
# A new nested list is built rather than aliasing shingle_texts: with the alias
# the strings in shingle_texts were overwritten by integers, and running the
# cell a second time failed with KeyError.
shingle_int = [
    [shingle_dict[shingle] for shingle in doc_shingles]
    for doc_shingles in shingle_texts
]

2. A class CompareSets that computes the Jaccard similarity of two sets of integers – two sets of hashed shingles.

In [82]:
# compute Jaccard similarity of two sets of hashed shingles
def jaccard_similarity(text1, text2):
    """
    Compute the Jaccard similarity |A & B| / |A | B| of two collections.

    input: text1, text2 are entries in shingle_int. eg: text1 = shingle_int[0]
           (any iterables of hashable items work; duplicates are ignored)
    output: float in [0, 1]. Two empty inputs are treated as identical and
            give 1.0 instead of raising ZeroDivisionError.
    """
    set1 = set(text1)
    set2 = set(text2)
    union = set1 | set2
    if not union:
        # both sets are empty: 0/0 is defined here as full similarity
        return 1.0
    return len(set1 & set2) / len(union)

3. A class MinHashing that builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).

4. A class CompareSignatures that estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.

5. (Optional task for extra 2 bonus) A class LSH that implements the LSH technique: given a collection of minhash signatures (integer vectors) and a similarity threshold t, the LSH class (using banding and hashing) finds all candidate pairs of signatures that agree on at least fraction t of their components.