Original dataset: Opinosis Opinion/Review Data Set — https://archive.ics.uci.edu/ml/datasets/Opinosis+Opinion+%26frasl%3B+Review

In [194]:
# encoding: utf-8

import numpy as np
import pandas as pd
import string
import itertools
import hashlib

In [195]:
# The eight topic files used in this notebook, given as bare names
# (the directory prefix and the '.txt.data' suffix are added when loading).
file_list = [
    'battery-life_amazon_kindle',
    'battery-life_ipod_nano_8gb',
    'location_bestwestern_hotel_sfo',
    'location_holiday_inn_london',
    'price_amazon_kindle',
    'room_holiday_inn_london',
    'rooms_bestwestern_hotel_sfo',
    'screen_ipod_nano_8gb',
]

In [196]:
# I/O function
def import_dataset(file_name):
    """
    Read one dataset file and return its lines as a list of strings.

    input: file_name, the bare name of a file (as listed in file_list);
           the file read is './dataset/<file_name>.txt.data'.
    output: list with one entry per line of the file, each stripped of
            leading/trailing whitespace and decoded as UTF-8.
    """

    path = ''.join(('./dataset/', str(file_name), '.txt.data'))
    # The file is opened in binary mode, so every line is a bytes object that
    # is stripped first and decoded afterwards.
    with open(path, 'rb') as handle:
        return [raw_line.strip().decode('utf-8') for raw_line in handle]

In [197]:
# load every file named in file_list
print('Importing dataset')

# one entry per file, in file_list order; each entry is that file's list of lines
dataset = [import_dataset(file_name) for file_name in file_list]

print('document number:' + str(len(dataset)))

Importing dataset
document number:8


In [198]:
# function constructs k-shingles from a given text
def shingle(text, k=5):
    """
    Build the unique word-level k-shingles of a document.

    Every review has its punctuation removed and is lower-cased, then all
    reviews are split on whitespace into one continuous word sequence (so a
    shingle may span the boundary between two consecutive reviews). Each run
    of k consecutive words is one shingle.

    input: text, an iterable of review strings (one entry of dataset).
           k, shingle size in words; must be at least 1.
    output: list of shingles, each a single string of k words joined by one
            space, without duplicates, in order of first appearance. The list
            is empty when the document has fewer than k words.
    raises: ValueError if k is smaller than 1.
    """

    if k < 1:
        raise ValueError('k must be a positive integer')

    # Translation table that deletes every ASCII punctuation character;
    # built once rather than once per review.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # split the text into separate, normalised words
    words = []
    for review in text:
        words.extend(review.translate(strip_punctuation).lower().split())

    # k-shingles as tuples: tuples are hashable, so they can serve as dict
    # keys for the order-preserving de-duplication below (lists cannot).
    windows = (tuple(words[i:i + k]) for i in range(len(words) - k + 1))

    # remove duplicates while keeping first-occurrence order
    unique_windows = dict.fromkeys(windows)

    # convert each shingle into a single space-separated string
    return [' '.join(window) for window in unique_windows]

In [215]:
# build the shingle list of every document in dataset
print('Shingling')

shingle_texts = [shingle(text) for text in dataset]

# report how many unique shingles each document produced (documents numbered from 1)
for i, document_shingles in enumerate(shingle_texts):
    num = len(document_shingles)
    print(f'number of shingles in document {i+1} is {num}')

Shingling
number of shingles in document 1 is 1771
number of shingles in document 2 is 1136
number of shingles in document 3 is 5045
number of shingles in document 4 is 6165
number of shingles in document 5 is 1941
number of shingles in document 6 is 11948
number of shingles in document 7 is 4540
number of shingles in document 8 is 1043


In [221]:
# flatten the per-document shingle lists into one 1-D array of strings
# (the lists have different lengths, so they are chained together in Python
# first; handing the ragged nested list straight to np.array is rejected by
# recent NumPy versions)
flat_shingle_texts = np.array(
    list(itertools.chain.from_iterable(shingle_texts)), dtype=str
)
print('number of shingles:',flat_shingle_texts.shape[0])

# get unique shingles (np.unique also returns them sorted)
unique_flat_shingle_texts = np.unique(flat_shingle_texts)
max_value = unique_flat_shingle_texts.shape[0]
print('number of unique shingles:', max_value)

number of shingles: 33589
number of unique shingles: 32678


In [238]:
# hash unique shingles
# build dictionary with {unique shingle: hash value}
def _stable_hash(shingle_text):
    """Return an unsigned 64-bit integer hash of a shingle string.

    The built-in hash() is salted per interpreter process for strings, so its
    values differ after every kernel restart; a SHA-256 digest of the UTF-8
    bytes gives the same value on every run.
    """
    digest = hashlib.sha256(shingle_text.encode('utf-8')).digest()
    return int.from_bytes(digest[:8], 'big')  # first 8 bytes of the digest


# The dictionary is created in this cell so the cell runs on a fresh kernel.
shingle_dict = {
    shingle_text: _stable_hash(shingle_text)
    for shingle_text in unique_flat_shingle_texts
}

In [None]:
# represent each document as an ordered set of its hashed k-shingles
