In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import os
import math
import string
import time
import sys
import glob
import random
import hashlib as hl
import tracemalloc
nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dongj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Locate the test files relative to the notebook's working directory.
path = os.getcwd()
# glob.glob() gives no ordering guarantee, and a later cell picks files from
# this list by position, so sort to make the positions deterministic.
testfilesd = sorted(glob.glob(path + "/dataset/duplicatetests/*.tsv"))
testfilesk = os.listdir('dataset/twoktests/')

In [3]:
# Show the discovered file paths; the loading cell below indexes into this list by position.
testfilesd

['C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\hundred.tsv',
 'C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\hundredk.tsv',
 'C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\onek.tsv',
 'C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\tenk.tsv',
 'C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\thirty.tsv',
 'C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\thirtyk.tsv',
 'C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\threehundred.tsv',
 'C:\\Users\\dongj\\Desktop\\JHU_Course\\Fall_2022\\Information_Retrieval\\PA\\PA_5/dataset/duplicatetests\\threek.tsv']

In [4]:
# Load each duplicate-test file by its file name. Positional indices into a
# glob.glob() result are fragile because glob returns paths in arbitrary,
# platform-dependent order, so an index could silently bind the wrong file.
duplicate_dir = os.path.join(path, "dataset", "duplicatetests")


def _read_contexts(filename):
    """Read one file from duplicate_dir with pd.read_csv into a frame whose single named column is 'Context'."""
    return pd.read_csv(os.path.join(duplicate_dir, filename),
                       sep='\t+|\n', header=None, names=['Context'])


thirty_df_orig = _read_contexts("thirty.tsv")
hundred_df_orig = _read_contexts("hundred.tsv")
threehundred_df_orig = _read_contexts("threehundred.tsv")
onek_df_orig = _read_contexts("onek.tsv")
threek_df_orig = _read_contexts("threek.tsv")
tenk_df_orig = _read_contexts("tenk.tsv")
thirtyk_df_orig = _read_contexts("thirtyk.tsv")
hundredk_df_orig = _read_contexts("hundredk.tsv")

# Work on copies so the frames as loaded stay untouched.
thirty_df = thirty_df_orig.copy()
hundred_df = hundred_df_orig.copy()
threehundred_df = threehundred_df_orig.copy()
onek_df = onek_df_orig.copy()
threek_df = threek_df_orig.copy()
tenk_df = tenk_df_orig.copy()
thirtyk_df = thirtyk_df_orig.copy()
hundredk_df = hundredk_df_orig.copy()

In [5]:
# Normalization
#     Expand common English contractions (e.g. "they're" -> "they are").
#     Remove punctuation.
#         https://www.geeksforgeeks.org/python-remove-punctuation-from-string/
#     Lower-casing is done by the caller (see preprocess).

# (pattern, replacement) pairs applied in order. The irregular negations come
# first so the generic "n't" rule does not turn "can't" into "ca not".
# The trailing \b keeps an opening quote such as 'sorry' from being read as "'s".
_CONTRACTION_RULES = (
    (r"\bcan't\b", "can not"),
    (r"\bwon't\b", "will not"),
    (r"\bshan't\b", "shall not"),
    (r"n't\b", " not"),
    (r"'re\b", " are"),
    (r"'m\b", " am"),
    (r"'s\b", " is"),
    (r"'ve\b", " have"),
    (r"'d\b", " had"),
    (r"'ll\b", " will"),
)


def normalization(word):
    """Expand contractions and strip punctuation from a piece of text.

    Parameters
    ----------
    word : str
        Raw text (a single word or a whole line).

    Returns
    -------
    str
        The text with contractions expanded, every character that is neither
        a word character nor whitespace replaced by a space, and any remaining
        ``string.punctuation`` characters (e.g. ``_``) deleted. Case and
        repeated whitespace are left as they are.
    """
    # Contractions must be expanded BEFORE apostrophes are removed; otherwise
    # no rule can ever match.
    for pattern, expansion in _CONTRACTION_RULES:
        word = re.sub(pattern, expansion, word, flags=re.IGNORECASE)
    word = word.replace("'", ' ')
    word = re.sub(r'[^\w\s]', ' ', word)
    # "_" counts as a word character for the regex above, so drop it here.
    word = word.translate(str.maketrans('', '', string.punctuation))
    return word

In [6]:
# Preprocess dataset
#   Normalization
#   Lower-casing
#   Stop-word removal
def preprocess(data):
    """Clean every text in ``data`` and return the cleaned strings.

    Parameters
    ----------
    data : iterable of str
        Raw texts, one per document.

    Returns
    -------
    list of str
        For each input text: the normalized, lower-cased text with NLTK
        English stop words removed and tokens joined by single spaces.
    """
    # Load the stop-word list once (it does not change between lines) and
    # keep it in a set so each membership test is a hash lookup.
    stopwords = set(nltk.corpus.stopwords.words("english"))
    result = []
    for line in data:
        tokens = normalization(line).lower().strip().split()
        result.append(" ".join(w for w in tokens if w not in stopwords))
    return result

In [7]:
def N_gram(text, N):
    """Return the set of distinct word-level N-grams (shingles) in ``text``.

    ``text`` is split on whitespace; every run of ``N`` consecutive tokens is
    joined with a single space. A text with fewer than ``N`` tokens yields an
    empty set.
    """
    tokens = text.split()
    window_count = len(tokens) - N + 1
    return {' '.join(tokens[start:start + N]) for start in range(window_count)}

In [8]:
def hashing(text):
    """Map ``text`` to an integer built from the first 8 bytes of its SHA-256 digest.

    The text is UTF-8 encoded before hashing and the 8 bytes are read in
    little-endian order.
    """
    digest = hl.sha256(text.encode("utf-8")).digest()
    leading_bytes = digest[:8]
    return int.from_bytes(leading_bytes, 'little')  # 64-bit int

In [9]:
def make_random_hash_fn(N):
    """Build a random linear hash function ``x -> (a * x + b) % N``.

    ``a`` is drawn from ``[N + 1, 2**64 - N]`` and ``b`` from
    ``[N, 2**64 - N]`` using the ``random`` module, so results depend on its
    current state.
    """
    upper_bound = 2**64 - N
    multiplier = random.randint(N + 1, upper_bound)
    offset = random.randint(N, upper_bound)

    def hash_fn(x):
        return (multiplier * x + offset) % N

    return hash_fn

In [10]:
def make_hashes(N, num_hashes):
    """Return a list of ``num_hashes`` independent random hash functions modulo ``N``."""
    return [make_random_hash_fn(N) for _ in range(num_hashes)]

In [11]:
def shingledhash(df, N):
    """Convert each document into the set of hashes of its word N-grams.

    Parameters
    ----------
    df : indexable collection of str
        Documents, looked up as ``df[1]`` .. ``df[len(df)]`` (i.e. the
        collection is expected to be labelled 1..len(df)).
    N : int
        Number of words per shingle.

    Returns
    -------
    dict
        ``{doc_id: set of int}`` mapping every id in 1..len(df) to its hashed
        shingles.
    """
    docsAsShingleSets = {}
    for doc_id in range(1, len(df) + 1):
        # Iterate the shingle set directly; the previous version rebuilt a
        # list from the set for every element, which was quadratic.
        docsAsShingleSets[doc_id] = {hashing(shingle) for shingle in N_gram(df[doc_id], N)}
    return docsAsShingleSets

In [12]:
def shingles_doc(shingled_documents):
    """Flatten ``{doc_id: shingles}`` into sorted ``(shingle, doc_id)`` pairs.

    Returns a tuple ``(pairs, doc_ids)`` where ``pairs`` is the sorted list of
    every (shingle, doc_id) combination and ``doc_ids`` lists the document ids
    in the mapping's iteration order.
    """
    document_ids = list(shingled_documents)
    pairs = sorted(
        (shingle, doc_id)
        for doc_id in document_ids
        for shingle in shingled_documents[doc_id]
    )
    return pairs, document_ids

In [13]:
def make_minhash_signature(shingled_data, num_hash):
    """Compute the MinHash signature matrix for a set of shingled documents.

    Parameters
    ----------
    shingled_data : dict
        ``{doc_id: set of hashed shingles}``.
    num_hash : int
        Number of random hash functions (rows of the signature matrix).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_hash, number of documents)``; column ``c``
        belongs to the ``c``-th document in ``shingled_data``'s iteration
        order. Entry ``[r, c]`` is the minimum of hash function ``r`` over the
        document's shingles. Documents without shingles keep ``np.inf`` in
        every row.
    """
    inv_index, docids = shingles_doc(shingled_data)
    num_docs = len(docids)
    sigmatrix = np.full([num_hash, num_docs], np.inf)
    hash_funcs = make_hashes(len(inv_index), num_hash)
    # Resolve each document's column once, instead of scanning the docids
    # list with .index() inside the innermost loop.
    column_of = {docid: col for col, docid in enumerate(docids)}
    for shingle, docid in inv_index:
        col = column_of[docid]
        for row in range(num_hash):
            value = hash_funcs[row](shingle)
            if value < sigmatrix[row, col]:
                sigmatrix[row, col] = value
    return sigmatrix

In [14]:
def jacard_similarity(id1, id2, minhash_sigmat, docids):
    """Estimate the Jaccard similarity of two documents from their signatures.

    The estimate is the fraction of signature rows on which the columns of
    ``id1`` and ``id2`` agree; each column is located through the document's
    position in ``docids``.
    """
    first_column = minhash_sigmat[:, docids.index(id1)]
    second_column = minhash_sigmat[:, docids.index(id2)]
    agreeing_rows = first_column == second_column
    return np.mean(agreeing_rows)

In [15]:
def near_document(df, sigmin, threshold):
    """Find all document pairs whose estimated similarity reaches ``threshold``.

    Parameters
    ----------
    df : dict
        Mapping keyed by document id; the position of a key in ``df`` selects
        the matching column of ``sigmin``.
    sigmin : numpy.ndarray
        MinHash signature matrix with one column per document.
    threshold : float
        Minimum estimated similarity for a pair to be reported.

    Returns
    -------
    list of [id_a, id_b, similarity]
        One entry per qualifying pair, with ``id_a`` preceding ``id_b`` in
        ``df``'s key order.
    """
    # Materialise the key list once; it was previously rebuilt several times
    # for every pair of documents.
    doc_ids = list(df.keys())
    near_docs = []
    for i in range(len(doc_ids)):
        for j in range(i + 1, len(doc_ids)):
            # Fraction of signature rows that agree: the same estimate that
            # jacard_similarity returns for the i-th and j-th keys.
            minhash_similar = np.mean(sigmin[:, i] == sigmin[:, j])
            if minhash_similar >= threshold:
                near_docs.append([doc_ids[i], doc_ids[j], minhash_similar])
    return near_docs

In [16]:
def cluster_output(df, N):
    """Group near-duplicate pairs into clusters of document ids.

    Parameters
    ----------
    df : list of [id_a, id_b, ...]
        Similar pairs; only the first two items of each entry are used.
    N : int
        Document ids 1..N are considered as cluster starting points.

    Returns
    -------
    list of list
        Sorted id lists. The cluster started from document ``i`` contains
        ``i``, every document paired after ``i``, and every document paired
        after one of those. A cluster is dropped when its smallest id already
        appears as a non-first member of some cluster.

    Notes
    -----
    Expansion stops after those two steps, so longer chains of pairs are not
    fully merged.
    """
    # first id -> ids paired with it. Replaces repeated full scans of the
    # pair list for every document.
    followers = {}
    for pair in df:
        followers.setdefault(pair[0], []).append(pair[1])

    # Iterates the clusters built in this function; the previous version
    # looped over an undefined name here and raised NameError.
    sorted_doc = []
    for doc_id in range(1, N + 1):
        if doc_id not in followers:
            continue
        members = set()
        for seed in [doc_id] + followers[doc_id]:
            if seed in followers:
                members.add(seed)
                members.update(followers[seed])
        sorted_doc.append(sorted(members))

    # Ids that already belong to a cluster as a non-first member.
    absorbed = {member for cluster in sorted_doc for member in cluster[1:]}
    return [cluster for cluster in sorted_doc if cluster[0] not in absorbed]

In [17]:
def near_duplicate_doc(near_doc, N):
    """Write the clustering result to ``output/dcho13-<N>.txt``.

    For each document id from 1 to N, in order: if a cluster starts with that
    id, the cluster's ids are written space-separated on one line; if the id
    belongs to no cluster, it is written alone on its own line; ids that only
    occur inside another cluster produce no line.

    Parameters
    ----------
    near_doc : list of list
        Clusters of document ids.
    N : int
        Highest document id; also used in the output file name.
    """
    clustered_ids = set()   # every id that appears in some cluster
    cluster_by_head = {}    # first id -> first cluster that starts with it
    for cluster in near_doc:
        clustered_ids.update(cluster)
        if cluster:
            cluster_by_head.setdefault(cluster[0], cluster)

    out_path = 'output/dcho13-'+str(N)+'.txt'
    # Create the target directory if needed so a fresh checkout does not fail.
    os.makedirs('output', exist_ok=True)
    with open(out_path, 'w') as f:
        for k in range(1, N + 1):
            if k in cluster_by_head:
                print(*cluster_by_head[k], file=f)
            elif k not in clustered_ids:
                print(k, file=f)
    print(out_path + ' is created')

In [None]:
%%time
# 30-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
thirty_df['processed_context'] = preprocess(thirty_df['Context'])
shingled_list_30= shingledhash(thirty_df['processed_context'], 3)
minhash_sigmat_30=make_minhash_signature(shingled_list_30, 200)
near_30 = near_document(shingled_list_30, minhash_sigmat_30,0.35)

In [None]:
# Cluster the similar pairs (ids 1..30) and write them to output/dcho13-30.txt.
output_30 = cluster_output(near_30, 30)
near_duplicate_doc(output_30,30)

In [None]:
%%time
# 100-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
hundred_df['processed_context'] = preprocess(hundred_df['Context'])
shingled_list_100= shingledhash(hundred_df['processed_context'], 3)
minhash_sigmat_100=make_minhash_signature(shingled_list_100, 200)
near_100 = near_document(shingled_list_100, minhash_sigmat_100, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..100) and write them to output/dcho13-100.txt.
output_100 = cluster_output(near_100, 100)
near_duplicate_doc(output_100,100)

In [None]:
%%time
# 300-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
threehundred_df['processed_context'] = preprocess(threehundred_df['Context'])
shingled_list_300= shingledhash(threehundred_df['processed_context'], 3)
minhash_sigmat_300=make_minhash_signature(shingled_list_300, 200)
near_300 = near_document(shingled_list_300, minhash_sigmat_300, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..300) and write them to output/dcho13-300.txt.
output_300 = cluster_output(near_300, 300)
near_duplicate_doc(output_300,300)

In [None]:
%%time
# 1000-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
onek_df['processed_context'] = preprocess(onek_df['Context'])
shingled_list_1000= shingledhash(onek_df['processed_context'], 3)
minhash_sigmat_1000=make_minhash_signature(shingled_list_1000, 200)
near_1000 = near_document(shingled_list_1000, minhash_sigmat_1000, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..1000) and write them to output/dcho13-1000.txt.
output_1000 = cluster_output(near_1000, 1000)
near_duplicate_doc(output_1000,1000)

In [None]:
# Locate the .tsv file(s) of the separate "twoktests" data set.
path = os.getcwd()
# glob.glob() gives no ordering guarantee and the next cell takes element 0,
# so sort to make that choice deterministic.
testfilesd2= sorted(glob.glob(path + "/dataset/twoktests/*.tsv"))

In [None]:
# Load the first twoktests file into a frame whose single named column is 'Context',
# then work on a copy so the frame as loaded stays untouched.
# NOTE(review): takes element 0 of a glob result; confirm the directory holds exactly one .tsv.
twok_df_orig = pd.read_csv(testfilesd2[0],sep='\t+|\n',header=None, names=['Context'])
twok_df = twok_df_orig.copy()

In [None]:
%%time
# 2000-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix, keep pairs with similarity >= 0.35.
# NOTE(review): this run uses 100 hash functions while every other run uses 200;
# confirm that is intentional.
twok_df['processed_context'] = preprocess(twok_df['Context'])
shingled_list_2000= shingledhash(twok_df['processed_context'], 3)
minhash_sigmat_2000=make_minhash_signature(shingled_list_2000, 100)
near_2000 = near_document(shingled_list_2000, minhash_sigmat_2000, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..2000) and write them to output/dcho13-2000.txt.
output_2000 = cluster_output(near_2000, 2000)
near_duplicate_doc(output_2000,2000)

In [None]:
%%time
# 3000-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
threek_df['processed_context'] = preprocess(threek_df['Context'])
shingled_list_3000= shingledhash(threek_df['processed_context'], 3)
minhash_sigmat_3000=make_minhash_signature(shingled_list_3000, 200)
near_3000 = near_document(shingled_list_3000, minhash_sigmat_3000, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..3000) and write them to output/dcho13-3000.txt.
output_3000 = cluster_output(near_3000, 3000)
near_duplicate_doc(output_3000,3000)

In [None]:
%%time
# 10000-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
tenk_df['processed_context'] = preprocess(tenk_df['Context'])
shingled_list_10000= shingledhash(tenk_df['processed_context'], 3)
minhash_sigmat_10000=make_minhash_signature(shingled_list_10000, 200)
near_10000 = near_document(shingled_list_10000, minhash_sigmat_10000, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..10000) and write them to output/dcho13-10000.txt.
output_10000 = cluster_output(near_10000, 10000)
near_duplicate_doc(output_10000,10000)

In [None]:
%%time
# 30000-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
# near_document compares every pair of documents, so run time grows quadratically with the set size.
thirtyk_df['processed_context'] = preprocess(thirtyk_df['Context'])
shingled_list_30000= shingledhash(thirtyk_df['processed_context'], 3)
minhash_sigmat_30000=make_minhash_signature(shingled_list_30000, 200)
near_30000 = near_document(shingled_list_30000, minhash_sigmat_30000, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..30000) and write them to output/dcho13-30000.txt.
output_30000 = cluster_output(near_30000, 30000)
near_duplicate_doc(output_30000,30000)

In [None]:
%%time
# 100000-document set: clean the text, hash 3-word shingles, build a MinHash
# signature matrix with 200 hash functions, keep pairs with similarity >= 0.35.
# near_document compares every pair of documents, so run time grows quadratically with the set size.
hundredk_df['processed_context'] = preprocess(hundredk_df['Context'])
shingled_list_100000= shingledhash(hundredk_df['processed_context'], 3)
minhash_sigmat_100000=make_minhash_signature(shingled_list_100000, 200)
near_100000 = near_document(shingled_list_100000, minhash_sigmat_100000, 0.35)

In [None]:
# Cluster the similar pairs (ids 1..100000) and write them to output/dcho13-100000.txt.
output_100000 = cluster_output(near_100000, 100000)
near_duplicate_doc(output_100000,100000)