In [12]:
#--------------------------------------------------------------------------------------
# Code required towards ground work for Part 2
#-------------------------------------------------------------------------------------- 

import string
import binascii
import pandas as pd
from sklearn import linear_model                            
from matplotlib import pyplot as plt
import random
import numpy

#--------------------------------------------------------------------------------------
# Dataset parsing
#-------------------------------------------------------------------------------------- 
def parse_data(testfile):                                    # testfile = Path of file, stored locally on my system
    final_list = []
    fd = open(testfile,"r")                                  # opening file in read mode
    for line in fd:
        ids = []
        text = []
        tup = ()
        a = line.split()                                    # getting the contents of the first line in the document
        ids.append(a[0])                                    # extracting the document ID
        idtup = tuple(ids)
        del a[0]
        article = ''
        for word in a:
            word = word.lower()                             # converting to lower case
            word = word.translate(str.maketrans('','',string.punctuation)) # removing all punctuations
            article = article + word                        # creating a concatenated string of all the words in the article
        text.append(article)
        articletup = tuple(text)
        tup = idtup + articletup
        final_list.append(tup)
    return tuple(final_list)
#--------------------------------------------------------------------------------------

#--------------------------------------------------------------------------------------
# Creating Document Shingles
#-------------------------------------------------------------------------------------- 
def shingle_document(t,k):
    docAsShingles = {}
    for i in t:
        docID = i[0]                                        # Extracting the document ID
        s = i[1]                                            # The article text
        shinglesInArticle = set()
        for j in range(0,len(s)-k+1):                       # looping through each character
            shingle = ''
            for m in range(j,j+k):                          # Loop to create shingles
                shingle = shingle + s[m]
            crc = binascii.crc32(shingle.encode()) & 0xffffffff  # To get the 32-bit hashed integer
            shinglesInArticle.add(crc)                      # Adding the shingles to a set
        docAsShingles[docID] = shinglesInArticle            # Storing values in a dict object
    return docAsShingles
#--------------------------------------------------------------------------------------

#--------------------------------------------------------------------------------------
# Computing Jaccard Similarity
#-------------------------------------------------------------------------------------- 
def jaccard(l):
    flist = []
    k = l.items()                                           # To extract shingle and docIDs
    id = [x for x,_ in k]                                   # Extracting the document IDs through a list comprehension
    for i in range(0,len(id)):
        id1 = tuple([id[i]])                                # Extracting 1st DocID, converting to a list, then to a tuple
        s1 = l[id[i]]
        for j in range(i+1,len(id)):
            m = []
            tup = ()
            id2 = tuple([id[j]])
            s2 = l[id[j]]
            js = (len(s1 & s2)/len(s1 | s2))                # Computing Jaccard Similarity
            if js > 0.99:
                plagiarism = ['Yes']                        # Setting Plagiarism Flag to Yes if JS value>0.99
            else:
                plagiarism = ['No']
            m.append(js)
            tup = id1 + id2 + tuple(m) + tuple(plagiarism)
            flist.append(tup)
    return tuple(flist)
#--------------------------------------------------------------------------------------

#--------------------------------------------------------------------------------------
# Start of <<<<<<<<<<<_Part 2_>>>>>>>>>>>
#-------------------------------------------------------------------------------------- 

#--------------------------------------------------------------------------------------
# Part 2A Preparing Shingles for MinHash
#--------------------------------------------------------------------------------------

def invert_shingles(l):                                     # Argument l = The dict containing (docID, shingle) pairs
    flist = []
    k = l.items()                                           # To extract shingle and docIDs, returns list of tuple pairs
    id = [x for x,_ in k]
    for i in range(0,len(id)):
        docid = tuple([id[i]])
        shingles = l[id[i]]
        m = []
        for s in shingles:
            tup = ()
            tup = tuple([s]) + docid
            m.append(tup)
        m.sort(key = lambda tup: tup[0])                    # Sorting the tuple entries
        flist.append(m)
    return id, tuple(flist)

#--------------------------------------------------------------------------------------
# Part 2B Generate Hash Functions
#--------------------------------------------------------------------------------------

DEFAULT_P = 2**33-355
DEFAULT_M = 4294967295

def getHashCoeffs(num_hashes, m=DEFAULT_M):
    randList = []
    while num_hashes > 0:
        randIndex = random.randint(0, m)
        while randIndex in randList:
            randIndex = random.randint(0, m)
        randList.append(randIndex)
        num_hashes = num_hashes - 1
    return randList

#--------------------------------------------------------------------------------------
# Part 2C Construct the MinHash Signature Matrix
#--------------------------------------------------------------------------------------

def make_minhash_signature(l, id, a, b, num_hashes, p=DEFAULT_P):
    signatures = []                                                   # Master list to hold signatures
    for i in range(0,len(id)):
        docid = tuple([id[i]])                                  
        shingles = l[id[i]]
        signature = []
        for j in range(0,num_hashes):                                 # Looping through all the hash functions
            minHashCode = p + 1
            for s in shingles:
                hashCode = (a[i] * s + b[i]) % p                      # Applying Hashfunction to shingle s
                if hashCode < minHashCode:
                    minHashCode = hashCode
            signature.append(minHashCode)
        signatures.append(signature)
    return signatures

#--------------------------------------------------------------------------------------
# Part 2D Getting MinHash Similarity Estimate
#--------------------------------------------------------------------------------------
        
def minhash_similarity(id, num_hashes, signatures):
    result = []
    for i in range(0, len(id)):
        signature1 = signatures[i]
        id1 = tuple([id[i]])
        for j in range(i + 1, len(id)):
            m = []
            tup = ()
            signature2 = signatures[j]
            id2 = tuple([id[j]])
            count = 0
            for k in range(0, num_hashes):
                count = count + (signature1[k] == signature2[k])   # getting the count of min hash values for a pair of docs
            sim = (count / num_hashes)
            if sim > 0.99:                                  # Setting Plagiarism flag to Yes if sim > 0.99 (threshold value) 
                plagiarism = ['Yes']                        
            else:
                plagiarism = ['No']
            m.append(sim)
            tup = id1 + id2 + tuple(m) + tuple(plagiarism)
            result.append(tup)
    return tuple(result)

#--------------------------------------------------------------------------------------
# Part 2E Putting it all together, test results for a subset of 100 file
#--------------------------------------------------------------------------------------

DEFAULT_P = 2**33-355
DEFAULT_M = 4294967295
num_hashes = 10
testfile = r'C:\Vidit\PhD\Fall 2018\CMSC 643 - Hector\Project 1\TestFile.txt'
t = parse_data(testfile)                                                # Parsing data file
l = shingle_document(t,3)                                               # Shingling
j = jaccard(l)                                                          # Getting Jacard values
id, docs = invert_shingles(l)                                           # Inverting the shingles
a = getHashCoeffs(num_hashes, DEFAULT_M)                                # Getting coefficient A
b = getHashCoeffs(num_hashes, DEFAULT_M)                                # Getting coefficient B
signatures = make_minhash_signature(l, id, a, b, num_hashes, DEFAULT_P) # Getting signatures
m = minhash_similarity(id, num_hashes, signatures)                      # Getting minhash similarity estimates
r = []
for b in j:
    j1 = tuple([b[2]])
for i in m:
    i = i + j1
    r.append(i)
df = pd.DataFrame(r, columns=['ID1', 'ID2', 'MinHash Value','Plagiarism','JS Value'])
print ("\n Output of comparison of MinHash and Jacard Values for a small subset of 100 dataset \n")
print (df)

#--------------------------------------------------------------------------------------
# Part 2F Experiment 1
#--------------------------------------------------------------------------------------

def run_experiment(id, signatures,k,
                   num_hashes = [10,20,40,60,80,100,120]):
    testfile = r'C:\Vidit\PhD\Fall 2018\CMSC 643 - Hector\Project 1\TestFile1000.txt'
    for i in num_hashes:
        t = parse_data(testfile)                                                # Parsing data file
        l = shingle_document(t,k)                                               # Shingling
        j = jaccard(l)                                                          # Getting Jacard values
        id, docs = invert_shingles(l)                                           # Inverting the shingles
        a = getHashCoeffs(i, DEFAULT_M)                                         # Getting coefficient A
        b = getHashCoeffs(i, DEFAULT_M)                                         # Getting coefficient B
        signatures = make_minhash_signature(l, id, a, b, num_hashes, DEFAULT_P) # Getting signatures
        m = minhash_similarity(id, i, signatures)                               # Getting minhash similarity estimates
        r = []
        for b in j:
            j1 = tuple([b[2]])
        for k in m:
            k = i + j1
            r.append(k)
            r.append(i)
    df = pd.DataFrame(r, columns=['ID1', 'ID2', 'MinHash Value','Plagiarism','JS Value','Num_Hashes'])
    return df


 Output of comparison of MinHash and Jacard Values for a small subset of 100 dataset 

      ID1    ID2  MinHash Value Plagiarism  JS Value
0    t980  t1088            0.0         No  0.245677
1    t980  t1233            0.0         No  0.245677
2    t980  t1235            0.0         No  0.245677
3    t980  t1297            0.0         No  0.245677
4    t980  t1768            0.0         No  0.245677
5   t1088  t1233            0.0         No  0.245677
6   t1088  t1235            0.0         No  0.245677
7   t1088  t1297            0.0         No  0.245677
8   t1088  t1768            0.0         No  0.245677
9   t1233  t1235            0.0         No  0.245677
10  t1233  t1297            0.0         No  0.245677
11  t1233  t1768            0.0         No  0.245677
12  t1235  t1297            0.0         No  0.245677
13  t1235  t1768            0.0         No  0.245677
14  t1297  t1768            0.0         No  0.245677
