# (Near-) Duplicate detection

## Import Packages

In [1]:
import glob
import json
import string
import re
#import binascii
import pandas as pd

In [2]:
# Define the files pattern and read the json files using glob package
pattern = 'bnl-*.json'
files = glob.glob(pattern)
jsonSourceObject = {}

for file in files:
    with open(file) as jsonFile:
        jsonObject = json.load(jsonFile)
        jsonObjectPublisher = jsonObject['_source']['item']['provider']['id']        
        jsonSourceObject[jsonObjectPublisher] = jsonObject['_source']['item']['body']
        print("The file %s" % (file) +" has been successfuly read!")
        jsonFile.close()

The file bnl-adn-20210908-13103301.json has been successfuly read!
The file bnl-bddenbosch-20210908-13103301.json has been successfuly read!
The file bnl-ttenschede-20210908-13103592.json has been successfuly read!


In [3]:
# Storing all contents in list 
rows = []
for publisher_source, publisher_source_articles in jsonSourceObject.items():
    for i in range(0, len(publisher_source_articles)):
        rows.append([
            publisher_source,
            publisher_source_articles[i]['content']
        ])

In [4]:
# Store the rows in a dataframe
df = pd.DataFrame(rows, columns=['publisher','content'])

In [5]:
def clean_content(content):
    """ Clean the content string """
    # Remove any punctuation mark
    content = content.translate(str.maketrans('', '', string.punctuation)) 
    # Transform all leters to lower cases
    content = content.lower()
    # Convert extra white spaces as a single blank
    content = re.sub(' +', ' ', content)
    
    return content

In [6]:
# Apply the clean_content function the the content column in the dataframe
df['cleaned_content'] = df['content'].apply(clean_content)
df.head()

Unnamed: 0,publisher,content,cleaned_content
0,adn,Kamerlid Pieter Omtzigt wil niet terug naar we...,kamerlid pieter omtzigt wil niet terug naar we...
1,adn,Pieter Omtzigt is al meer dan honderd dagen th...,pieter omtzigt is al meer dan honderd dagen th...
2,adn,'Ik heb lang tegen mijn besluit aangehikt',ik heb lang tegen mijn besluit aangehikt
3,adn,Kelly Adams Enschede,kelly adams enschede
4,adn,De bospaden en parken rondom zijn huis kent Pi...,de bospaden en parken rondom zijn huis kent pi...


# Shingling process
    
    

In [7]:
def get_content_shingles(text, size):
    """ Converts a string to a set of size-shingles """
    tokens = text.split()
    return [' '.join(tokens[i:i+size])
                     for i in range(len(tokens) - size + 1)]

In [8]:
# Construct shingles for every content within the dataframe
shingle_order = 3
df['shingles'] = df['cleaned_content'].apply(get_content_shingles, size=shingle_order)  

In [9]:
df.head()

Unnamed: 0,publisher,content,cleaned_content,shingles
0,adn,Kamerlid Pieter Omtzigt wil niet terug naar we...,kamerlid pieter omtzigt wil niet terug naar we...,"[kamerlid pieter omtzigt, pieter omtzigt wil, ..."
1,adn,Pieter Omtzigt is al meer dan honderd dagen th...,pieter omtzigt is al meer dan honderd dagen th...,"[pieter omtzigt is, omtzigt is al, is al meer,..."
2,adn,'Ik heb lang tegen mijn besluit aangehikt',ik heb lang tegen mijn besluit aangehikt,"[ik heb lang, heb lang tegen, lang tegen mijn,..."
3,adn,Kelly Adams Enschede,kelly adams enschede,[kelly adams enschede]
4,adn,De bospaden en parken rondom zijn huis kent Pi...,de bospaden en parken rondom zijn huis kent pi...,"[de bospaden en, bospaden en parken, en parken..."


## Jaccard Similarity

In [10]:
def jaccard(set1, set2):
    """ Computes the Jaccard similarity between two sets """
    x = len(set1.intersection(set2))
    y = len(set1.union(set2))
    return x / y

In [11]:
def get_duplicate_type(x):
    """ Returns the duplicate's type 
    Returns one from: near duplicate, perfect duplicate"""
    type_ = 'near duplicate'
    if x==1:
        type_ = 'perfect duplicate'
    return type_
    

In [12]:
near_duplicates_items = []
minimum_similarity_threshold = 0.8
for i in range(len(df)):
    for j in range(i+1, len(df)):
        try:
            jaccard_similarity = jaccard(set(df['shingles'][i]), set(df['shingles'][j]))
            if jaccard_similarity>=minimum_similarity_threshold:
                near_duplicates_items.append([                   
                    df['publisher'][i],
                    df['content'][i],
                    df['publisher'][j],
                    df['content'][j],
                    jaccard_similarity,
                    get_duplicate_type(jaccard_similarity)
                ])
                #print(i, j, jaccard_similarity)
        except Exception as e:
            print(e)

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero


In [13]:
near_duplicates_result = pd.DataFrame(near_duplicates_items, columns=['Publisher', 'Content', 'Publisher', 'Content', 'Jaccard Similarity', 'Duplicate Type'])

In [14]:
near_duplicates_result.to_csv('near_duplicates_result.csv')

In [15]:
near_duplicates_result.head()

Unnamed: 0,Publisher,Content,Publisher.1,Content.1,Jaccard Similarity,Duplicate Type
0,adn,Kamerlid Pieter Omtzigt wil niet terug naar we...,bddenbosch,Kamerlid Pieter Omtzigt wil niet terug naar we...,1.0,perfect duplicate
1,adn,Pieter Omtzigt is al meer dan honderd dagen th...,bddenbosch,Pieter Omtzigt is al meer dan honderd dagen th...,1.0,perfect duplicate
2,adn,'Ik heb lang tegen mijn besluit aangehikt',bddenbosch,'Ik heb lang tegen mijn besluit aangehikt',1.0,perfect duplicate
3,adn,De bospaden en parken rondom zijn huis kent Pi...,bddenbosch,De bospaden en parken rondom zijn huis kent Pi...,1.0,perfect duplicate
4,adn,Maandenlang hield de politicus zich stil. Hij ...,bddenbosch,Maandenlang hield de politicus zich stil. Hij ...,1.0,perfect duplicate
