In [None]:
#%env PYSPARK_PYTHON=python3

In [None]:
#!pip install webhoseio
#!pip install simhash
#!pip install gensim --user

In [1]:
import webhoseio, os
import gensim, operator
from gensim.models import KeyedVectors
import json
from simhash import Simhash, SimhashIndex
import numpy as np

In [2]:
# Directory prefix for the word-vector files. load_wordvec_model builds the
# full path by plain string concatenation (model_path + modelFile), so the
# trailing slash must stay.
model_path = '/Github/'

In [3]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load a word2vec-format vector file, printing progress before and after.

    Args:
        modelName: Label used only in the two progress messages.
        modelFile: File name appended to the module-level ``model_path``.
        flagBin: Forwarded as ``binary=`` to ``load_word2vec_format``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    label = modelName + ' model...'
    print('Loading ' + label)
    vectors = KeyedVectors.load_word2vec_format(model_path + modelFile, binary=flagBin)
    print('Finished loading ' + label)
    return vectors


In [4]:
# Word2vec vectors used later as the similarity model; loaded in binary mode.
model_w2v_AP = load_wordvec_model(
    modelName='Word2Vec Google News',
    modelFile='GoogleNews-vectors-negative300.bin.gz',
    flagBin=True,
)

Loading Word2Vec Google News model...
Finished loading Word2Vec Google News model...


In [5]:
# function checks whether the input words are present in the vocabulary for the model
def vocab_check(vectors, words):
    """Return the entries of ``words`` that the model has a vector for.

    Args:
        vectors: Word-vector model exposing its vocabulary as a mapping under
            ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim < 4).
        words: Iterable of candidate tokens.

    Returns:
        list: The in-vocabulary tokens (each passed through ``str.strip``),
        in input order, with repeats preserved.
    """
    # gensim 4 replaced ``KeyedVectors.vocab`` with ``key_to_index``; prefer
    # the new attribute and fall back to the old one for older installs.
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    return [word.strip() for word in words if word in vocabulary]

In [6]:
# function calculates similarity between two strings using a particular word vector model
def calc_similarity(input1, input2, vectors):
    """Score two strings against each other with a word-vector model.

    Each string is split on whitespace, filtered through ``vocab_check`` and
    reduced to a set of unique tokens; the two sets are then handed to
    ``vectors.n_similarity`` and its result is returned unchanged.
    """
    first_tokens, second_tokens = (
        set(vocab_check(vectors, text.split())) for text in (input1, input2)
    )
    return vectors.n_similarity(first_tokens, second_tokens)

In [7]:
def cleanup(input):
    """Expand a few English contractions and drop non-alphanumeric characters.

    The possessive ``'s`` is replaced by a space, ``n't`` by `` not`` and
    ``'ve`` by `` have``; afterwards every character other than ASCII
    letters, digits and spaces is removed. Both the ASCII apostrophe and the
    typographic one (U+2019) are recognised.

    Args:
        input: The string to clean. (The name shadows the builtin but is kept
            so keyword callers keep working.)

    Returns:
        str: The cleaned string.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # Normalise the typographic apostrophe so each rule below matches both
    # spellings of a contraction.
    text = input.replace("\u2019", "'")
    text = text.replace("'s", " ").replace("n't", " not").replace("'ve", " have")
    return re.sub(r'[^a-zA-Z0-9 ]', '', text)

# Load data 

In [8]:
import json
# Read the raw feed file as a list of lines (each line is parsed as one JSON
# document further down). The context manager closes the file handle as soon
# as the lines are in memory.
with open("/Github/google_webhose.json") as raw_file:
    google_json = raw_file.readlines()

In [9]:
# Parse every raw line and stamp the record with its position in the file,
# stored under 'id', so later cells can refer to records by integer id.
feeds = []
for position, raw_line in enumerate(google_json):
    record = json.loads(raw_line)
    record['id'] = position
    feeds.append(record)

In [10]:
# Display the first ten parsed records to inspect their structure.
feeds[:10]

[{'thread': {'uuid': 'de512d54a0ccd204ede7a476ea683c25fb6a1e44',
   'url': 'http://en.protothema.gr/ufologist-locates-sunken-flying-saucers-in-thermaikos-gulf-greece-video/',
   'site_full': 'en.protothema.gr',
   'site': 'protothema.gr',
   'site_section': 'http://en.protothema.gr/feed',
   'site_categories': ['travel', 'greece'],
   'section_title': 'protothemanews.com',
   'title': 'Ufologist locates…”sunken flying saucers” in Thermaikos Gulf, Greece (video)',
   'title_full': 'Ufologist locates…”sunken flying saucers” in Thermaikos Gulf, Greece (video)',
   'published': '2020-06-13T19:02:00.000+03:00',
   'replies_count': 0,
   'participants_count': 1,
   'site_type': 'news',
   'country': 'GR',
   'spam_score': 0.0,
   'main_image': 'http://en.protothema.gr/wp-content/uploads/2020/06/pic_1-150x150.png',
   'performance_score': 0,
   'domain_rank': 1064,
   'social': {'facebook': {'likes': 0, 'comments': 0, 'shares': 0},
    'gplus': {'shares': 0},
    'pinterest': {'shares': 0},
 

In [11]:
# Setting parameters
# Passed as `k` when the SimhashIndex is built.
hamming_distance = 20
# Minimum calc_similarity score at which a near-duplicate candidate is
# recorded as a duplicate (compared with `score >= w2v_score`).
w2v_score = 0.7

In [12]:
# Creating simhash model
import logging

# Let only CRITICAL records from the 'simhash' logger through.
simhash_log = logging.getLogger('simhash')
simhash_log.setLevel(logging.CRITICAL)

# One (id as string, Simhash of the title) pair per feed, then index them all
# with the configured tolerance.
objs = [
    (str(entry['id']), Simhash(str(entry['title'])))
    for entry in feeds
]
index = SimhashIndex(objs, k=hamming_distance)

In [13]:
# finding duplicates
#
# For every feed not already flagged, ask the simhash index for near-duplicate
# candidates and confirm each candidate with the word2vec similarity score.
# Confirmed candidates are recorded by id in `duplist` (in discovery order).

duplist = []
dup_ids = set()  # same ids as duplist; gives O(1) membership tests

for feed_idx, feed_sel in enumerate(feeds):
    if feed_idx in dup_ids:
        continue

    feed_hash = Simhash(str(feed_sel['title']))

    for dupi in index.get_near_dups(feed_hash):
        dup_feed = feeds[int(dupi)]

        # Skip candidates already flagged, and the feed matching itself.
        if int(dupi) in dup_ids or dup_feed['id'] == feed_sel['id']:
            continue

        try:
            score = calc_similarity(feed_sel['title'], dup_feed['title'], model_w2v_AP)
        except Exception:
            # Best effort: a pair that cannot be scored counts as "not similar".
            # Presumably this happens when a title has no in-vocabulary words
            # (TODO confirm). `Exception` rather than a bare `except` so that
            # Ctrl-C / kernel interrupt still stops this long-running loop.
            score = 0

        if score >= w2v_score:
            duplist.append(dup_feed['id'])
            dup_ids.add(dup_feed['id'])

# Guard the percentage against an empty dataset.
dup_pct = round((len(duplist)/len(feeds)*100),4) if feeds else 0.0

print('The original dataset is ' + str(len(feeds)) + ' values')
print('The number of duplicates is ' + str(len(duplist)))
print('The dataset has ' + str(dup_pct) + '% duplicates')

The original dataset is 27798 values
The number of duplicates is 9682
The dataset has 34.8298% duplicates


In [14]:
# Testing if it's only pulling out dupes: list every flagged record, by id.
for dup_id in sorted(duplist):
    flagged = feeds[dup_id]
    print(flagged['id'], flagged['title'])

29 Former SNL comedian Jay Pharoah says he was racially profiled by police, detained with knee on his neck
30 Former SNL comedian Jay Pharoah says he was racially profiled by police, detained with knee on his neck
31 Former SNL comedian Jay Pharoah says he was racially profiled by police, detained with knee on his neck
36 Drone Delivery Service to Drop Books for Virginia Students
38 Drone delivery service to drop books for Virginia students
41 Former SNL comedian Jay Pharoah says he was racially profiled by police, detained with knee on his neck
50 Amazon, Google, Apple, and other device makers are all working with Internet of Things researchers on new ways to protect consumer privacy (GOOG, AAPL, AMZN, MSFT)
65 How tech companies from Google to Salesforce are planning to reopen offices and bring employees back to work in the wake of the coronavirus crisis
92 When Silicon Valley Goes Dark This Time, There Will Be No Refuge
93 How tech companies from Google to Salesforce are planning to

4738 Best YouTube app alternatives for Android phones, TV and Box
4747 Reliance Can Sell 6% To Google Or Microsoft: Which Company Will Grab A Share In Jio? - Trak.in
4750 Covid-19 Killed Online Dating; App Luxy Sees Shift Towards Networking
4753 Amazon sues former AWS employee who joined Google Cloud
4766 The top iPhone and iPad apps on App Store
4770 CVPR 2020 Features Microsoft CEO & SVP, Amazon Web Services in Dialogue on State of AI Commercialization
4777 FAANG Rally Continues as Players Cash in on Coronavirus Crisis
4778 SpaceX rockets fly with software you can find on your Android phone
4782 WhatsApp ‘click to chat’ feature makes number public on Google search: How to protect your number | Technology
4783 WhatsApp Reportedly Fixes Bug That ‘Leaked’ Numbers Via Google Search
4784 Goodwill Is Reopening 21 Southern California Locations With Some Changes! | Valentine In The Morning
4785 Indian-origin Prabhakar Raghavan appointed head of Google Search
4786 [Új] Xiaomi Redmi 10X 5G Sta

9403 Google: State-backed hackers targeted Trump, Biden campaigns
9407 Europe pins hopes on smarter coronavirus contact tracing apps
9409 Hundreds March in Rockford, IL to Demand Justice for George Floyd and All Those Killed by the Police; Black Lives Matter!
9410 State-based hackers targeted Donald Trump, Joe Biden campaigns: Google
9418 Chinese, Iranian hackers targeted US presidential campaigns, says Google
9420 8:46: A number becomes a potent symbol of police brutality
9422 China, Iran-based hackers targeted Trump, Biden campaigns: Google
9424 All Facebook users can now access a tool to port data to Google Photos
9426 Webmail.telis-finanz.de - Webmail
9427 MegaFans and Black Dog Gaming Host Charity eSports Tournament for USO West
9428 France, Germany back European cloud computing 'moonshot'
9430 Google: State-based hackers targeted Trump, Biden campaigns
9431 Google: Foreign hackers targeting both Trump and Biden campaigns
9434 US antitrust probe of Google includes search on Androi

12468 Google takes down viral 'Remove China Apps' app from Play Store for violating guidelines
12475 Eagles' Don Henley asks Congress to change copyright law
12483 Facebook and PayPal invest in Indonesian start-up Gojek | Technology
12491 Facebook and PayPal invest in Indonesian start-up Gojek
12494 Brookfield, June 4, 2020: Support the Black Lives Matter Movement
12495 Google faces $5 billion lawsuit in U.S. for tracking ‘private’ internet use and violating federal wiretapping and California privacy laws
12499 Merkle Launches Performance Marketing Lab to Enable Cross-Channel Experiences through Data, Analytics, and Google Technology
12504 Let your smart phone battery last longer with the budget-friendly HUAWEI Y6p
12506 Merkle Launches Performance Marketing Lab to Enable Cross-Channel Experiences through Data, Analytics, and Google Technology
12507 Facebook and PayPal invest in Indonesian start-up Gojek
12509 Facebook And PayPal Invest In Indonesian Start-up Gojek
12511 Government agr

15049 This Android Wallpaper Bug Can Kinda Brick Your Phone
15050 New initiative helps small businesses get the resources they need to succeed during COVID-19 pandemic
15051 Senators propose COVID-19 contact-tracing privacy bill
15052 Federal judge upholds use of sedative in Arkansas executions
15057 VideoTik Launches May 27. Here are 3 Things You Need to Know
15059 Australian news generated AU$10m in revenue for Google in 2019
15060 [PRNewswire] Megaport Launches its NaaS Platform in France, Providing
15061 Google rejects call for huge Australian media payout
15062 Google rejects call for huge Australian media payout
15063 Scottsdale AZ Local SEO Google Ranking Expert Digital Marketing Service Launched
15068 No place for hate, racism in society: Satya Nadella | World
15073 Android 11 beta accidentally rolled out, here’s what changed
15074 Sony Has Postponed Its PlayStation 5 Reveal Event
15080 The best Google Chromebook for your needs: What are your options?
15090 Senators propose COV

18658 Hackers target Google Docs, Microsoft Sway to steal user credentials
18663 Google postpones Android 11 unveiling amid U.S. protests | WIBQ
18667 Bitcoin Rising, Satoshi Discoveries, & Google Enters the Race: Bad Crypto News of the Week - 1010.team
18671 Google postpones Android 11 unveiling amid U.S. protests
18679 A Common Google Search These Days: 'Know Any Good Games?' 07/01/2020
18682 Local dance studio embraces pandemic with outdoor music video, incorporates Lubbock landmarks - FOX34
18683 Putlocker$.!! MY HERO ACADEMIA: HEROES RISING (2020) Full HD Watch Online Free | Arts & Entertainment | northfulton.com
18685 Android users can share location using Plus Codes in Google Maps
18687 Stadia Pro members to get 6 new free games in June
18690 Google’s new AR camera tool will help you maintain social distance
18701 Pinterest animal – Guam Christian Blog
18703 Google Android 11 announcement delayed - Tech Saper
18704 Google Unveils New Tools To Help Small Businesses During COVID-1

21841 Google considering taking stake in Vodafone Idea - FT
21843 Google launches website to help detect and stop scams
21844 Google Ads Company In India
21848 Google considering taking stake in Vodafone Idea: FT
21849 Google considering taking stake in Vodafone Idea: FT
21850 Google considering taking stake in Vodafone Idea: FT
21851 Google considering taking stake in Vodafone Idea: FT
21852 Google considering taking stake in Vodafone Idea: FT
21853 Google considering taking stake in Vodafone Idea: FT
21854 Google considering taking stake in Vodafone Idea: FT
21855 Facebook and Twitter stocks slip as Trump prepares to sign social media executive order
21857 Google Considering Taking Stake in Vodafone Idea: FT
21858 Arizona Sues Google For Tracking Users' Location Even When They Turned Tracking Off
21859 Google considering taking stake in Vodafone Idea - FT
21864 Google considering taking stake in Vodafone Idea - FT | MarketScreener
21867 US court dismisses anti-conservative bias suit 

24835 Gmails latest update makes it easier to change the look of your inbox – NerdlyNews
24836 OPSO: Two arrested in burglary investigation | KTVE - myarklamiss.com
24839 Sen. Hawley criticizes Google over deletion of critical China comments - Business Insider
24841 Google sees resurgence in state-backed hacking, phishing related to COVID-19 | News | WIN 98.5
24842 Google outlines plan to get some employees back to the office - KJE Business.Com
24843 Twitter, Facebook win appeal over alleged anti-conservative bias - BNN Bloomberg
24844 Google sued by Arizona over location data and alleged 'consumer fraud' - CNET
24849 The FIDO Alliance, backed by Apple and Google, debuts loginwithFIDO․com - 9to5Mac
24851 EBSCO Information Services Supports Google's Campus Activated Subscriber Access (CASA)
24853 Factbox: Where do Trump and Biden stand on tech policy issues?
24856 Synopsys Announces Support of TensorFlow Lite for Microcontrollers on Energy-Efficient ARC EM and ARC HS Processor IP
24859

In [15]:
# Build the list of feeds whose position was not flagged as a duplicate.
# A set is used for the lookup so the pass stays linear in len(feeds)
# instead of rescanning the duplist list for every feed.
dup_lookup = set(duplist)
deduplicated = [
    record
    for position, record in enumerate(feeds)
    if position not in dup_lookup
]
len(deduplicated)

18116

In [16]:
# Testing to see if only non-duplicates were copied over with a known duplicate
x = 29
print('For index ' + str(x) + ':\n')
print('Original:\n'+ str(feeds[x]['title']) + '\n')
print('Deduplicated:\n'+ str(deduplicated[x]['title']))

# The printout compares list positions, which only shows a difference by eye
# and shifts as soon as any earlier record is removed. Check the invariants
# explicitly: no flagged id survives, and every feed is accounted for.
kept_ids = {record['id'] for record in deduplicated}
assert kept_ids.isdisjoint(duplist), 'a flagged duplicate is still in deduplicated'
assert len(deduplicated) + len(duplist) == len(feeds), 'kept + flagged != total feeds'

For index 29:

Original:
Former SNL comedian Jay Pharoah says he was racially profiled by police, detained with knee on his neck

Deduplicated:
Facebook tests Wikipedia-powered information panels, similar to Google, in its search results – TechCrunch


In [23]:
# Write the surviving records as JSON lines: one serialized record per line,
# in the same order as `deduplicated`.
with open("/Github/google_deduplicated.json", "w") as data_file:
    data_file.writelines(json.dumps(record) + "\n" for record in deduplicated)

In [24]:
# Read the json file back as a list of lines. The context manager closes the
# file handle once the lines are in memory.
with open("/Github/google_deduplicated.json") as dedup_file:
    google_deduplicated = dedup_file.readlines()

In [25]:
# Compare the line counts of the original and the deduplicated file and
# report how many records were dropped.

orig, new = len(google_json), len(google_deduplicated)

for prefix, count in (('The original json file had ', orig),
                      ('The new json file had ', new)):
    print(prefix + str(count) + ' records\n')

print('There are ' + str(orig-new) + ' fewer records by getting rid of the duplicates')

The original json file had 27798 records

The new json file had 18116 records

There are 9682 fewer records by getting rid of the duplicates
