## Library imports

In [1]:
import json
import operator
import os
import re

import gensim
import numpy as np
import webhoseio
from gensim.models import KeyedVectors
from simhash import Simhash, SimhashIndex

## Word2Vec model

In [2]:
# Directory that holds the word-vector file. load_wordvec_model() builds the
# full path by plain string concatenation, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_path = 'C:/Users/tramh/github/Data-Science-Portfolio/Airlines Covid-19/data/'

In [3]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load a word2vec-format vector file located under ``model_path``.

    Args:
        modelName: Human-readable label, used only in the progress messages.
        modelFile: File name appended to the module-level ``model_path``.
        flagBin: Forwarded as ``binary=`` to
            ``KeyedVectors.load_word2vec_format``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    print('Loading ' + modelName + ' model...')
    vector_file = model_path + modelFile
    loaded_vectors = KeyedVectors.load_word2vec_format(vector_file, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return loaded_vectors

In [4]:
# Load the Google News vectors once; the duplicate-detection cell passes this
# object to calc_similarity(). The third argument (True) is forwarded as binary=.
model_w2v_AP    = load_wordvec_model('Word2Vec Google News', 'GoogleNews-vectors-negative300.bin.gz', True)

Loading Word2Vec Google News model...
Finished loading Word2Vec Google News model...


## Functions used 

In [5]:
def vocab_check(vectors, words):
    """Return the words that are present in the model vocabulary.

    Each word is stripped of surrounding whitespace *before* the membership
    test, so the returned token is the same one that was looked up.

    Args:
        vectors: Word-vector model. Its vocabulary is read from
            ``key_to_index`` when that attribute exists (gensim >= 4) and
            from ``vocab`` otherwise (older gensim releases).
        words: Iterable of candidate word strings.

    Returns:
        A list of the stripped words found in the vocabulary, in input order.
    """
    # gensim 4 renamed KeyedVectors.vocab to key_to_index; support both.
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    output = []
    for word in words:
        token = word.strip()
        if token in vocabulary:
            output.append(token)

    return output

In [6]:
def calc_similarity(input1, input2, vectors):
    """Score how similar two strings are under a word-vector model.

    Both strings are split on whitespace, filtered through ``vocab_check``
    and de-duplicated into sets; the two sets are then handed to
    ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity``.

    Returns:
        Whatever ``vectors.n_similarity`` returns for the two word sets.
    """
    first_words, second_words = (
        set(vocab_check(vectors, text.split())) for text in (input1, input2)
    )
    return vectors.n_similarity(first_words, second_words)

In [7]:
def cleanup(input):
    """Expand a few contractions and strip punctuation from a string.

    The typographic apostrophe (U+2019) is first normalised to a straight
    one so that every contraction rule applies to both spellings. Then
    ``'s`` becomes a space, ``n't`` becomes `` not`` and ``'ve`` becomes
    `` have``. Finally every character that is not an ASCII letter, digit
    or space is removed. Stop words are NOT removed.

    Args:
        input: The text to clean.

    Returns:
        The cleaned string.
    """
    # Normalise curly apostrophes so one set of rules covers both forms.
    text = input.replace("’", "'")
    text = text.replace("'s", " ").replace("n't", " not").replace("'ve", " have")
    # Drop everything except ASCII letters, digits and spaces.
    return re.sub(r'[^a-zA-Z0-9 ]', '', text)

## Load data 

In [8]:
# Read the raw file into a list of lines; each line is parsed as one JSON
# object by the next cell. The context manager closes the file handle.
with open("C:/Users/tramh/github/Data-Science-Portfolio/Airlines Covid-19/data/Airlines_2.json") as airlines_file:
    airlines_json = airlines_file.readlines()

In [9]:
# Parse every raw line into a dict and tag it with its position in the file,
# so that feeds[n]['id'] == n.
feeds = []
for position, raw_line in enumerate(airlines_json):
    record = json.loads(raw_line)
    record['id'] = position
    feeds.append(record)

## Simhash Model

In [10]:
# Setting parameters
# hamming_distance is passed as k= to SimhashIndex when the index is built.
hamming_distance = 20
# Minimum calc_similarity() score (compared with >=) for a SimHash candidate
# to be recorded as a duplicate.
w2v_score = 0.7

In [11]:
# Build the SimHash index over all titles.
import logging

# Only let CRITICAL messages from the 'simhash' logger through.
logging.getLogger('simhash').setLevel(logging.CRITICAL)

# One (id-as-string, Simhash-of-title) pair per record.
objs = [
    (str(record['id']), Simhash(str(record['title'])))
    for record in feeds
]
index = SimhashIndex(objs, k=hamming_distance)

In [12]:
# Finding duplicates.
# For every record not yet flagged, SimHash proposes near-duplicate candidates
# and the word-vector similarity confirms them (score >= w2v_score).
# `duplist` collects the ids of confirmed duplicates in discovery order;
# `dup_seen` mirrors it as a set so membership tests do not rescan the list.
# Record ids equal list positions (assigned when `feeds` was built), so a
# position can be tested against the set of ids.

duplist = []
dup_seen = set()

for feed in range(len(feeds)):
    # A record already flagged as a duplicate is not used as a reference.
    if feed in dup_seen:
        continue

    feed_sel = feeds[feed]
    feed_hash = Simhash(str(feed_sel['title']))
    dup_indices = index.get_near_dups(feed_hash)

    for dupi in dup_indices:
        dup_pos = int(dupi)
        if dup_pos in dup_seen:
            continue

        try:
            score = calc_similarity(feed_sel['title'], feeds[dup_pos]['title'], model_w2v_AP)
        except Exception:
            # Best effort: a pair that cannot be scored counts as "not similar".
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            # NOTE(review): presumably raised when a title has no in-vocabulary
            # words — confirm which exception types actually occur.
            score = 0

        # The reference record is among its own candidates; never flag it.
        if score >= w2v_score and feeds[dup_pos]['id'] != feed_sel['id']:
            dup_id = feeds[dup_pos]['id']
            duplist.append(dup_id)
            dup_seen.add(dup_id)


In [13]:
# Report the dataset size, the duplicate count and the duplicate percentage.
total_count = len(feeds)
dup_count = len(duplist)
print(f'The original dataset is {total_count} values')
print(f'The number of duplicates is {dup_count}')
print(f'The dataset has {round(dup_count / total_count * 100, 4)}% duplicates')

The original dataset is 20015 values
The number of duplicates is 6674
The dataset has 33.345% duplicates


## Testing deduplicated results

In [14]:
# Eyeball check: list every flagged record (id and title) in id order.
for dup_id in sorted(duplist):
    duplicate = feeds[dup_id]
    print(duplicate['id'], duplicate['title'])

9 Turkish Airlines Boeing 777-F
12 Turkmenistan Airlines Boeing 777-200LR
13 Asia Pacific Airlines (Guam) Boeing 727-200F
14 Turkish Airlines Boeing 777-F
15 American Airlines will resume booking flights to capacity, as COVID-19 cases soar
16 Global Airlines Launch Probes Against Pakistani Pilots
23 PIA, Pakistan’s national airline, has grounded a third of its pilots for having fake licenses – CNN
39 Airlines Selling the Middle Seat Again
44 Major U.S. Airlines Announce Health Acknowledgment Requirement
45 Southwest Airlines stock to jump 47%, Goldman Sachs gives 'buy' rating - Business Insider
67 Thinking about trading options or stock in Nikola Corp, Plug Power, General Electric, Facebook, or United Airlines?
71 Southwest Airlines (NYSE:LUV) Upgraded by Goldman Sachs Group to Buy
75 American Airlines will resume booking flights to capacity, as COVID-19 cases soar
84 New Mexico Educational Retirement Board Sells 7,900 Shares of Southwest Airlines Co (NYSE:LUV)
101 Porter Airlines exte

2819 American Airlines says it’s overstaffed by 20,000 employees for fall schedule
2823 US Senator blasts American Airlines for packing the middle seats on his flight - CNN
2825 Toys Airline Onahole - EUR 13
2826 US Senator blasts American Airlines for packing the middle seats on his flight
2829 Coronavirus lockdown impact: Air France and sister airline to cut 7,580 jobs
2830 EU safety agency suspends Pakistani airlines’ European authorisation
2831 US Senator blasts American Airlines for packing the middle seats on his flight
2833 Alaska Airlines plans to give passengers yellow cards for refusing to wear face masks
2836 Man is kicked off a Spirit Airlines flight to Florida for refusing to wear a face mask
2838 Sen. Jeff Merkley blasts American Airlines for packing the middle seats on his plane
2839 Technology that once cleaned sports equipment helps airline industry take off
2841 Can airlines make passengers wear masks?
2842 US Senator blasts American Airlines for packing the middle se

5434 OE-LWH EMBRAER ERJ-195LR (190-200LR) Austrian Airlines @ M…
5436 American Airlines and United Airlines Cancel Their Hong Kong Flights Due To Crew Testing Requirement
5438 A321neo | Middle East Airlines Air Liban
5439 Bjorn’s Corner: Do I get COVID in airline cabins? Part 10. Trans-Atlantic trip. - Leeham News and Analysis
5442 Thinking about trading options or stock in Veritone Inc, Six Flags Entertainment, Facebook, Vivint Solar, or Southwest Airlines?
5443 US bans Pakistan International Airlines flights over pilot concerns
5444 Austrian Airlines Cancellation Policy
5448 EgyptAir, Ethiopian Airlines resume flights to Dubai
5450 SE-RJF Airbus A320-232 SAS Scandinavian Airlines @ MAN/EGC…
5452 N349AN Boeing 767-323ER (W) American Airlines @ MAN/EGCC 1…
5453 American Airlines Group Announces Webcast of Second-Quarter 2020 Financial Results
5455 Several Airlines Have Cancelled Agreements With Their Contract Lounges, Remember To Carry Your Priority Pass
5456 The Airline Bailout Loopho

10045 Alaska Airlines adds 12 new destinations in 2020 from LAX
10047 China Southern Airlines Boeing 737-81B(WL) B-5340
10049 Deputies: 3 women attacked airline workers over flight delay
10050 American Airlines Calls to Extend Billions in Aid Through 2021
10051 Brasada Capital Management LP Acquires New Position in Southwest Airlines Co (NYSE:LUV)
10053 American Airlines and JetBlue are trying to corner the market in the Northeast in an unexpected new partnership (AAL, JBLU)
10054 Deputies: 3 women attacked airline workers over flight delay
10055 Deputies: 3 women attacked airline workers over flight delay
10057 Key U.S. lawmakers back unions call for new airline bailout
10058 Key U.S. lawmakers back unions call for new airline bailout
10059 Key U.S. lawmakers back unions call for new airline bailout
10060 9,798 Shares in Southwest Airlines Co (NYSE:LUV) Acquired by Icon Wealth Partners LLC
10063 Key U.S. lawmakers back unions call for new airline bailout
10064 Key U.S. lawmakers back 

14455 United Airlines' Mask Mandate Expands to Areas in Airports
14457 Orient Thai Airlines | Boeing 737-3J6 | HS-BRL | VHHH/HKG
14459 United Airlines to require passengers to wear masks in airports
14461 Ethiopian Airlines cargo plane catches fire at Shanghai airport, no casualties
14463 United Airlines Defers Plane Deliveries To Beyond 2022 As Air Travel Remains Muted
14465 Airlines for America Applauds EPA Proposal to Adopt ICAO Aircraft Emissions Standard
14467 Taiwan parliament passes proposal to rebrand China Airlines
14468 United Airlines' revenue dropped 87% in second quarter
14470 United Airlines Defers Plane Deliveries To Beyond 2022 As Air Travel Remains Muted
14471 Spirit Airlines Airbus 320-232 - NKS928 | Taxiing for depart…
14472 United Airlines reports $1.6bn loss
14473 United Airlines Defers Plane Deliveries To Beyond 2022 As Air Travel Remains Muted
14474 Company Profile for Airlines Reporting Corporation
14475 A321-253NX, American Airlines, F-WZMJ, N419AN (MSN 10017)


17292 Majority of U.S. House backs new bailout for U.S. passenger airlines
17293 36,150 Shares in Southwest Airlines Co (NYSE:LUV) Purchased by Mount Vernon Associates Inc. MD
17294 Majority of U.S. House backs new bailout for U.S. passenger airlines
17295 Majority of U.S. House backs new bailout for U.S. passenger airlines | News | i92.9
17296 Majority of U.S. House backs new bailout for U.S. passenger airlines
17297 Budget airline AirAsia’s future in ‘significant doubt’
17298 United Airlines to lay off 556 in Tampa, Orlando
17299 FLIGHT MH17 DEJA VU: CIA LIKELY PLOTTING JULY 29, 2020, RUSSIAN MISSILE STRIKE TARGETING COMMERCIAL AND/OR MILITARY AIRCRAFT SPECIFICALLY TO TRIGGER WORLD WAR III (JULY 27, 2020): CIA Headquarters Located Beneath CERN at Lake Geneva in Switzerland Plotting Russian Missile-Based Attack Targeting Commercial and/or Military Aircraft on July 29, 2020, Exactly 2,204-Days After CIA Staged Alleged Russian Missile Strike Targeting Malaysian Airlines Flight MH17 Back

In [15]:
# Build the list of records that were not flagged as duplicates.
# A set gives constant-time lookups; testing against the list would rescan
# it for every record.
dup_lookup = set(duplist)
deduplicated = [feeds[pos] for pos in range(len(feeds)) if pos not in dup_lookup]
len(deduplicated)

13341

In [16]:
# Write the kept records as JSON lines: one serialised object per line.
with open("C:/Users/tramh/github/Data-Science-Portfolio/Airlines Covid-19/data/Airlines_dedup.json", "w") as data_file:
    data_file.writelines(json.dumps(record) + "\n" for record in deduplicated)

## Read the deduplicated file 

In [17]:
# Read the deduplicated file back as a list of lines; the context manager
# closes the file handle once the lines are loaded.
with open("C:/Users/tramh/github/Data-Science-Portfolio/Airlines Covid-19/data/Airlines_dedup.json") as dedup_file:
    airlines_deduplicated = dedup_file.readlines()

In [19]:
# Compare the line counts of the original and the deduplicated files.

orig = len(airlines_json)
new = len(airlines_deduplicated)

print(f'The original json file had {orig} records\n')
print(f'The new json file had {new} records\n')
print(f'There are {orig - new} fewer records by getting rid of the duplicates')

The original json file had 20015 records

The new json file had 13341 records

There are 6674 fewer records by getting rid of the duplicates
