In [1]:
# So, we want to be able to classify articles into tech and non-tech using K-NN.
# How will we go about this? (I think I might go for Daily Mail / Guardian instead)

# Additional: If anyone else is reading this, apologies - it could be neater. I wasn't
# expecting to have quite so much fun and go so far with it, hence why it's now 2:20am.

In [2]:
# We'll attempt the following steps:
# 1. Download a relevant corpus - pick a new website and extract two sets of article: tech and sport.
# 2. Represent each article as a vector of the 25 most important words in an article.
# 3. The distance between articles is calculated using the number of words that they have in common
# 4. Find the K-Nearest Neighbours and carry out a majority vote.

In [3]:
# This may get a bit hefty for Notepad, but we'll see.
# We'll control the counts from here:
article_count = 50
word_count    = 25
num_k         = 7

In [4]:
from urllib.request import urlopen
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from bs4 import BeautifulSoup
import re

In [5]:
ignore_list = set( stopwords.words('english')
                  + list(punctuation) 
                  + ['’',"'s","'it","'the","‘","'i","n't",'“','”','–','–','•','…','—'] 
                  + ['i','we','one','two','1','2','3'])

In [6]:
def extract_articles( link_prefix, link_list, extract_function, source_name, count ):
    processed = 0
    article_words = []
    for article_href in link_list:
        try:
            body_paragraphs = extract_function( link_prefix, article_href )
            if not body_paragraphs:
                raise ValueError('No body content found')
            article_blocks  = []
            for body in body_paragraphs:
                article_blocks.append( " ".join([ p.text.replace(u'\xa0', '') for p in body ]) )
            
            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( word_count, word_freq, 0.9, 0.1 )
            
            article_words.append({ 'source':source_name, 'words': tuple(top_words), 'href': article_href })
            processed += 1
            
        except:
            #raise
            print('ERROR: ' + article_href)
            continue
        print( "{:5d}: {}".format(processed, article_href) )
        if processed >= count:
            break
            
    return article_words

In [7]:
def count_words( text, ignore_list ):
    word_freq = defaultdict(int)
    for word in word_tokenize(text):
        word_lc = word.lower();
        if word_lc not in ignore_list and word not in ignore_list:
            word_freq[ word_lc ] += 1
            #if word_freq[word_lc] > 2:
                #print('       word at {:3d}: {}'.format(word_freq[word_lc], word_lc) )
    return word_freq

In [8]:
def top_frequencies( n, freq_list, max_cut, min_cut ):
    freq_edit = dict( freq_list )
    max_freq  = float( max(freq_list.values()) )
    
    for word in freq_list.keys():
        word_freq = freq_list[word] / max_freq
        if word_freq >= max_cut or word_freq <= min_cut:
            del freq_edit[ word ]
    #print( freq_edit )
    return nlargest( n, freq_edit, key=freq_edit.get )

In [9]:
def process_links( article_links, count ):
    unique_links = set([ re.sub('#(.*)$', '', href) for href in article_links ])
    use_total    = min( count, unique_links.__len__() )
    print( "%d unique links found, trying to use using %d" % (unique_links.__len__(), use_total) )
    return list( unique_links )    

In [10]:
def soup_up( url ):
    home_content  = urlopen( url ).read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    return home_soup

In [11]:
def crawlDailyMail( count, wordcount, ignore_list ):
    home_soup     = soup_up('http://www.dailymail.co.uk/home/index.html');
    article_links = [ a['href'] for a in home_soup.find_all('a', {'href': re.compile('/news/article-')}) ]
    link_list     = process_links( article_links, count )
    
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
        return [ a.find_all('p', {'class':'mol-para-with-font'}, recursive=False) for a in article_body ]
    
    return extract_articles(  'http://www.dailymail.co.uk', link_list, extract_content, 'daily mail', count )
        
print("Crawling the DM")
dm_vectors = crawlDailyMail( article_count, word_count, ignore_list )
#print( dm_vectors )

Crawling the DM
121 unique links found, trying to use using 50
    1: /news/article-5291015/Rape-case-collapses-prevent-getting-justice.html
    2: /news/article-5290905/Trump-asked-Marla-Maples-abort-Tiffany.html
    3: /news/article-5288715/Colin-Firth-says-NEVER-work-Woody-Allen-again.html
    4: /news/article-5282823/New-test-detect-STI-two-cent-Australians-have.html
    5: /news/article-5288221/Gauke-says-NOT-launch-judicial-review-Worboys.html
    6: /news/article-5291883/Topless-felons-revealed-extraordinary-mugshots.html
    7: /news/article-5291597/Russia-probes-Columbine-link-three-school-massacres.html
    8: /news/article-5291561/Ukip-leaders-estranged-wife-slams-husband-mistress.html
    9: /news/article-5279689/James-Franco-accuser-forced-FBI-death-threat.html
   10: /news/article-5288547/Michael-Douglas-accuser-speaks-TV-interview.html
   11: /news/article-5290983/Smart-motorway-horror-sees-lorry-crash-familys-car.html
   12: /news/article-5291537/Brazils-Sand-King-Marci

In [12]:
def crawlGuardian( count, wordcount, ignore_list ):
    home_soup     = soup_up('https://www.theguardian.com/uk-news');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('/\d{4}/[a-z]{3}/\d{1,2}/') }) ]
    link_list     = process_links( article_links, count )
    
    link_list = list(map( lambda href: href.replace('https://www.theguardian.com', ''), link_list ))
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
        return [ a.find_all('p', recursive=False) for a in article_body ]
    
    return extract_articles( 'https://www.theguardian.com', link_list, extract_content, 'guardian', count )
    
print("Crawling the Guardian")
gd_vectors = crawlGuardian( article_count, word_count, ignore_list )
#print( gd_vectors )    

Crawling the Guardian
52 unique links found, trying to use using 50
    1: /uk-news/2018/jan/17/man-convicted-of-theft-in-1976-cleared-after-googling-his-arresting-officer
    2: /uk-news/2018/jan/20/dixons-carphone-ceo-sebastian-james-will-step-down-to-run-chemist-chain-boots
    3: /us-news/2018/jan/19/donald-trump-and-theresa-may-to-meet-in-switzerland
    4: /politics/2018/jan/20/jewish-labour-group-accuses-failing-act-antisemitism
    5: /politics/2018/jan/19/boris-johnson-channel-bridge-france-emmanuel-macron
    6: /politics/2018/jan/09/theresa-may-learns-cabinet-reshuffle-all-pain-and-no-gain
ERROR: /weather/video/2018/jan/18/fierce-winds-fell-trees-and-cause-travel-chaos-across-the-uk-video
    7: /politics/2018/jan/19/trump-russia-inquiry-is-told-nigel-farage-may-have-given-julian-assange-data
    8: /theguardian/2018/jan/20/alexis-jay-ive-never-needed-therapy-i-did-get-angry-in-rotherham-though
    9: /world/2018/jan/20/britons-in-jamaicas-montego-bay-urged-to-stay-in-resort

In [13]:
def crawlIndependent( count, wordcount, ignorelist ):
    home_soup     = soup_up('http://www.independent.co.uk/');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('^/news/(.*)\d{5,}.html') }) ]
    link_list     = process_links( article_links, count )
   
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
        return [ a.find_all('p', recursive=False) for a in article_body ]
    
    return extract_articles(  'http://www.independent.co.uk', link_list, extract_content, 'independent', count )

print('Crawling The Independent')
id_vectors = crawlIndependent( article_count, word_count, ignore_list )
#print( id_vectors )

Crawling The Independent
54 unique links found, trying to use using 50
    1: /news/world/americas/donald-trump-colin-kaepernick-sportspeople-american-football-baseball-college-puerto-rico-a8167671.html
    2: /news/world/americas/turpin-family-latest-charges-parents-torture-imprisonment-children-david-louise-news-updates-a8166881.html
    3: /news/long_reads/france-autism-treatment-care-support-french-healthcare-a8161416.html
    4: /news/world/europe/spain-catalonia-direct-rule-carles-puigdesmont-brussels-govern-remote-belgium-mariano-rajoy-madrid-a8159801.html
    5: /news/world/americas/las-vegas-shooting-latest-child-abuse-images-stephen-paddock-fbi-marilou-danley-a8169491.html
    6: /news/long_reads/hunter-s-thompson-death-suicide-kill-himself-how-die-gonzo-journalism-warren-hinckle-a8161841.html
    7: /news/world/americas/aly-raisman-jordyn-wieber-larry-nassar-sexual-assault-you-are-pathetic-gmynastics-trial-latest-a8169046.html
    8: /news/world/americas/donald-trump-first-y

In [14]:
def crawlExpress( count, wordcount, ignorelist ):
    home_soup     = soup_up('https://www.express.co.uk/news' );
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('^/news/[a-z]*/\d*/') }) ]
    link_list     = process_links( article_links, count )
    
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('section', {'class':'text-description'})
        return [ a.find_all('p', recursive=False) for a in article_body ]
    
    return extract_articles( 'http://www.express.co.uk', link_list, extract_content, 'express', count )

print('Crawling The Express')
ex_vectors = crawlExpress( article_count, word_count, ignore_list )
#print( ex_vectors )

Crawling The Express
126 unique links found, trying to use using 50
    2: /news/politics/906957/theresa-may-emmanuel-macron-selfie-oscar-calais-deal
    3: /news/world/907239/China-kim-jong-un-trump-xi-trade-war-world-war-3-trade-deficit
    4: /news/nature/907082/Whale-CRISIS-Mystery-as-no-baby-northern-right-whales-seen-this-winter
    5: /news/world/907104/north-korea-news-japan-threat-kim-jong-un-abducted-Kazuhiro-Araki-world-war-3-ICC
    6: /news/science/907166/ai-predict-when-you-will-die
    7: /news/obituaries/907423/Peter-Mayle-dead-obituary-A-Year-In-Provence
    8: /news/world/907160/christian-whipped-selling-alcohol-sharia-law-indonesia
    9: /news/world/907328/Government-shutdown-US-Donald-Trump-Senate-vote-dreamers-funding-bill-wall
   10: /news/uk/907255/Horror-woman-and-child-sprayed-noxious-substance-acid-London
   11: /news/uk/907381/Dog-travelled-train-alone-Birmingham-New-Street-changed-Huddersfield-missing-appeal
   12: /news/world/907343/russia-vladimir-putin-s

In [15]:
def crawlTelegraph( count, wordcount, ignorelist ):
    home_soup     = soup_up('http://www.telegraph.co.uk/news/');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('^/news/\d{4}/') }) ]
    link_list     = process_links( article_links, count )

    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        premium_wall = article_soup.find('div', {'class': 'premium-paywall'})
        if premium_wall:
            raise Exception('Premium Article')
        article_body = article_soup.find_all('div', {'class':'article-body-text'})
        article_para = [ body.find_all('p') for body in article_body ]
        return article_para

    return extract_articles( 'http://www.telegraph.co.uk', link_list, extract_content, 'telegraph', count )

print('Crawling The Telegraph')
tg_vectors = crawlTelegraph( article_count, word_count, ignore_list )
#print( tg_vectors )

Crawling The Telegraph
66 unique links found, trying to use using 50
ERROR: /news/2018/01/13/army-recruitment-delayed-because13bn-computer-system-not-working/
    1: /news/2018/01/19/australia-pub-reopens-isolated-ghost-town-population-zero/
    2: /news/2018/01/18/british-quadriplegic-has-microchip-embedded-body-give-independence/
    3: /news/2018/01/18/bird-flu-prevention-zone-declared-across-england-farmers-called/
    4: /news/2018/01/19/wife-scotlands-chief-constable-hits-snp-minister-fightback-bullying/
    5: /news/2018/01/19/democratic-leader-meets-donald-trump-crisis-talks-us-federal/
ERROR: /news/2017/12/19/fbi-should-investigate-doping-sport-experts-says-justin-gatlin/
    6: /news/2018/01/19/oxford-student-cleared-rape-two-year-ordeal-prompts-police-force/
    7: /news/2018/01/19/gambias-former-dictator-could-extradited/
    8: /news/2018/01/19/raf-fears-lake-district-zip-wires-would-fighter-jets/
    9: /news/2018/01/20/thousands-march-washington-donald-trump-supported-an

In [16]:
class VectorSpace:
    
    def __init__( self, training_data, ignore_list ):
        self._training_data = training_data
        self._ignore_list   = ignore_list
    
    def _get_class_pc( self ):
        pass
    
    def test( self, text, wordcount, k ):
        neighbours = defaultdict(int)
        word_freq  = count_words( text, self._ignore_list )
        test_set   = set( word_freq )
        
        for article in self._training_data:
            neighbours[ (article['source'], article['href']) ] = len( set(article['words']).intersection(test_set) )
        
        k_nearest       = nlargest( k, neighbours, key=neighbours.get )
        classifications = defaultdict(int)
        for neighbour in k_nearest:
            classifications[ neighbour[0] ] += 1
        
        return [ (k, classifications[k]) for k in sorted(classifications, key=classifications.get, reverse=True) ]
        
training_data = dm_vectors + gd_vectors + id_vectors + ex_vectors + tg_vectors
vs = VectorSpace( training_data, ignore_list )

In [17]:
# Let's test the BBC!
bbc_k = 7
count = 25 

print('Getting BBC links...')
bbc_soup  = soup_up('http://www.bbc.com/news')
bbc_links = [ a['href'] for a in bbc_soup.find_all('a', { 'href': re.compile('^/news/(.*)\d{6,}$') }) ]
link_list = process_links( bbc_links, count )

processed = 0;
for test_link in link_list:
    print('Testing ' + test_link )
    test_soup = soup_up('http://www.bbc.com' + test_link)
    print( test_soup.title.text )

    test_body    = test_soup.find('div', {'property':'articleBody'})
    if not test_body:
        count += 1
        print(' -- No test body found, continuing...\n')
        continue
    test_para    = test_body.find_all('p', recursive=False )
    if not test_para:
        print(' -- No paragraphs found, continuing...\n')
        continue
    
    test_text   = ' '.join([ p.text for p in test_para ])
    test_result = vs.test(test_text, word_count, bbc_k) 
    print(", ".join([ '{}% {}'.format(int(c[1]/bbc_k*100), c[0]) for c in test_result ]))
    print('')
    
    processed += 1
    if( processed >= count ):
        break
    


Getting BBC links...
41 unique links found, trying to use using 25
Testing /news/business-22434141
Entrepreneurship - BBC News
 -- No test body found, continuing...

Testing /news/science-environment-42747272
Nasa removes US astronaut from ISS mission - BBC News
57% independent, 28% guardian, 14% daily mail

Testing /news/world-us-canada-42653793
Ten ways Trump has changed America - BBC News
71% independent, 14% express, 14% telegraph

Testing /news/world-us-canada-42754607
Nassar case: Gold medallists Raisman and Wieber face abuser - BBC News
 -- No test body found, continuing...

Testing /news/world-asia-42758184
Asian wildlife trafficking 'kingpin' Boonchai Bach arrested - BBC News
57% telegraph, 14% independent, 14% guardian, 14% daily mail

Testing /news/entertainment-arts-42746612
Profoundly deaf Maisie Sly is the star of a short film which may be nominated for an Oscar - BBC News
 -- No test body found, continuing...

Testing /news/entertainment-arts-42746617
Pad Man: A man's 'p