In [1]:
# So, we want to be able to classify articles into tech and non-tech using K-NN.
# How will we go about this? (I think I might go for Daily Mail / Guardian instead)

# Additional: If anyone else is reading this, apologies - it could be neater. I wasn't
# expecting to have quite so much fun and go so far with it, hence why it's now 2:20am.

In [2]:
# We'll attempt the following steps:
# 1. Download a relevant corpus - pick a news website and extract two sets of articles: tech and sport.
# 2. Represent each article as a vector of its N most important words (originally planned as 25;
#    N is set by word_count in the next cell).
# 3. The distance between articles is calculated using the number of words that they have in common.
# 4. Find the K-Nearest Neighbours and carry out a majority vote.

In [3]:
# Tunable settings, gathered in one cell so every later cell shares them.
# (This may get a bit hefty for Notepad, but we'll see.)
article_count = 30  # passed to each crawler as the number of articles to vectorise
word_count = 10  # number of top words kept per article
num_k = 7  # K for the K-NN vote; NOTE(review): not referenced in the visible cells - confirm it is still needed

In [4]:
from urllib.request import urlopen
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from bs4 import BeautifulSoup
import re

In [5]:
# Tokens that are never counted as article vocabulary: NLTK's English stop
# words, every ASCII punctuation character, typographic quotes/dashes and
# tokenizer fragments, plus a few very common words and digits.
ignore_list = set( stopwords.words('english') ).union(
    punctuation,
    ['’',"'s","'it","'the","‘","'i","n't",'“','”','–','–','•','…'],
    ['i','we','one','two','1','2','3'],
)

In [6]:
def count_words( text, ignore_list ):
    """Count how often each lower-cased token occurs in `text`.

    The text is split with `word_tokenize`. A token is skipped when either its
    original spelling or its lower-cased form appears in `ignore_list`.

    Returns:
        A defaultdict(int) mapping lower-cased token -> number of occurrences.
    """
    tally = defaultdict(int)
    for token in word_tokenize(text):
        lowered = token.lower()
        if token in ignore_list or lowered in ignore_list:
            continue
        tally[ lowered ] += 1
    return tally

In [7]:
def top_frequencies( n, freq_list, max_cut, min_cut ):
    """Return up to `n` words from `freq_list`, most frequent first, with extremes trimmed.

    Every count is divided by the largest count in `freq_list`. Words whose
    relative frequency is >= `max_cut` or <= `min_cut` are discarded before the
    remaining words are ranked by raw count. `freq_list` is not modified.

    Raises:
        ValueError: if `freq_list` is empty (raised by max()).
    """
    peak = float( max(freq_list.values()) )

    kept = {}
    for word, hits in freq_list.items():
        ratio = hits / peak
        if ratio >= max_cut or ratio <= min_cut:
            continue
        kept[ word ] = hits

    return nlargest( n, kept, key=kept.get )

In [8]:
def crawlDailyMail( count, wordcount, ignore_list ):
    """Crawl the Daily Mail homepage and build a top-word vector per news article.

    Links whose href contains '/news/article-' are collected from the homepage,
    de-duplicated (ignoring any '#fragment' suffix) and fetched one at a time
    until `count` articles have been vectorised.

    Args:
        count: Number of articles to vectorise; values <= 0 fetch no articles.
        wordcount: Number of top words to keep per article.
        ignore_list: Collection of tokens excluded from the word counts.

    Returns:
        A list of dicts with keys 'source' ("daily mail"), 'words' (tuple of
        the article's top words) and 'href' (the link as found on the homepage).
        Articles that cannot be fetched/parsed, or that yield no countable
        words, are reported on an 'ERROR: ...' line and skipped.
    """
    homepage = 'http://www.dailymail.co.uk/home/index.html'
    # Context manager guarantees the HTTP response is closed.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('/news/article-') }) ]
    unique_links  = set( re.sub('#(.*)$', '', href) for href in article_links )
    print( "%d unique links found, using %d" % (len(unique_links), count) )
    link_list     = list( unique_links )

    processed = 0
    article_words = []
    for article_href in link_list:
        # Checked before fetching so that count <= 0 downloads nothing.
        if processed >= count:
            break
        try:
            with urlopen( 'http://www.dailymail.co.uk' + article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup    = BeautifulSoup( article_content, 'html.parser' )
            article_body    = article_soup.find_all('div', {'itemprop':'articleBody'})
            # Only direct <p class="mol-para-with-font"> children of each body div are used.
            body_paragraphs = [ a.find_all('p', {'class':'mol-para-with-font'}, recursive=False) for a in article_body ]
            article_blocks  = [ " ".join( p.text.replace(u'\xa0', '') for p in body ) for body in body_paragraphs ]

            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            if not word_freq:
                # Explicit skip instead of relying on top_frequencies failing on empty input.
                raise ValueError('No text to analyze')
            # Drop words whose count relative to the most frequent word is >= 0.9 or <= 0.1.
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )

            article_words.append({ 'source':"daily mail", 'words': tuple(top_words), 'href': article_href })
            processed += 1

        except Exception as err:
            # Best-effort crawl: report why the article was skipped and move on.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('ERROR: {} ({}: {})'.format(article_href, type(err).__name__, err))
            continue
        print( "{:5d}: {}".format(processed, article_href) )

    return article_words
        
# Build the Daily Mail training vectors (this performs live HTTP requests).
print("Crawling the DM")
dm_vectors = crawlDailyMail(
    count=article_count,
    wordcount=word_count,
    ignore_list=ignore_list,
)

Crawling the DM
159 unique links found, using 30
    1: /news/article-5289475/Finding-Stourhead-Manors-secret-garden.html
    2: /news/article-5286477/British-author-Peter-Mayle-dies-aged-78-illness.html
ERROR: /news/article-5289171/Met-Office-issue-amber-alert-FOOT-snow-tonight.html
    3: /news/article-5288985/Westminster-survivor-told-drink-warm-milk-sleep.html
    4: /news/article-5286639/Bill-Melinda-Gates-jumper-daughter-lives-up.html
    5: /news/article-5288643/Pilot-denies-murdering-estranged-nail-technician-wife.html
    6: /news/article-5288409/Jewellery-designer-pleads-gangster-ring-return.html
    7: /news/article-5290787/Boy-15-posed-head-CIA-secret-files.html
    8: /news/article-5290299/Why-DJ-Kid-Jensen-revealed-Parkinsons.html
    9: /news/article-5288273/Russian-veteran-builds-Iron-Man-suit-fight-bailiffs.html
   10: /news/article-5289729/Woman-SPRAYS-mother-nine-year-old-girl-acid.html
   11: /news/article-5290233/Hilarious-video-shows-cat-overwhelmed-plastic-bags.h

In [9]:
def crawlGuardian( count, wordcount, ignore_list ):
    """Crawl the Guardian UK-news front page and build a top-word vector per article.

    Links whose href contains a '/YYYY/mon/D/' date segment are collected,
    de-duplicated (ignoring any '#fragment' suffix) and fetched one at a time
    until `count` articles have been vectorised. The hrefs are passed to
    urlopen as-is, i.e. they are expected to be absolute URLs.

    Args:
        count: Number of articles to vectorise; values <= 0 fetch no articles.
        wordcount: Number of top words to keep per article.
        ignore_list: Collection of tokens excluded from the word counts.

    Returns:
        A list of dicts with keys 'source' ('guardian'), 'words' (tuple of the
        article's top words) and 'href' (the link with the
        'https://www.theguardian.com/' prefix removed). Articles that cannot
        be fetched/parsed, or that yield no countable words, are reported on
        an 'ERROR: ...' line and skipped.
    """
    homepage = 'https://www.theguardian.com/uk-news'
    # Context manager guarantees the HTTP response is closed.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile(r'/\d{4}/[a-z]{3}/\d{1,2}/') }) ]
    unique_links  = set( re.sub('#(.*)$', '', href) for href in article_links )
    print( "%d unique links found, using %d" % (len(unique_links), count) )
    link_list     = list( unique_links )

    processed = 0
    article_words = []
    for article_href in link_list:
        # Checked before fetching so that count <= 0 downloads nothing.
        if processed >= count:
            break
        # Computed outside the try block so the error handler can always use it.
        display_href = article_href.replace('https://www.theguardian.com/', '')
        try:
            with urlopen( article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup    = BeautifulSoup( article_content, 'html.parser' )
            article_body    = article_soup.find_all('div', {'itemprop':'articleBody'})
            # Only direct <p> children of each body div are used.
            body_paragraphs = [ a.find_all('p', recursive=False) for a in article_body ]
            article_blocks  = [ " ".join( p.text.replace(u'\xa0', '') for p in body ) for body in body_paragraphs ]

            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            if not word_freq:
                # Explicit skip instead of relying on top_frequencies failing on empty input.
                raise ValueError('No text to analyze')
            # Drop words whose count relative to the most frequent word is >= 0.9 or <= 0.1.
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )

            article_words.append({ 'source': 'guardian', 'words': tuple(top_words), 'href': display_href })
            processed += 1

        except Exception as err:
            # Best-effort crawl: report why the article was skipped and move on.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('ERROR: {} ({}: {})'.format(display_href, type(err).__name__, err))
            continue
        print( "{:5d}: {}".format(processed, display_href) )

    return article_words
    
# Build the Guardian training vectors (this performs live HTTP requests).
print("Crawling the Guardian")
gd_vectors = crawlGuardian(
    count=article_count,
    wordcount=word_count,
    ignore_list=ignore_list,
)

Crawling the Guardian
57 unique links found, using 30
    1: politics/2018/jan/19/a-garden-bridge-over-troubled-water
    2: law/2018/jan/19/oxford-student-case-oliver-mears-dropped-days-before-trial
    3: film/2018/jan/19/paddington-2-becomes-best-reviewed-film-ever
    4: business/2018/jan/19/carillion-collapse-expected-to-further-delay-building-at-two-major-hospitals
ERROR: politics/video/2018/jan/16/has-this-mp-fallen-asleep-during-ken-clarkes-speech-video-desmond-swayne
    5: politics/2018/jan/19/corbyn-aide-david-prescott-lost-gmb-backing-after-behaviour-allegations
    6: uk-news/2018/jan/19/football-agent-peter-morrison-jailed-death-dangerous-driving
    7: uk-news/2018/jan/19/woman-jailed-after-backing-out-of-suicide-pact-with-man
    8: business/2018/jan/15/the-four-contracts-that-finished-carillion-public-private-partnership
    9: society/2018/jan/19/esther-mcvey-makes-disability-benefits-u-turn-over-payments
   10: uk-news/2018/jan/18/blackpool-beach-year-life-seaside-to

In [10]:
def crawlIndependent( count, wordcount, ignorelist ):
    """Crawl the Independent homepage and build a top-word vector per news article.

    Links whose href starts with '/news/' and ends in a 5+ digit id followed by
    '.html' are collected, de-duplicated (ignoring any '#fragment' suffix) and
    fetched one at a time until `count` articles have been vectorised.

    Args:
        count: Number of articles to vectorise; values <= 0 fetch no articles.
        wordcount: Number of top words to keep per article.
        ignorelist: Collection of tokens excluded from the word counts.

    Returns:
        A list of dicts with keys 'source' ('independent'), 'words' (tuple of
        the article's top words) and 'href' (the link as found on the
        homepage). Articles that cannot be fetched/parsed, or that yield no
        countable words, are reported on an 'ERROR: ...' line and skipped.
    """
    homepage = 'http://www.independent.co.uk/'
    # Context manager guarantees the HTTP response is closed.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    # Raw string avoids the invalid '\d' escape; the dot before 'html' is now literal.
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile(r'^/news/(.*)\d{5,}\.html') }) ]
    unique_links  = set( re.sub('#(.*)$', '', href) for href in article_links )
    print( "%d unique links found, using %d" % (len(unique_links), count) )
    link_list     = list( unique_links )

    processed = 0
    article_words = []
    for article_href in link_list:
        # Checked before fetching so that count <= 0 downloads nothing.
        if processed >= count:
            break
        try:
            with urlopen( 'http://www.independent.co.uk' + article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup    = BeautifulSoup( article_content, 'html.parser' )
            article_body    = article_soup.find_all('div', {'itemprop':'articleBody'})
            # Only direct <p> children of each body div are used.
            body_paragraphs = [ a.find_all('p', recursive=False) for a in article_body ]
            if not body_paragraphs:
                raise ValueError('No text to analyze')

            article_blocks  = [ " ".join( p.text.replace(u'\xa0', '') for p in body ) for body in body_paragraphs ]

            article_text = " ".join( article_blocks )
            # Use the parameter, not the notebook-level ignore_list global.
            word_freq = count_words( article_text, ignorelist )
            if not word_freq:
                # Body divs were present but produced no countable words.
                raise ValueError('No text to analyze')

            # Drop words whose count relative to the most frequent word is >= 0.9 or <= 0.1.
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )

            article_words.append({ 'source': 'independent', 'words': tuple(top_words), 'href': article_href })
            processed += 1

        except Exception as err:
            # Best-effort crawl: report why the article was skipped and move on.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('ERROR: {} ({}: {})'.format(article_href, type(err).__name__, err))
            continue
        print( "{:5d}: {}".format(processed, article_href) )

    return article_words

# Build the Independent training vectors (this performs live HTTP requests).
print('Crawling The Independent')
id_vectors = crawlIndependent(
    count=article_count,
    wordcount=word_count,
    ignorelist=ignore_list,
)

Crawling The Independent
54 unique links found, using 30
    1: /news/uk/home-news/grenfell-tower-fire-latest-updates-kensington-council-public-meetings-humiliated-residents-anger-a8167636.html
    2: /news/world/americas/donald-trump-colin-kaepernick-sportspeople-american-football-baseball-college-puerto-rico-a8167671.html
    3: /news/world/asia/antelopes-saiga-population-why-extinct-mystery-solved-a8166966.html
    4: /news/world/europe/catalan-independence-catalonia-parliament-carles-puigdemont-spain-direct-rule-a8164451.html
    5: /news/uk/crime/oliver-mears-rape-case-surrey-police-cleared-lack-evidence-disclosure-oxford-university-student-a8168006.html
    6: /news/world/americas/us-politics/james-comey-former-fbi-director-ethical-leadership-class-william-and-mary-college-williamsburg-a8168731.html
    7: /news/world/americas/turpin-family-latest-what-happened-siblings-children-perris-neighbourhood-california-a8165081.html
    8: /news/uk/crime/john-worboys-latest-updates-prison

In [14]:
def crawlExpress( count, wordcount, ignorelist ):
    """Crawl the Express news page and build a top-word vector per article.

    Links whose href matches '^/news/[a-z]*/\\d*/' are collected, de-duplicated
    (ignoring any '#fragment' suffix) and fetched one at a time until `count`
    articles have been vectorised.

    Args:
        count: Number of articles to vectorise; values <= 0 fetch no articles.
        wordcount: Number of top words to keep per article.
        ignorelist: Collection of tokens excluded from the word counts.

    Returns:
        A list of dicts with keys 'source' ('express'), 'words' (tuple of the
        article's top words) and 'href' (the link as found on the news page).
        Articles that cannot be fetched/parsed, or that yield no countable
        words, are reported on an 'ERROR: ...' line and skipped.
    """
    homepage = 'https://www.express.co.uk/news'
    # Context manager guarantees the HTTP response is closed.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile(r'^/news/[a-z]*/\d*/') }) ]
    unique_links  = set( re.sub('#(.*)$', '', href) for href in article_links )
    print( "%d unique links found, using %d" % (len(unique_links), count) )
    link_list     = list( unique_links )

    processed = 0
    article_words = []
    for article_href in link_list:
        # Checked before fetching so that count <= 0 downloads nothing.
        if processed >= count:
            break
        try:
            with urlopen( 'https://www.express.co.uk' + article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup    = BeautifulSoup( article_content, 'html.parser' )
            article_body    = article_soup.find_all('section', {'class':'text-description'})
            # Only direct <p> children of each matching section are used.
            body_paragraphs = [ a.find_all('p', recursive=False) for a in article_body ]
            if not body_paragraphs:
                raise ValueError('No text to analyze')

            article_blocks  = [ " ".join( p.text.replace(u'\xa0', '') for p in body ) for body in body_paragraphs ]

            article_text = " ".join( article_blocks )
            # Use the parameter, not the notebook-level ignore_list global.
            word_freq = count_words( article_text, ignorelist )
            if not word_freq:
                # Sections were present but produced no countable words.
                raise ValueError('No text to analyze')
            # Drop words whose count relative to the most frequent word is >= 0.9 or <= 0.1.
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )

            article_words.append({ 'source': 'express', 'words': tuple(top_words), 'href': article_href })
            processed += 1

        except Exception as err:
            # Best-effort crawl: report why the article was skipped and move on.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print('ERROR: {} ({}: {})'.format(article_href, type(err).__name__, err))
            continue
        print( "{:5d}: {}".format(processed, article_href) )

    return article_words

# Build the Express training vectors (this performs live HTTP requests).
print('Crawling The Express')
ex_vectors = crawlExpress(
    count=article_count,
    wordcount=word_count,
    ignorelist=ignore_list,
)

Crawling The Express
126 unique links found, using 30
    1: /news/politics/907230/Emmanuel-Macron-Theresa-May-UK-France-summit-Sandhurst-Oscars-selfie
    2: /news/uk/906996/Jacob-Rees-Mogg-Brexit-news-House-of-Lords-EU-withdrawal-bill
    3: /news/politics/907163/Jacob-Rees-Mogg-Theresa-May-Brexit-EU-divorce-Remoaners-House-of-Lords-bill
    4: /news/uk/907290/Summer-holidays-2018-summer-break-school-holidays
    5: /news/nature/907161/big-cats-bbc-one-bay-cat-rare-endangered-species-caught-on-camera-borneo-jungle
    6: /news/uk/907275/NHS-botched-care-Jeanette-Bailey-nurse-died-weeks-agony-angina-heart-troubles
    7: /news/uk/907184/Romania-EU-prostitutes-trafficked-Redbridge-police-close-one-brothel-per-week
    8: /news/world/907131/North-Korea-latest-update-World-War-3-US-Donald-Trump-military-Kim-Jong-un-nuclear-video
    9: /news/uk/907247/Brexit-news-latest-update-UK-France-EU-European-Union-Theresa-May-Emmanuel-Macron-video
   10: /news/royal/907158/Meghan-Markle-royal-wedd

In [15]:
class VectorSpace:
    """K-nearest-neighbour voter over articles described by their top words.

    Each training item is a dict with 'source', 'href' and 'words' keys;
    similarity to a text is the number of the item's words found in that text.
    """

    def __init__( self, training_data, ignore_list ):
        """Store the training articles and the tokens to ignore when counting words."""
        self._ignore_list   = ignore_list
        self._training_data = training_data

    def _get_class_pc( self ):
        """Placeholder; currently does nothing and returns None."""
        return None

    def test( self, text, wordcount, k ):
        """Vote on the source of `text` using its `k` most similar training articles.

        Args:
            text: Raw text to classify.
            wordcount: Accepted but not used by this method.
            k: Number of nearest neighbours allowed to vote.

        Returns:
            A list of (source, votes) tuples, highest vote count first.
        """
        query_words = set( count_words( text, self._ignore_list ) )

        # Similarity per training article, keyed by (source, href).
        overlap = defaultdict(int)
        for article in self._training_data:
            key = (article['source'], article['href'])
            overlap[ key ] = len( query_words.intersection(article['words']) )

        # Each of the k best-matching articles casts one vote for its source.
        votes = defaultdict(int)
        for source, _href in nlargest( k, overlap, key=overlap.get ):
            votes[ source ] += 1

        return sorted( votes.items(), key=lambda item: item[1], reverse=True )
        
# Pool the vectors from all four crawled sites into one training set.
training_data = [ *dm_vectors, *gd_vectors, *id_vectors, *ex_vectors ]
vs = VectorSpace( training_data=training_data, ignore_list=ignore_list )

In [16]:
# Let's test the BBC!
# Classify live BBC News articles against the crawled training set and print,
# for each article, the share of nearest-neighbour votes each source received.
bbc_k = 11   # number of nearest neighbours that vote
count = 20   # maximum number of BBC links to test

print('Getting BBC links...')
# Context manager guarantees the HTTP response is closed.
with urlopen('http://www.bbc.com/news') as response:
    bbc_home = response.read().decode('utf8')
bbc_soup  = BeautifulSoup(bbc_home, 'html.parser')
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
bbc_links = [ a['href'] for a in bbc_soup.find_all('a', { 'href': re.compile(r'^/news/(.*)\d{6,}$') }) ]

unique_links  = set( re.sub('#(.*)$', '', href) for href in bbc_links )
print( "%d unique links found, using %d\n" % (len(unique_links), count) )
link_list     = list( unique_links )[:count]

for test_link in link_list:
    print('Testing ' + test_link )
    with urlopen('http://www.bbc.com' + test_link) as response:
        test_content = response.read().decode('utf8')
    test_soup    = BeautifulSoup( test_content, 'html.parser' )
    print( test_soup.title.text )

    test_body    = test_soup.find('div', {'property':'articleBody'})
    if not test_body:
        print('No test body found, continuing...\n')
        continue
    # Only direct <p> children of the article body are used.
    test_para    = test_body.find_all('p', recursive=False )
    if not test_para:
        print('No paragraphs found, continuing...\n')
        continue

    test_text   = ' '.join([ p.text for p in test_para ])
    test_result = vs.test(test_text, word_count, bbc_k)
    # Divide by the votes actually cast: this equals bbc_k when the training
    # set has at least bbc_k articles, and stays correct when it has fewer.
    total_votes = sum( votes for _, votes in test_result )
    print(", ".join([ '{}% {}'.format(int(votes/total_votes*100), source) for source, votes in test_result ]))
    print('')
    


Getting BBC links...
37 unique links found, using 20

Testing /news/world-europe-42750083
President Putin plunges into icy water to mark Epiphany - BBC News
No test body found, continuing...

Testing /news/world-us-canada-42754289
Las Vegas shooting: Police say gunman's girlfriend won't face charges - BBC News
36% daily mail, 27% express, 27% independent, 9% guardian

Testing /news/world-us-canada-39085062
US defence spending v rest of world - BBC News
No test body found, continuing...

Testing /news/technology-42746772
Data-stealing spyware 'traced to Lebanon' - BBC News
63% daily mail, 18% express, 9% guardian, 9% independent

Testing /news/entertainment-arts-42745150
Michael Douglas: Former employee accuses actor of sexual harassment - BBC News
36% daily mail, 36% express, 27% guardian

Testing /news/world-us-canada-42753400
US faces growing threat from revisionist powers - Mattis - BBC News
No test body found, continuing...

Testing /news/technology-42742168
The future of high-spee