In [1]:
# So, we want to be able to classify articles into tech and non-tech using K-NN.
# How will we go about this? (I think I might go for Daily Mail / Guardian instead)

# Additional: If anyone else is reading this, apologies - it could be neater. I wasn't
# expecting to have quite so much fun and go so far with it, hence why it's now 2:20am.

In [2]:
# We'll attempt the following steps:
# 1. Download a relevant corpus - pick a news website and extract two sets of articles: tech and sport.
# 2. Represent each article as a vector of the 25 most important words in that article.
# 3. The distance between articles is calculated using the number of words that they have in common.
# 4. Find the K-Nearest Neighbours and carry out a majority vote.

In [3]:
# Notebook-wide knobs, kept in one cell so they can be tuned in a single place.
article_count = 40  # upper bound on articles collected from each newspaper
word_count = 25     # number of top words kept to represent one article
num_k = 7           # K for the K-NN vote. NOTE(review): not referenced in the cells
                    # shown here (the BBC test cell sets its own bbc_k) - confirm.

In [4]:
from urllib.request import urlopen
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from bs4 import BeautifulSoup
import re

In [5]:
# Tokens that are never counted: NLTK's English stop words, every ASCII
# punctuation character, typographic quotes/dashes/bullets and contraction
# fragments, plus a handful of filler words and digits.
ignore_list = set( stopwords.words('english') )
ignore_list.update( punctuation )  # iterating the string adds one entry per character
ignore_list.update([ '’', "'s", "'it", "'the", "‘", "'i", "n't", '“', '”', '–', '–', '•', '…' ])
ignore_list.update([ 'i', 'we', 'one', 'two', '1', '2', '3' ])

In [6]:
def count_words( text, ignore_list ):
    """Tally the lower-cased tokens of ``text``, skipping ignored ones.

    The text is split with ``word_tokenize``. A token is skipped when either
    its original spelling or its lower-cased form is in ``ignore_list``.

    Args:
        text: string to tokenise.
        ignore_list: container of tokens that must not be counted.

    Returns:
        defaultdict(int) mapping each lower-cased token to its number of
        occurrences.
    """
    tally = defaultdict(int)
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in ignore_list or token in ignore_list:
            continue
        tally[ lowered ] += 1
    return tally

In [7]:
def top_frequencies( n, freq_list, max_cut, min_cut ):
    """Return up to ``n`` keys of ``freq_list`` with the highest counts, after trimming both extremes.

    Every count is divided by the largest count in ``freq_list``. Keys whose
    ratio is >= ``max_cut`` or <= ``min_cut`` are discarded; the survivors are
    ranked by their raw count.

    Args:
        n: maximum number of keys to return.
        freq_list: mapping of word -> count (left unmodified).
        max_cut: ratios at or above this value are dropped.
        min_cut: ratios at or below this value are dropped.

    Returns:
        list of keys, highest count first.

    Raises:
        ValueError: from ``max()`` when ``freq_list`` is empty.
    """
    peak = float( max(freq_list.values()) )
    kept = {
        word: freq
        for word, freq in freq_list.items()
        if min_cut < freq / peak < max_cut
    }
    return nlargest( n, kept, key=kept.get )

In [8]:
def crawlDailyMail( count, wordcount, ignore_list ):
    """Collect up to ``count`` Daily Mail news articles as top-word vectors.

    Fetches the Daily Mail home page, gathers every link whose href contains
    '/news/article-' and downloads articles until ``count`` of them have been
    processed. An article's text is taken from the direct-child
    ``<p class="mol-para-with-font">`` elements of each
    ``<div itemprop="articleBody">``.

    Args:
        count: maximum number of articles to return.
        wordcount: number of top words kept per article (passed to top_frequencies).
        ignore_list: tokens excluded from counting (passed to count_words).

    Returns:
        list of dicts with the keys 'source' (always "daily mail"), 'words'
        (tuple of the article's top words) and 'href'.

    Articles that cannot be downloaded or parsed, or that yield no text, are
    reported with an 'ERROR:' line and skipped.
    """
    homepage = 'http://www.dailymail.co.uk/home/index.html'
    # Context manager so the HTTP response is closed as soon as it has been read.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('/news/article-') }) ]
    # Drop "#fragment" suffixes so the same article is not fetched twice.
    unique_links  = { re.sub('#(.*)$', '', href) for href in article_links }
    print( "%d unique links found, using %d" % (len(unique_links), count) )

    processed = 0
    article_words = []
    for article_href in unique_links:
        # Checked before fetching so that count <= 0 yields an empty result.
        if processed >= count:
            break
        try:
            with urlopen( 'http://www.dailymail.co.uk' + article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup = BeautifulSoup( article_content, 'html.parser' )
            article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
            article_text = " ".join(
                " ".join( p.text.replace('\xa0', '')
                          for p in body.find_all('p', {'class':'mol-para-with-font'}, recursive=False) )
                for body in article_body
            )
            # Make the "nothing to analyse" case explicit instead of relying on
            # top_frequencies failing on an empty frequency table.
            if not article_text.strip():
                raise ValueError('No text to analyze')

            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )
        except Exception:
            # Best-effort crawl: skip this article but let KeyboardInterrupt /
            # SystemExit propagate so the cell can still be interrupted.
            print('ERROR: ' + article_href)
            continue

        article_words.append({ 'source':"daily mail", 'words': tuple(top_words), 'href': article_href })
        processed += 1
        print( "{:5d}: {}".format(processed, article_href) )

    return article_words
        
# Build the Daily Mail share of the training set.
print('Crawling the DM')
dm_vectors = crawlDailyMail(
    count=article_count,
    wordcount=word_count,
    ignore_list=ignore_list,
)

Crawling the DM
158 unique links found, using 40
    1: /news/article-5287945/Dolores-ORiordan-coroner-awaiting-tests.html
    2: /news/article-5288709/Christian-whipped-selling-alcohol-Islamic-law.html
    3: /news/article-5290405/Trump-signs-bill-renewing-NSAs-internet-surveillance-program.html
    4: /news/article-5286245/New-Zealand-Prime-Minister-Jacinda-Ardern-pregnant.html
    5: /news/article-5288915/Ali-Raisman-Jordyn-Wieber-Larry-Nassar-hearing.html
    6: /news/article-5288273/Russian-veteran-builds-Iron-Man-suit-fight-bailiffs.html
    7: /news/article-5290247/Huge-tree-nearly-crushes-woman-pushing-pram.html
    8: /news/article-5288053/Jessica-Chastain-refusing-wear-Weinstein-wife-label.html
    9: /news/article-5288963/Moment-Emmanuel-Macron-meets-son-Mr-Bean-star-Atkinson.html
   10: /news/article-5288085/Photographer-uses-camouflage-make-art-stand-out.html
   11: /news/article-5282823/New-test-detect-STI-two-cent-Australians-have.html
   12: /news/article-5290909/Millio

In [9]:
def crawlGuardian( count, wordcount, ignore_list ):
    """Collect up to ``count`` Guardian UK-news articles as top-word vectors.

    Fetches the Guardian UK news front page, gathers every link whose href
    contains a '/YYYY/mon/D/' date segment and downloads articles until
    ``count`` of them have been processed. The hrefs are opened as-is (they are
    used as full URLs). An article's text is taken from the direct-child
    ``<p>`` elements of each ``<div itemprop="articleBody">``.

    Args:
        count: maximum number of articles to return.
        wordcount: number of top words kept per article (passed to top_frequencies).
        ignore_list: tokens excluded from counting (passed to count_words).

    Returns:
        list of dicts with the keys 'source' (always 'guardian'), 'words'
        (tuple of the article's top words) and 'href' (the link with the
        'https://www.theguardian.com/' prefix removed).

    Articles that cannot be downloaded or parsed, or that yield no text, are
    reported with an 'ERROR:' line and skipped.
    """
    homepage = 'https://www.theguardian.com/uk-news'
    # Context manager so the HTTP response is closed as soon as it has been read.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    link_pattern  = re.compile(r'/\d{4}/[a-z]{3}/\d{1,2}/')
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': link_pattern }) ]
    # Drop "#fragment" suffixes so the same article is not fetched twice.
    unique_links  = { re.sub('#(.*)$', '', href) for href in article_links }
    print( "%d unique links found, using %d" % (len(unique_links), count) )

    processed = 0
    article_words = []
    for article_href in unique_links:
        # Checked before fetching so that count <= 0 yields an empty result.
        if processed >= count:
            break
        # Computed outside the try so the error message below can always use it.
        display_href = article_href.replace('https://www.theguardian.com/', '')
        try:
            with urlopen( article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup = BeautifulSoup( article_content, 'html.parser' )
            article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
            article_text = " ".join(
                " ".join( p.text.replace('\xa0', '') for p in body.find_all('p', recursive=False) )
                for body in article_body
            )
            # Make the "nothing to analyse" case explicit instead of relying on
            # top_frequencies failing on an empty frequency table.
            if not article_text.strip():
                raise ValueError('No text to analyze')

            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )
        except Exception:
            # Best-effort crawl: skip this article but let KeyboardInterrupt /
            # SystemExit propagate so the cell can still be interrupted.
            print('ERROR: ' + display_href)
            continue

        article_words.append({ 'source': 'guardian', 'words': tuple(top_words), 'href': display_href })
        processed += 1
        print( "{:5d}: {}".format(processed, display_href) )

    return article_words
    
# Build the Guardian share of the training set.
print('Crawling the Guardian')
gd_vectors = crawlGuardian(
    count=article_count,
    wordcount=word_count,
    ignore_list=ignore_list,
)

Crawling the Guardian
57 unique links found, using 40
    1: politics/2018/jan/13/nigel-farage-rattled-peers-brexit-struggle-now-beginning
    2: business/2018/jan/15/the-four-contracts-that-finished-carillion-public-private-partnership
    3: society/2018/jan/19/people-with-mental-illnesses-refused-access-to-insurance-cover
    4: law/2018/jan/20/judge-collapse-of-sex-trials-could-lead-to-rapists-going-free
    5: politics/2018/jan/19/theresa-may-accused-of-timidity-in-tory-mps-outburst
    6: business/2018/jan/11/does-london-mayor-sadiq-khans-brexit-report-stack-up
    7: politics/2018/jan/19/boris-johnson-channel-bridge-france-emmanuel-macron
    8: uk-news/2018/jan/17/man-convicted-of-theft-in-1976-cleared-after-googling-his-arresting-officer
    9: us-news/2018/jan/12/why-reading-too-much-into-trumps-cancelled-uk-trip-is-unwise
   10: uk-news/2018/jan/19/football-agent-peter-morrison-jailed-death-dangerous-driving
   11: politics/2018/jan/19/boris-johnson-slams-uks-infrastructure-

In [10]:
def crawlIndependent( count, wordcount, ignorelist ):
    """Collect up to ``count`` Independent news articles as top-word vectors.

    Fetches the Independent home page, gathers every link whose href starts
    with '/news/' and ends in a 5+ digit id followed by '.html', and downloads
    articles until ``count`` of them have been processed. An article's text is
    taken from the direct-child ``<p>`` elements of each
    ``<div itemprop="articleBody">``.

    Args:
        count: maximum number of articles to return.
        wordcount: number of top words kept per article (passed to top_frequencies).
        ignorelist: tokens excluded from counting (passed to count_words).

    Returns:
        list of dicts with the keys 'source' (always 'independent'), 'words'
        (tuple of the article's top words) and 'href'.

    Articles that cannot be downloaded or parsed, or that yield no text, are
    reported with an 'ERROR:' line and skipped.
    """
    homepage = 'http://www.independent.co.uk/'
    # Context manager so the HTTP response is closed as soon as it has been read.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    link_pattern  = re.compile(r'^/news/(.*)\d{5,}.html')
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': link_pattern }) ]
    # Drop "#fragment" suffixes so the same article is not fetched twice.
    unique_links  = { re.sub('#(.*)$', '', href) for href in article_links }
    print( "%d unique links found, using %d" % (len(unique_links), count) )

    processed = 0
    article_words = []
    for article_href in unique_links:
        # Checked before fetching so that count <= 0 yields an empty result.
        if processed >= count:
            break
        try:
            with urlopen( 'http://www.independent.co.uk' + article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup = BeautifulSoup( article_content, 'html.parser' )
            article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
            article_text = " ".join(
                " ".join( p.text.replace('\xa0', '') for p in body.find_all('p', recursive=False) )
                for body in article_body
            )
            # Covers both "no articleBody container" and "container without text".
            if not article_text.strip():
                raise ValueError('No text to analyze')

            # Use the function's own parameter rather than the notebook-level
            # ignore_list global.
            word_freq = count_words( article_text, ignorelist )
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )
        except Exception:
            # Best-effort crawl: skip this article but let KeyboardInterrupt /
            # SystemExit propagate so the cell can still be interrupted.
            print('ERROR: ' + article_href)
            continue

        article_words.append({ 'source': 'independent', 'words': tuple(top_words), 'href': article_href })
        processed += 1
        print( "{:5d}: {}".format(processed, article_href) )

    return article_words

# Build the Independent share of the training set.
print("Crawling The Independent")
id_vectors = crawlIndependent(
    count=article_count,
    wordcount=word_count,
    ignorelist=ignore_list,
)

Crawling The Independent
54 unique links found, using 40
    1: /news/world/americas/us-politics/donald-trump-supreme-court-muslim-ban-chad-iran-libya-somalia-a8168906.html
    2: /news/world/americas/us-politics/trump-anti-abortion-rally-pro-life-address-first-sitting-president-a8168581.html
    3: /news/world/americas/us-politics/obama-cameron-bromance-narcissist-us-president-uk-pm-steve-hilton-fox-news-interview-a8148646.html
    4: /news/uk/politics/jeremy-corbyn-antisemitism-latest-speak-louder-labour-party-shadow-brexit-minister-sir-keir-starmer-a8168856.html
    5: /news/world/americas/carl-higbie-trump-official-resigns-muslim-shithole-rant-us-president-a8167896.html
    6: /news/world/americas/us-politics/james-comey-former-fbi-director-ethical-leadership-class-william-and-mary-college-williamsburg-a8168731.html
    7: /news/world/australasia/jacinda-ardern-pregnant-new-zealand-prime-minister-clarke-gayford-labour-auckland-a8167121.html
    8: /news/business/news/product-recall

In [11]:
def crawlExpress( count, wordcount, ignorelist ):
    """Collect up to ``count`` Express news articles as top-word vectors.

    Fetches the Express news page, gathers every link whose href matches
    '^/news/[a-z]*/\\d*/' (section name followed by a numeric id) and downloads
    articles until ``count`` of them have been processed. An article's text is
    taken from the direct-child ``<p>`` elements of each
    ``<section class="text-description">``.

    Args:
        count: maximum number of articles to return.
        wordcount: number of top words kept per article (passed to top_frequencies).
        ignorelist: tokens excluded from counting (passed to count_words).

    Returns:
        list of dicts with the keys 'source' (always 'express'), 'words'
        (tuple of the article's top words) and 'href'.

    Articles that cannot be downloaded or parsed, or that yield no text, are
    reported with an 'ERROR:' line and skipped.
    """
    homepage = 'https://www.express.co.uk/news'
    # Context manager so the HTTP response is closed as soon as it has been read.
    with urlopen( homepage ) as response:
        home_content = response.read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    link_pattern  = re.compile(r'^/news/[a-z]*/\d*/')
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': link_pattern }) ]
    # Drop "#fragment" suffixes so the same article is not fetched twice.
    unique_links  = { re.sub('#(.*)$', '', href) for href in article_links }
    print( "%d unique links found, using %d" % (len(unique_links), count) )

    processed = 0
    article_words = []
    for article_href in unique_links:
        # Checked before fetching so that count <= 0 yields an empty result.
        if processed >= count:
            break
        try:
            with urlopen( 'https://www.express.co.uk' + article_href ) as response:
                article_content = response.read().decode('utf8')
            article_soup = BeautifulSoup( article_content, 'html.parser' )
            article_body = article_soup.find_all('section', {'class':'text-description'})
            article_text = " ".join(
                " ".join( p.text.replace('\xa0', '') for p in body.find_all('p', recursive=False) )
                for body in article_body
            )
            # Covers both "no text-description section" and "section without text".
            if not article_text.strip():
                raise ValueError('No text to analyze')

            # Use the function's own parameter rather than the notebook-level
            # ignore_list global.
            word_freq = count_words( article_text, ignorelist )
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )
        except Exception:
            # Best-effort crawl: skip this article but let KeyboardInterrupt /
            # SystemExit propagate so the cell can still be interrupted.
            print('ERROR: ' + article_href)
            continue

        article_words.append({ 'source': 'express', 'words': tuple(top_words), 'href': article_href })
        processed += 1
        print( "{:5d}: {}".format(processed, article_href) )

    return article_words

# Build the Express share of the training set.
print("Crawling The Express")
ex_vectors = crawlExpress(
    count=article_count,
    wordcount=word_count,
    ignorelist=ignore_list,
)

Crawling The Express
126 unique links found, using 40
    1: /news/uk/906995/argos-scam-sophisticated-banking-fraud-text-message-refund-unwanted-christmas-gifts
    2: /news/world/907174/Paradise-lockdown-shootings-state-of-emergency-Jamaica-gun-crime-Montego-Bay
    3: /news/world/907235/Brexit-latest-news-Theresa-May-Jean-Claude-Juncker-trade-deal-European-Union-Netherlands
    4: /news/world/906952/Italian-election-2018-March-4-Matteo-Salvini-League-party-Brussels-European-Union
    5: /news/uk/907254/Marriage-Britain-oldest-newlyweds-marry-aged-81-90-love-companionship
    6: /news/world/907217/andrej-babis-czech-government-immunity-eu-payments-fraud
    7: /news/weather/907187/UK-snow-forecast-will-it-snow-weekend-latest-snow-maps-Met-Office
    8: /news/uk/906996/Jacob-Rees-Mogg-Brexit-news-House-of-Lords-EU-withdrawal-bill
    9: /news/royal/907271/Queen-Mathilde-of-Belgium-pictures-45-birthday-King-Philippe
   10: /news/world/907313/russia-st-petersburg-fire-firefighters-police

In [None]:
class VectorSpace:
    """K-nearest-neighbour classifier over articles stored as top-word tuples.

    The similarity between a text and a stored article is the number of the
    article's words that also occur among the text's counted words.
    """

    def __init__( self, training_data, ignore_list ):
        """Keep the labelled articles and the ignore set used when counting test text.

        Args:
            training_data: iterable of dicts; ``test`` reads their 'source',
                'href' and 'words' entries.
            ignore_list: tokens handed to count_words when a text is tested.
        """
        self._ignore_list   = ignore_list
        self._training_data = training_data

    def _get_class_pc( self ):
        """Placeholder; currently does nothing and returns None."""
        pass

    def test( self, text, wordcount, k ):
        """Classify ``text`` by a vote among its ``k`` most similar training articles.

        Args:
            text: raw text to classify.
            wordcount: accepted but not used by this method.
            k: number of nearest neighbours that get a vote.

        Returns:
            list of (source, votes) tuples, highest vote count first.
        """
        seen_words = set( count_words( text, self._ignore_list ) )

        # Score each training article by how many of its words occur in the text.
        # Articles sharing the same (source, href) pair collapse into one entry.
        overlap = {}
        for article in self._training_data:
            label = ( article['source'], article['href'] )
            overlap[ label ] = len( seen_words.intersection( article['words'] ) )

        # One vote per neighbour, cast for the neighbour's source.
        votes = {}
        for source, _href in nlargest( k, overlap, key=overlap.get ):
            votes[ source ] = votes.get( source, 0 ) + 1

        return sorted( votes.items(), key=lambda entry: entry[1], reverse=True )
        
# Pool the vectors from all four newspapers into one labelled training set
# and build the classifier on top of it.
training_data = [ *dm_vectors, *gd_vectors, *id_vectors, *ex_vectors ]
vs = VectorSpace( training_data, ignore_list )

In [None]:
# Let's test the BBC!
# For up to `count` BBC News articles, print which training newspapers the
# article's nearest neighbours come from, as a percentage of the bbc_k votes.
bbc_k = 5
count = 25

print('Getting BBC links...')
# Context manager so the HTTP response is closed as soon as it has been read.
with urlopen('http://www.bbc.com/news') as response:
    bbc_home = response.read().decode('utf8')
bbc_soup  = BeautifulSoup(bbc_home, 'html.parser')
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
bbc_links = [ a['href'] for a in bbc_soup.find_all('a', { 'href': re.compile(r'^/news/(.*)\d{6,}$') }) ]

unique_links  = set([ re.sub('#(.*)$', '', href) for href in bbc_links ])
print( "%d unique links found, using %d\n" % (len(unique_links), count) )
link_list     = list( unique_links )[:count]

for test_link in link_list:
    print('Testing ' + test_link )
    try:
        with urlopen('http://www.bbc.com' + test_link) as response:
            test_content = response.read().decode('utf8')
    except Exception as err:
        # One unreachable or undecodable page should not abort the whole run.
        print('Could not fetch page ({}), continuing...\n'.format(err))
        continue
    test_soup    = BeautifulSoup( test_content, 'html.parser' )
    # Pages without a <title> would otherwise raise AttributeError here.
    if test_soup.title:
        print( test_soup.title.text )

    test_body    = test_soup.find('div', {'property':'articleBody'})
    if not test_body:
        print('No test body found, continuing...\n')
        continue
    test_para    = test_body.find_all('p', recursive=False )
    if not test_para:
        print('No paragraphs found, continuing...\n')
        continue

    test_text   = ' '.join([ p.text for p in test_para ])
    test_result = vs.test(test_text, word_count, bbc_k)
    # Integer arithmetic avoids float truncation (e.g. 28.999... -> 28).
    print(", ".join([ '{}% {}'.format(votes * 100 // bbc_k, source) for source, votes in test_result ]))
    print('')
    


Getting BBC links...
36 unique links found, using 25

Testing /news/world-us-canada-42548824
Lac-Megantic: The runaway train that destroyed a town - BBC News
60% guardian, 20% daily mail, 20% independent

Testing /news/technology-42730916
Virgin's Hyperloop: Future or fantasy? - BBC News
40% independent, 20% daily mail, 20% guardian, 20% express

Testing /news/business-22434141
Entrepreneurship - BBC News
No test body found, continuing...

Testing /news/in-pictures-42746442
Week in pictures: 13 - 19 January 2018 - BBC News
60% daily mail, 40% guardian

Testing /news/world-us-canada-42712855
Women's march: Where, when and why will protests happen? - BBC News
No test body found, continuing...

Testing /news/technology-42746772
Data-stealing spyware 'traced to Lebanon' - BBC News
40% daily mail, 40% guardian, 20% independent

Testing /news/world-us-canada-42753639
Larry Nassar case: Olympic champ Jordyn Wieber reveals abuse - BBC News
80% daily mail, 20% independent

Testing /news/world-u