In [51]:
# So, we want to be able to classify articles into tech and non-tech using K-NN.
# How will we go about this? (I think I might go for Daily Mail / Guardian instead)

# Additional: If anyone else is reading this, apologies - it could be neater. I wasn't
# expecting to have quite so much fun and go so far with it, hence why it's now 2:20am.

In [52]:
# We'll attempt the following steps:
# 1. Download a relevant corpus - pick a new website and extract two sets of article: tech and sport.
# 2. Represent each article as a vector of the 25 most important words in an article.
# 3. The distance between articles is calculated using the number of words that they have in common
# 4. Find the K-Nearest Neighbours and carry out a majority vote.

In [53]:
# This may get a bit hefty for Notepad, but we'll see.
# We'll control the counts from here:
article_count = 40
word_count    = 25
num_k         = 7

In [54]:
from urllib.request import urlopen
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from bs4 import BeautifulSoup
import re

In [55]:
ignore_list = set( stopwords.words('english')
                  + list(punctuation) 
                  + ['’',"'s","'it","'the","‘","'i","n't",'“','”','–','–','•','…'] 
                  + ['i','we','one','two','1','2','3'])

In [56]:
def count_words( text, ignore_list ):
    word_freq = defaultdict(int)
    for word in word_tokenize(text):
        word_lc = word.lower();
        if word_lc not in ignore_list and word not in ignore_list:
            word_freq[ word_lc ] += 1
            #if word_freq[word_lc] > 2:
                #print('       word at {:3d}: {}'.format(word_freq[word_lc], word_lc) )
    return word_freq

In [57]:
def top_frequencies( n, freq_list, max_cut, min_cut ):
    freq_edit = dict( freq_list )
    max_freq  = float( max(freq_list.values()) )
    
    for word in freq_list.keys():
        word_freq = freq_list[word] / max_freq
        if word_freq >= max_cut or word_freq <= min_cut:
            del freq_edit[ word ]
    #print( freq_edit )
    return nlargest( n, freq_edit, key=freq_edit.get )

In [58]:
def process_links( article_links, count ):
    unique_links  = set([ re.sub('#(.*)$', '', href) for href in article_links ])
    print( "%d unique links found, using %d" % (unique_links.__len__(), count) )
    return list( unique_links )    

In [59]:
def soup_up( url ):
    home_content  = urlopen( url ).read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    return home_soup

In [60]:
def crawlDailyMail( count, wordcount, ignore_list ):
    home_soup     = soup_up('http://www.dailymail.co.uk/home/index.html');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('/news/article-') }) ]
    link_list     = process_links( article_links, count )
    
    processed = 0
    article_words = []
    for article_href in link_list:
        try:
            article_soup    = soup_up( 'http://www.dailymail.co.uk' + article_href )
            article_body    = article_soup.find_all('div', {'itemprop':'articleBody'})
            body_paragraphs = [ a.find_all('p', {'class':'mol-para-with-font'}, recursive=False) for a in article_body ]
            article_blocks  = []
            for body in body_paragraphs:
                article_blocks.append( " ".join([ p.text.replace(u'\xa0', '') for p in body ]) )
            
            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )
            
            article_words.append({ 'source':"daily mail", 'words': tuple(top_words), 'href': article_href })
            processed += 1
            
        except:
            print('ERROR: ' + article_href)
            continue
        print( "{:5d}: {}".format(processed, article_href) )
        if processed >= count:
            break
            
    return article_words
        
print("Crawling the DM")
dm_vectors = crawlDailyMail( article_count, word_count, ignore_list )
#print( dm_vectors )

Crawling the DM
159 unique links found, using 40
    1: /news/article-5290017/Kates-naughty-Uncle-Gary-Goldsmith-boozy-night-out.html
    2: /news/article-5283401/Weinsteins-concern-did-not-did.html
    3: /news/article-5290283/Theresa-Donald-Trump-meet-talks.html
    4: /news/article-5287931/Woman-83-killed-Knightsbridge-road-traffic-accident.html
    5: /news/article-5287379/Failed-ski-resort-looms-Pyeongchang-Games-legacy.html
    6: /news/article-5289871/Parents-lost-baby-hospital-blunder-slam-staff.html
    7: /news/article-5290933/Drunk-British-Airways-pilot-hauled-Gatwick-flight.html
    8: /news/article-5285367/Meghan-says-hen-Cardiff-fun.html
    9: /news/article-5288409/Jewellery-designer-pleads-gangster-ring-return.html
   10: /news/article-5286805/Suspect-death-riddle-written-victim.html
   11: /news/article-5289275/Ben-Bradley-said-cops-play-splat-chav-riots.html
   12: /news/article-5287053/Jessica-Falkholts-funeral-held-church-family.html
   13: /news/article-5289527/Dav

In [61]:
def crawlGuardian( count, wordcount, ignore_list ):
    home_soup     = soup_up('https://www.theguardian.com/uk-news');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('/\d{4}/[a-z]{3}/\d{1,2}/') }) ]
    link_list     = process_links( article_links, count )

    processed = 0
    article_words = []
    for article_href in link_list:
        try:
            display_href    = article_href.replace('https://www.theguardian.com/', '')
            article_soup    = soup_up( article_href )
            article_body    = article_soup.find_all('div', {'itemprop':'articleBody'})
            body_paragraphs = [ a.find_all('p', recursive=False) for a in article_body ]
            article_blocks  = []
            for body in body_paragraphs:
                article_blocks.append( " ".join([ p.text.replace(u'\xa0', '') for p in body ]) )
            
            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )

            article_words.append({ 'source': 'guardian', 'words': tuple(top_words), 'href': display_href })
            processed += 1
            
        except:
            print('ERROR: ' + display_href)
            continue
        print( "{:5d}: {}".format(processed, display_href) )
        if processed >= count:
            break
            
    return article_words
    
print("Crawling the Guardian")
gd_vectors = crawlGuardian( article_count, word_count, ignore_list )
#print( gd_vectors )    

Crawling the Guardian
57 unique links found, using 40
ERROR: /politics/ng-interactive/2017/jun/08/live-uk-election-results-in-full-2017
ERROR: weather/video/2018/jan/18/fierce-winds-fell-trees-and-cause-travel-chaos-across-the-uk-video
    1: society/2018/jan/19/cynthia-cownie-obituary
ERROR: uk-news/video/2018/jan/15/judgment-day-made-in-stoke-on-trent
    2: society/2018/jan/19/channel-4-calls-in-security-experts-after-cathy-newman-suffers-online-abuse
    3: uk-news/2018/jan/17/man-convicted-of-theft-in-1976-cleared-after-googling-his-arresting-officer
    4: politics/2018/jan/13/private-public-sector-history-tangled-relationship
    5: artanddesign/2018/jan/19/banksy-painting-saved-from-derelict-container-on-dungeness-beach
ERROR: uk-news/video/2018/jan/18/meghan-and-harry-greeted-by-excited-crowds-at-cardiff-castle-video
    6: politics/2018/jan/19/boris-johnson-channel-bridge-france-emmanuel-macron
    7: uk-news/2018/jan/17/pakistani-humanist-denied-uk-asylum-after-failing-to-id

In [66]:
def crawlIndependent( count, wordcount, ignorelist ):
    home_soup     = soup_up('http://www.independent.co.uk/');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('^/news/(.*)\d{5,}.html') }) ]
    link_list     = process_links( article_links, count )
   
    processed = 0
    article_words = []
    for article_href in link_list:
        try:
            article_soup    = soup_up( 'http://www.independent.co.uk' + article_href)
            article_body    = article_soup.find_all('div', {'itemprop':'articleBody'})
            body_paragraphs = [ a.find_all('p', recursive=False) for a in article_body ]
            if not body_paragraphs:
                raise ValueError('No text to analyze')
            
            article_blocks  = []
            for body in body_paragraphs:
                article_blocks.append( " ".join([ p.text.replace(u'\xa0', '') for p in body ]) )
            
            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )

            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )

            article_words.append({ 'source': 'independent', 'words': tuple(top_words), 'href': article_href })
            processed += 1
            
        except:
            print('ERROR: ' + article_href)
            continue
        print( "{:5d}: {}".format(processed, article_href) )
        if processed >= count:
            break
            
    return article_words

print('Crawling The Independent')
id_vectors = crawlIndependent( article_count, word_count, ignore_list )
#print( id_vectors )

Crawling The Independent
54 unique links found, using 40
    1: /news/uk/politics/article-49-what-is-it-brexit-uk-leave-eu-how-reverse-50-lisbon-treaty-a8164061.html
    2: /news/world/americas/children-kill-half-million-bees-vandalise-farm-charged-police-iowa-sioux-city-wild-hill-honey-a8167846.html
    3: /news/world/americas/carl-higbie-trump-official-resigns-muslim-shithole-rant-us-president-a8167896.html
    4: /news/world/americas/us-politics/nigel-farage-julian-assange-secret-meetings-us-congress-evidence-a8168506.html
    5: /news/uk/crime/john-worboys-latest-updates-prison-release-black-cab-rapists-judicial-review-legal-challenge-victims-a8168656.html
    6: /news/uk/politics/boris-johnson-bridge-english-channel-eurotunnel-wants-to-be-involved-go-ahead-a8169116.html
    7: /news/world/americas/us-politics/trump-theresa-may-meeting-davos-next-week-world-economic-forum-switzerland-a8169006.html
    8: /news/world/americas/donald-trump-first-year-anniversary-russia-probe-mueller-

In [67]:
def crawlExpress( count, wordcount, ignorelist ):
    home_soup     = soup_up('https://www.express.co.uk/news' );
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('^/news/[a-z]*/\d*/') }) ]
    link_list     = process_links( article_links, count )
    
    processed = 0
    article_words = []
    for article_href in link_list:
        try:
            article_soup    = soup_up( 'https://www.express.co.uk' + article_href )
            article_body    = article_soup.find_all('section', {'class':'text-description'})
            body_paragraphs = [ a.find_all('p', recursive=False) for a in article_body ]
            if not body_paragraphs:
                raise ValueError('No text to analyze')
            
            article_blocks  = []
            for body in body_paragraphs:
                article_blocks.append( " ".join([ p.text.replace(u'\xa0', '') for p in body ]) )
            
            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( wordcount, word_freq, 0.9, 0.1 )

            article_words.append({ 'source': 'express', 'words': tuple(top_words), 'href': article_href })
            processed += 1
            
        except:
            print('ERROR: ' + article_href)
            continue
        print( "{:5d}: {}".format(processed, article_href) )
        if processed >= count:
            break
            
    return article_words

print('Crawling The Express')
ex_vectors = crawlExpress( article_count, word_count, ignore_list )
#print( ex_vectors )

Crawling The Express
126 unique links found, using 40
    1: /news/uk/906948/Brexit-news-European-Union-EU-UK-latest-deal-referendum-Macron-BBC
    2: /news/weird/906589/Oblong-UFOs-high-definition-NASA-satellite-camera
    3: /news/royal/907139/Buckingham-Palace-Queen-Elizabeth-George-VI-cigarettes-newspaper-history-Royals
    4: /news/politics/907033/brexit-news-eu-uk-city-of-london-financial-centre-brexit-trade-deal-mcguinness-macron
    5: /news/world/907104/north-korea-news-japan-threat-kim-jong-un-abducted-Kazuhiro-Araki-world-war-3-ICC
    6: /news/politics/906957/theresa-may-emmanuel-macron-selfie-oscar-calais-deal
    7: /news/royal/907158/Meghan-Markle-royal-wedding-Prince-Harry-Amy-Pickerill
    8: /news/royal/906849/Prince-William-haircut-bald-baldness-Prince-Harry-Duchess-of-Cambridge-Richard-Ward
    9: /news/uk/907049/WWII-Veteran-calls-on-fellow-survivors-to-join-fight-for-bomber-boys-to-be-given-medals
   10: /news/nature/907154/Orangutan-returns-jungle-eight-years-sch

In [68]:
class VectorSpace:
    
    def __init__( self, training_data, ignore_list ):
        self._training_data = training_data
        self._ignore_list   = ignore_list
    
    def _get_class_pc( self ):
        pass
    
    def test( self, text, wordcount, k ):
        neighbours = defaultdict(int)
        word_freq = count_words( text, self._ignore_list )
        test_set  = set( word_freq )
        
        for article in self._training_data:
            neighbours[ (article['source'], article['href']) ] = len( set(article['words']).intersection(test_set) )
        
        k_nearest = nlargest( k, neighbours, key=neighbours.get )
        classifications = defaultdict(int)
        for neighbour in k_nearest:
            classifications[ neighbour[0] ] += 1
        
        return [ (k, classifications[k]) for k in sorted(classifications, key=classifications.get, reverse=True) ]
        
training_data = dm_vectors + gd_vectors + id_vectors + ex_vectors
vs = VectorSpace( training_data, ignore_list )

In [69]:
# Let's test the BBC!
bbc_k = 5
count = 25 

print('Getting BBC links...')
bbc_soup  = soup_up('http://www.bbc.com/news')
bbc_links = [ a['href'] for a in bbc_soup.find_all('a', { 'href': re.compile('^/news/(.*)\d{6,}$') }) ]
link_list = process_links( bbc_links, count )[:count]

for test_link in link_list:
    print('Testing ' + test_link )
    test_soup = soup_up('http://www.bbc.com' + test_link)
    print( test_soup.title.text )

    test_body    = test_soup.find('div', {'property':'articleBody'})
    if not test_body:
        print('No test body found, continuing...\n')
        continue
    test_para    = test_body.find_all('p', recursive=False )
    if not test_para:
        print('No paragraphs found, continuing...\n')
        continue
    
    test_text   = ' '.join([ p.text for p in test_para ])
    test_result = vs.test(test_text, word_count, bbc_k) 
    print(", ".join([ '{}% {}'.format(int(c[1]/bbc_k*100), c[0]) for c in test_result ]))
    print('')
    


Getting BBC links...
41 unique links found, using 25
Testing /news/world-us-canada-42753639
Larry Nassar case: Olympic champ Jordyn Wieber reveals abuse - BBC News
60% daily mail, 20% independent, 20% express

Testing /news/world-us-canada-42754607
Nassar case: Gold medallists Raisman and Wieber face abuser - BBC News
No test body found, continuing...

Testing /news/world-us-canada-42753400
US faces growing threat from revisionist powers - Mattis - BBC News
No test body found, continuing...

Testing /news/world-us-canada-39732845
What has President Trump said about your country? - BBC News
60% daily mail, 20% guardian, 20% independent

Testing /news/world-europe-42750584
Why Italians are saying 'no' to takeaway coffee - BBC News
No test body found, continuing...

Testing /news/world-asia-42729173
New Zealand debates access to dead sea life footage - BBC News
40% independent, 40% daily mail, 20% express

Testing /news/business-22434141
Entrepreneurship - BBC News
No test body found, con