In [1]:
# This may get a bit hefty for Notepad, but we'll see.
# We'll control the counts from here:
article_count = 50
word_count    = 25
num_k         = 7

In [2]:
from urllib.request import urlopen
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from bs4 import BeautifulSoup
import re

In [3]:
ignore_list = set( stopwords.words('english')
                  + list(punctuation) 
                  + ['’',"'s","'it","'the","‘","'i","n't",'“','”','–','–','•','…','—'] 
                  + ['i','we','one','two','1','2','3'])

In [4]:
def extract_articles( link_prefix, link_list, extract_function, source_name, count ):
    processed = 0
    article_words = []
    for article_href in link_list:
        try:
            body_paragraphs = extract_function( link_prefix, article_href )
            if not body_paragraphs:
                raise ValueError('No body content found')
            article_blocks  = []
            for body in body_paragraphs:
                article_blocks.append( " ".join([ p.text.replace(u'\xa0', '') for p in body ]) )
            
            article_text = " ".join( article_blocks )
            word_freq = count_words( article_text, ignore_list )
            top_words = top_frequencies( word_count, word_freq, 0.9, 0.1 )
            
            article_words.append({ 'source':source_name, 'words': tuple(top_words), 'href': article_href })
            processed += 1
            
        except:
            #raise
            print('ERROR: ' + article_href)
            continue
        print( "{:5d}: {}".format(processed, article_href) )
        if processed >= count:
            break
            
    return article_words

In [5]:
def count_words( text, ignore_list ):
    word_freq = defaultdict(int)
    for word in word_tokenize(text):
        word_lc = word.lower();
        if word_lc not in ignore_list and word not in ignore_list:
            word_freq[ word_lc ] += 1
            #if word_freq[word_lc] > 2:
                #print('       word at {:3d}: {}'.format(word_freq[word_lc], word_lc) )
    return word_freq

In [6]:
def top_frequencies( n, freq_list, max_cut, min_cut ):
    freq_edit = dict( freq_list )
    max_freq  = float( max(freq_list.values()) )
    
    for word in freq_list.keys():
        word_freq = freq_list[word] / max_freq
        if word_freq >= max_cut or word_freq <= min_cut:
            del freq_edit[ word ]
    #print( freq_edit )
    return nlargest( n, freq_edit, key=freq_edit.get )

In [7]:
def process_links( article_links, count ):
    unique_links = set([ re.sub('#(.*)$', '', href) for href in article_links ])
    use_total    = min( count, unique_links.__len__() )
    print( "%d unique links found, trying to use using %d" % (unique_links.__len__(), use_total) )
    return list( unique_links )    

In [8]:
def soup_up( url ):
    home_content  = urlopen( url ).read().decode('utf8')
    home_soup     = BeautifulSoup( home_content, 'html.parser' )
    return home_soup

In [None]:
def crawlDailyMail( count, wordcount, ignore_list ):
    home_soup     = soup_up('http://www.dailymail.co.uk/home/index.html');
    article_links = [ a['href'] for a in home_soup.find_all('a', {'href': re.compile('/news/article-')}) ]
    link_list     = process_links( article_links, count )
    
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
        return [ a.find_all('p', {'class':'mol-para-with-font'}, recursive=False) for a in article_body ]
    
    return extract_articles(  'http://www.dailymail.co.uk', link_list, extract_content, 'daily mail', count )
        
print("Crawling the DM")
dm_vectors = crawlDailyMail( article_count, word_count, ignore_list )
#print( dm_vectors )

In [None]:
def crawlGuardian( count, wordcount, ignore_list ):
    home_soup     = soup_up('https://www.theguardian.com/uk-news');
    article_links = [ a['href'] for a in home_soup.find_all('a', { 'href': re.compile('/\d{4}/[a-z]{3}/\d{1,2}/') }) ]
    link_list     = process_links( article_links, count )
    
    link_list = list(map( lambda href: href.replace('https://www.theguardian.com', ''), link_list ))
    def extract_content( link_prefix, article_href ):
        article_soup = soup_up( link_prefix + article_href )
        article_body = article_soup.find_all('div', {'itemprop':'articleBody'})
        return [ a.find_all('p', recursive=False) for a in article_body ]
    
    return extract_articles( 'https://www.theguardian.com', link_list, extract_content, 'guardian', count )
    
print("Crawling the Guardian")
gd_vectors = crawlGuardian( article_count, word_count, ignore_list )
#print( gd_vectors )   

In [11]:
class MyBayesNaive():
    
    def __init__( self, training_data ):
        self._training_data = training_data
        self._calculate_totals()
        self._calculate_weights()
        
    def _calculate_totals( self ):
        self._totals  = defaultdict(int)
        self._classes = {}
        
        for vector in self._training_data:
            if not vector['source'] in self._classes:
                self._classes[ vector['source'] ] = defaultdict(int)
            
            for word in vector['words']:
                self._totals[ word ] += 1
                self._classes[ vector['source'] ][ word ] += 1
                
    def _calculate_weights( self ):
        self._weights = {}
        
        for _class in self._classes.keys():
            self._weights[ _class ] = { \
               word: float(i/self._totals[word]) \
               for word,i in self._classes[_class].items() }
            
    def test( self, text, word_count ):
        test_freqs  = count_words( text, ignore_list )
        test_vector = top_frequencies( word_count, test_freqs, 0.9, 0.1 )
        
        result = {}
        for _class in self._classes.keys():
            value = 1
            for word in test_vector:
                if word not in self._totals or word not in self._weights[ _class ]:
                    continue;
                value *= self._weights[ _class ][ word ]
            result[ _class ] = value
        
        return {source: result[source] for source in sorted( result, key=result.get, reverse=True ) }
        
training_data = dm_vectors + gd_vectors
bn = MyBayesNaive( training_data )

In [12]:
# Let's test the BBC!
bbc_k = 5
count = 3

print('Getting BBC links...')
bbc_soup  = soup_up('http://www.bbc.com/news')
bbc_links = [ a['href'] for a in bbc_soup.find_all('a', { 'href': re.compile('^/news/(.*)\d{6,}$') }) ]
link_list = process_links( bbc_links, count )

processed = 0;
for test_link in link_list:
    print('Testing ' + test_link )
    test_soup = soup_up('http://www.bbc.com' + test_link)
    print( test_soup.title.text )

    test_body    = test_soup.find('div', {'property':'articleBody'})
    if not test_body:
        count += 1
        print(' -- No test body found, continuing...\n')
        continue
    test_para    = test_body.find_all('p', recursive=False )
    if not test_para:
        print(' -- No paragraphs found, continuing...\n')
        continue
    
    test_text   = ' '.join([ p.text for p in test_para ])
    test_result = bn.test(test_text, word_count )

    print(', '.join(["{}: {:0.4f}".format( source, score * 10000 ) for source, score in test_result.items() ]))
    print('')
    
    processed += 1
    if( processed >= count ):
        break
    

Getting BBC links...
37 unique links found, trying to use using 3
Testing /news/uk-42757023
British tourists warned to 'stay in resorts' in Jamaica security emergency - BBC News
guardian: 38.9151, daily mail: 0.7311

Testing /news/entertainment-arts-42746617
Pad Man: A man's 'period poverty' rescue becomes a film - BBC News
 -- No test body found, continuing...

Testing /news/world-us-canada-42712855
Women's March: Where, when and why will protests happen? - BBC News
 -- No test body found, continuing...

Testing /news/world-europe-42749953
France: Emmanuel Macron’s marvellous manoeuvres this week - BBC News
daily mail: 27.0108, guardian: 7.7174

Testing /news/world-europe-42750584
Why Italians are saying 'No' to takeaway coffee - BBC News
 -- No test body found, continuing...

Testing /news/world-us-canada-42751715
How can parents torture their children? - BBC News
daily mail: 2.1439, guardian: 0.0988

Testing /news/world-us-canada-39732845
What has President Trump said about your cou